From e9bd39486b3ed0cfa7062be1798efae8316cda0d Mon Sep 17 00:00:00 2001 From: Sergio Bossa Date: Thu, 5 Oct 2017 16:51:06 +0100 Subject: [PATCH 001/151] STAR-567: Add test from DB-1208 (bug itself is not present) (cherry picked from commit f4cca35aaa78780dbc344217c2ac0a70ca617679) --- .../io/sstable/ReducingKeyIteratorTest.java | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java diff --git a/test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java b/test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java new file mode 100644 index 000000000000..aaf1a2aa779b --- /dev/null +++ b/test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.io.sstable; + +import java.io.IOException; +import java.util.Set; + +import org.junit.After; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.RowUpdateBuilder; +import org.apache.cassandra.db.compaction.CompactionManager; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.utils.ByteBufferUtil; + +public class ReducingKeyIteratorTest +{ + public static final String KEYSPACE1 = "ReducingKeyIteratorTest"; + public static final String CF_STANDARD = "Standard1"; + + @BeforeClass + public static void setup() throws Exception + { + SchemaLoader.prepareServer(); + CompactionManager.instance.disableAutoCompaction(); + + SchemaLoader.createKeyspace(KEYSPACE1, + KeyspaceParams.simple(1), + SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD)); + } + + @After + public void afterTest() throws Exception + { + ColumnFamilyStore store = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD); + store.truncateBlocking(); + } + + @Test + public void testTotalAndReadBytesOneSSTable() throws IOException + { + testTotalAndReadBytes(1, 1000); + } + + @Test + public void testTotalAndReadBytesManySSTables() throws IOException + { + testTotalAndReadBytes(10, 100); + } + + public void testTotalAndReadBytes(int tableCount, int rowCount) throws IOException + { + Keyspace keyspace = Keyspace.open(KEYSPACE1); + ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD); + LoggerFactory.getLogger(getClass()).info("Compression {}", store.metadata().params.compression.asMap()); + + for (int t = 0; t < tableCount; ++t) + { + for (int i = 0; i < rowCount; i++) + { + new RowUpdateBuilder(store.metadata(), i, String.valueOf(i)) + 
.clustering("0") + .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER) + .build() + .applyUnsafe(); + } + store.forceBlockingFlush(); + } + + Set sstables = store.getLiveSSTables(); + ReducingKeyIterator reducingIterator = new ReducingKeyIterator(sstables); + + while (reducingIterator.hasNext()) + { + Assert.assertTrue(reducingIterator.getTotalBytes() >= reducingIterator.getBytesRead()); + reducingIterator.next(); + } + Assert.assertEquals(reducingIterator.getTotalBytes(), reducingIterator.getBytesRead()); + } +} From 1d6eb628b3748f7f0742b62465ae20715249e165 Mon Sep 17 00:00:00 2001 From: Ruslan Fomkin Date: Mon, 31 May 2021 12:16:05 +0200 Subject: [PATCH 002/151] STAR-570 Synchronize schema pull handling (#175) Synchronize schema pull handling with applying new schema changes There's a race condition around pulling schema changes, that can occur in case the schema changes push/propagation mechanism is not immediately effective (e.g. because of network delay, or because of the pulling node being down, etc.). If schema changes happen on node 1, these changes do not reach node 2 immediately through the SCHEMA.PUSH mechanism, and are first recognized during gossiping, the corresponding SCHEMA.PULL request from node 2 can catch the node 1 schema in the middle of it being modified by another schema change request. This can easily lead to problems (e.g. if a new table is being added, and the node 2 request reads the changes that need to be applied to system_schema.tables, but not the ones that need to be applied to system_schema.columns). This PR addresses that by synchronizing the SCHEMA.PULL "RPC call" executed in node 1 by a request from node 2 with the method for applying schema changes in node 1. It also adds debug level logging tracking SCHEMA.PUSH and SCHEMA.PULL messages, as there were some unexpected findings around these that may need further investigation. E.g. 
during my investigations, seemingly redundant SCHEMA.PULL messages were sent multiple times from node 1 to node 2, even though no schema changes were made at node 2, and node 2 did not go offline. Co-authored-by: Dimitar Dimitrov Co-authored-by: Dimitar Dimitrov <30328539+dimitarndimitrov@users.noreply.github.com> (cherry picked from commit ab2a669cb8b318e7014f6445d032e75f371e3da9) --- src/java/org/apache/cassandra/schema/SchemaKeyspace.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java index 65e9adbdae67..fdb209e06ca0 100644 --- a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java +++ b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java @@ -393,7 +393,7 @@ private static ReadCommand getReadCommandForTableSchema(String schemaTableName) return PartitionRangeReadCommand.allDataRead(cfs.metadata(), FBUtilities.nowInSeconds()); } - static Collection convertSchemaToMutations() + static synchronized Collection convertSchemaToMutations() { Map mutationMap = new HashMap<>(); From 7abf8639965e317a73f1193fc5624ea968400a67 Mon Sep 17 00:00:00 2001 From: Ruslan Fomkin Date: Tue, 1 Jun 2021 09:37:12 +0200 Subject: [PATCH 003/151] STAR-572 Improve error message when altering MV (#176) Improve error message when altering an MV with default ttl > 0. 
Co-authored-by: Brandon Williams (cherry picked from commit 8296fe1319524480ae1910fde5f23c266c1642ed) --- .../cassandra/cql3/statements/schema/AlterViewStatement.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java index 1931bb489df3..3eba21561a48 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java @@ -66,8 +66,9 @@ public Keyspaces apply(Keyspaces schema) if (params.defaultTimeToLive > 0) { throw ire("Cannot set or alter default_time_to_live for a materialized view. " + - "Data in a materialized view always expire at the same time than " + - "the corresponding data in the parent table."); + "Data in a materialized view always expires at the same time as " + + "the corresponding data in the parent table. default_time_to_live " + + "must be set to zero, see CASSANDRA-12868 for more information."); } ViewMetadata newView = view.copy(view.metadata.withSwapped(params)); From 03dc51051c1a2188edbb715128c08ad285aa8a0b Mon Sep 17 00:00:00 2001 From: Jaroslaw Grabowski Date: Tue, 1 Jun 2021 09:39:25 +0200 Subject: [PATCH 004/151] STAR-565 Use the indexed item type as backing table key validator (#172) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... 
of 2i on collections Co-authored-by: Andrés de la Peña andres.de_la_pena_garcia@datastax.com (cherry picked from commit 07edf0d25def34c1c8b2e7f06b93a817692784dd) --- .../index/internal/CassandraIndex.java | 2 +- .../entities/SecondaryIndexTest.java | 104 ++++++++++++++++++ 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/cassandra/index/internal/CassandraIndex.java b/src/java/org/apache/cassandra/index/internal/CassandraIndex.java index ea5f8e2f613c..06a6cd9cfd66 100644 --- a/src/java/org/apache/cassandra/index/internal/CassandraIndex.java +++ b/src/java/org/apache/cassandra/index/internal/CassandraIndex.java @@ -742,7 +742,7 @@ public static TableMetadata indexCfsMetadata(TableMetadata baseCfsMetadata, Inde TableMetadata.builder(baseCfsMetadata.keyspace, baseCfsMetadata.indexTableName(indexMetadata), baseCfsMetadata.id) .kind(TableMetadata.Kind.INDEX) .partitioner(new LocalPartitioner(indexedValueType)) - .addPartitionKeyColumn(indexedColumn.name, indexedColumn.type) + .addPartitionKeyColumn(indexedColumn.name, indexedValueType) .addClusteringColumn("partition_key", baseCfsMetadata.partitioner.partitionOrdering()); // Adding clustering columns, which depends on the index type. 
diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java index 5c17fb85638b..3d97df931068 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java @@ -27,6 +27,12 @@ import org.apache.commons.lang3.StringUtils; import org.junit.Test; +import org.apache.cassandra.db.marshal.CompositeType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; @@ -432,6 +438,104 @@ public void testIndexOnCollections() throws Throwable }); } + private static void assertBackingTableKeyValidator(SecondaryIndexManager indexManager, String indexName, AbstractType expectedType) + { + assertEquals(expectedType, indexManager.getIndexByName(indexName) + .getBackingTable() + .map(ColumnFamilyStore::metadata) + .map(m -> m.partitionKeyType) + .orElseThrow(AssertionError::new)); + } + + /** + * Test for DB-1121 + */ + @Test + public void testIndexOnCollectionsBackingTableKeyValidator() throws Throwable + { + createTable("CREATE TABLE %s (" + + "k int PRIMARY KEY, " + + "non_frozen_list list, " + + "non_frozen_set set, " + + "non_frozen_map map," + + "frozen_list frozen>, " + + "frozen_set frozen>, " + + "frozen_map frozen>)"); + + createIndex("CREATE INDEX non_frozen_list_idx ON %s (non_frozen_list)"); + createIndex("CREATE INDEX non_frozen_set_idx ON %s (non_frozen_set)"); + createIndex("CREATE INDEX non_frozen_map_idx ON %s (non_frozen_map)"); + createIndex("CREATE INDEX non_frozen_map_keys_idx ON %s 
(KEYS(non_frozen_map))"); + createIndex("CREATE INDEX non_frozen_map_entries_idx ON %s (ENTRIES(non_frozen_map))"); + createIndex("CREATE INDEX frozen_list_idx ON %s (FULL(frozen_list))"); + createIndex("CREATE INDEX frozen_set_idx ON %s (FULL(frozen_set))"); + createIndex("CREATE INDEX frozen_map_idx ON %s (FULL(frozen_map))"); + + SecondaryIndexManager indexManager = ColumnFamilyStore.getIfExists(keyspace(), currentTable()).indexManager; + + assertBackingTableKeyValidator(indexManager, "non_frozen_list_idx", Int32Type.instance); + assertBackingTableKeyValidator(indexManager, "non_frozen_set_idx", UTF8Type.instance); + assertBackingTableKeyValidator(indexManager, "non_frozen_map_idx", Int32Type.instance); + assertBackingTableKeyValidator(indexManager, "non_frozen_map_keys_idx", UTF8Type.instance); + assertBackingTableKeyValidator(indexManager, "non_frozen_map_entries_idx", CompositeType.getInstance(UTF8Type.instance, Int32Type.instance)); + assertBackingTableKeyValidator(indexManager, "frozen_list_idx", ListType.getInstance(Int32Type.instance, false)); + assertBackingTableKeyValidator(indexManager, "frozen_set_idx", SetType.getInstance(UTF8Type.instance, false)); + assertBackingTableKeyValidator(indexManager, "frozen_map_idx", MapType.getInstance(UTF8Type.instance, Int32Type.instance, false)); + + // Unsupported index types for non-frozen list + assertInvalidMessage("Cannot create index on keys of column non_frozen_list with non-map type", + "CREATE INDEX ON %s (KEYS(non_frozen_list))"); + assertInvalidMessage("Cannot create index on entries of column non_frozen_list with non-map type", + "CREATE INDEX ON %s (ENTRIES(non_frozen_list))"); + assertInvalidMessage("full() indexes can only be created on frozen collections", + "CREATE INDEX ON %s (FULL(non_frozen_list))"); + + // Unsupported index types for non-frozen set + assertInvalidMessage("Cannot create index on keys of column non_frozen_set with non-map type", + "CREATE INDEX ON %s (KEYS(non_frozen_set))"); + 
assertInvalidMessage("Cannot create index on entries of column non_frozen_set with non-map type", + "CREATE INDEX ON %s (ENTRIES(non_frozen_set))"); + assertInvalidMessage("full() indexes can only be created on frozen collections", + "CREATE INDEX ON %s (FULL(non_frozen_set))"); + + // Unsupported index types for non-frozen map + assertInvalidMessage("full() indexes can only be created on frozen collections", + "CREATE INDEX ON %s (FULL(non_frozen_map))"); + + // Unsupported index types for frozen list + assertInvalidMessage("Cannot create keys() index on frozen column frozen_list. Frozen collections " + + "are immutable and must be fully indexed by using the 'full(frozen_list)' modifier", + "CREATE INDEX ON %s (KEYS(frozen_list))"); + assertInvalidMessage("Cannot create entries() index on frozen column frozen_list. Frozen collections " + + "are immutable and must be fully indexed by using the 'full(frozen_list)' modifier", + "CREATE INDEX ON %s (ENTRIES(frozen_list))"); + assertInvalidMessage("Cannot create values() index on frozen column frozen_list. Frozen collections " + + "are immutable and must be fully indexed by using the 'full(frozen_list)' modifier", + "CREATE INDEX ON %s (VALUES(frozen_list))"); + + // Unsupported index types for frozen set + assertInvalidMessage("Cannot create keys() index on frozen column frozen_set. Frozen collections " + + "are immutable and must be fully indexed by using the 'full(frozen_set)' modifier", + "CREATE INDEX ON %s (KEYS(frozen_set))"); + assertInvalidMessage("Cannot create entries() index on frozen column frozen_set. Frozen collections " + + "are immutable and must be fully indexed by using the 'full(frozen_set)' modifier", + "CREATE INDEX ON %s (ENTRIES(frozen_set))"); + assertInvalidMessage("Cannot create values() index on frozen column frozen_set. 
Frozen collections " + + "are immutable and must be fully indexed by using the 'full(frozen_set)' modifier", + "CREATE INDEX ON %s (VALUES(frozen_set))"); + + // Unsupported index types for frozen map + assertInvalidMessage("Cannot create keys() index on frozen column frozen_map. Frozen collections " + + "are immutable and must be fully indexed by using the 'full(frozen_map)' modifier", + "CREATE INDEX ON %s (KEYS(frozen_map))"); + assertInvalidMessage("Cannot create entries() index on frozen column frozen_map. Frozen collections " + + "are immutable and must be fully indexed by using the 'full(frozen_map)' modifier", + "CREATE INDEX ON %s (ENTRIES(frozen_map))"); + assertInvalidMessage("Cannot create values() index on frozen column frozen_map. Frozen collections " + + "are immutable and must be fully indexed by using the 'full(frozen_map)' modifier", + "CREATE INDEX ON %s (VALUES(frozen_map))"); + } + @Test public void testSelectOnMultiIndexOnCollectionsWithNull() throws Throwable { From c1eefd1f5b8d2332fa14ec7a6c6db862dbcced88 Mon Sep 17 00:00:00 2001 From: Jaroslaw Grabowski Date: Tue, 1 Jun 2021 10:32:21 +0200 Subject: [PATCH 005/151] STAR-564 Check only MODIFY on base when updating table with MV (#171) If a user has only MODIFY permission on a table and there is a materialized view built on the same table an insert will fail with the following error: Unauthorized: Error from server: code=2100 [Unauthorized] Only base MODIFY permission is required to update base with MV. 
Co-authored-by: Zhao Yang (cherry picked from commit 53c5aa68693cf084bb36d56389c7ede485abbbf6) --- CHANGES.txt | 3 +++ .../cql3/statements/ModificationStatement.java | 12 +----------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 7e0c1a98a71a..462c5f41d00e 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,6 @@ +Future version (tbd) + * Require only MODIFY permission on base when updating table with MV (STAR-564) + 4.0.1 * Cleanup dependency scopes (CASSANDRA-16704) * Make JmxHistogram#getRecentValues() and JmxTimer#getRecentValues() thread-safe (CASSANDRA-16707) diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java index 087f3b0e7c4b..7ec3a69fcedd 100644 --- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java @@ -244,17 +244,7 @@ public void authorize(ClientState state) throws InvalidRequestException, Unautho if (hasConditions()) state.ensureTablePermission(metadata, Permission.SELECT); - // MV updates need to get the current state from the table, and might update the views - // Require Permission.SELECT on the base table, and Permission.MODIFY on the views - Iterator views = View.findAll(keyspace(), columnFamily()).iterator(); - if (views.hasNext()) - { - state.ensureTablePermission(metadata, Permission.SELECT); - do - { - state.ensureTablePermission(views.next().metadata, Permission.MODIFY); - } while (views.hasNext()); - } + // Only MODIFY on the base table is required here: updating a base table that has materialized views checks neither SELECT on the base table nor MODIFY on its views.
for (Function function : getFunctions()) state.ensurePermission(Permission.EXECUTE, function); From a484e25631d6d7ac2cadfdf2b1a6008d44794894 Mon Sep 17 00:00:00 2001 From: Jaroslaw Grabowski Date: Tue, 1 Jun 2021 14:55:09 +0200 Subject: [PATCH 006/151] STAR-571 fix nanoseconds overflowing in CommitLogService (#174) * remove dead code * add awaitSyncAt test The test verifies two cases when nanoseconds overflowing causes awaitSyncAt to never return (as described in the original ticket). * fix nanoseconds overflowing in CommitLogService * Use clock consistently in all CommitLogService implementations * Compare nanoseconds as a difference as advised in the docs: https://docs.oracle.com/javase/8/docs/api/java/lang/System.html#nanoTime * add abs values where the overflowed value may cause a negative result Co-authored-by: Sergio Bossa (cherry picked from commit 91db2d4c8490e6beff706e1ca7e7339d52cbd155) --- .../commitlog/AbstractCommitLogService.java | 33 ++-- .../db/commitlog/BatchCommitLogService.java | 6 +- .../cassandra/db/commitlog/CommitLog.java | 12 +- .../db/commitlog/GroupCommitLogService.java | 5 +- .../commitlog/PeriodicCommitLogService.java | 9 +- .../cassandra/utils/MonotonicClock.java | 4 +- .../cassandra/utils/SlidingTimeRate.java | 171 ------------------ .../cassandra/utils/SystemTimeSource.java | 54 ------ .../apache/cassandra/utils/TimeSource.java | 58 ------ .../utils/concurrent/IntervalLock.java | 69 ------- .../AbstractCommitLogServiceTest.java | 3 +- .../commitlog/CommitLogAwaitAsyncAtTest.java | 110 +++++++++++ .../cassandra/utils/MonotonicClockTest.java | 10 + .../cassandra/utils/SlidingTimeRateTest.java | 161 ----------------- .../cassandra/utils/TestTimeSource.java | 72 -------- 15 files changed, 165 insertions(+), 612 deletions(-) delete mode 100644 src/java/org/apache/cassandra/utils/SlidingTimeRate.java delete mode 100644 src/java/org/apache/cassandra/utils/SystemTimeSource.java delete mode 100644 
src/java/org/apache/cassandra/utils/TimeSource.java delete mode 100644 src/java/org/apache/cassandra/utils/concurrent/IntervalLock.java create mode 100644 test/unit/org/apache/cassandra/db/commitlog/CommitLogAwaitAsyncAtTest.java delete mode 100644 test/unit/org/apache/cassandra/utils/SlidingTimeRateTest.java delete mode 100644 test/unit/org/apache/cassandra/utils/TestTimeSource.java diff --git a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java index a65ef00a11bd..326936cdb5d1 100644 --- a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java +++ b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java @@ -45,7 +45,7 @@ public abstract class AbstractCommitLogService private volatile boolean shutdown = false; // all Allocations written before this time will be synced - protected volatile long lastSyncedAt = System.currentTimeMillis(); + protected volatile long lastSyncedAt; // counts of total written, and pending, log messages private final AtomicLong written = new AtomicLong(0); @@ -68,6 +68,11 @@ public abstract class AbstractCommitLogService */ final long markerIntervalNanos; + /** + * Provides time related functions for commit log syncing scheduling. + */ + protected final MonotonicClock clock; + /** * A flag that callers outside of the sync thread can use to signal they want the commitlog segments * to be flushed to disk. Note: this flag is primarily to support commit log's batch mode, which requires @@ -83,9 +88,9 @@ public abstract class AbstractCommitLogService * * Subclasses may be notified when a sync finishes by using the syncComplete WaitQueue. 
*/ - AbstractCommitLogService(final CommitLog commitLog, final String name, long syncIntervalMillis) + AbstractCommitLogService(final CommitLog commitLog, final String name, long syncIntervalMillis, MonotonicClock clock) { - this (commitLog, name, syncIntervalMillis, false); + this (commitLog, name, syncIntervalMillis, clock, false); } /** @@ -96,10 +101,12 @@ public abstract class AbstractCommitLogService * * @param markHeadersFaster true if the chained markers should be updated more frequently than on the disk sync bounds. */ - AbstractCommitLogService(final CommitLog commitLog, final String name, long syncIntervalMillis, boolean markHeadersFaster) + AbstractCommitLogService(final CommitLog commitLog, final String name, long syncIntervalMillis, MonotonicClock clock, boolean markHeadersFaster) { this.commitLog = commitLog; this.name = name; + this.clock = clock; + this.lastSyncedAt = clock.now(); final long markerIntervalMillis; if (markHeadersFaster && syncIntervalMillis > DEFAULT_MARKER_INTERVAL_MILLIS) @@ -132,7 +139,7 @@ void start() throw new IllegalArgumentException(String.format("Commit log flush interval must be positive: %fms", syncIntervalNanos * 1e-6)); shutdown = false; - thread = NamedThreadFactory.createThread(new SyncRunnable(MonotonicClock.preciseTime), name); + thread = NamedThreadFactory.createThread(new SyncRunnable(clock), name); thread.start(); } @@ -168,7 +175,7 @@ boolean sync() { // sync and signal long pollStarted = clock.now(); - boolean flushToDisk = lastSyncedAt + syncIntervalNanos <= pollStarted || shutdownRequested || syncRequested; + boolean flushToDisk = lastSyncedAt + syncIntervalNanos - pollStarted <= 0 || shutdownRequested || syncRequested; if (flushToDisk) { // in this branch, we want to flush the commit log to disk @@ -192,7 +199,7 @@ boolean sync() return false; long wakeUpAt = pollStarted + markerIntervalNanos; - if (wakeUpAt > now) + if (wakeUpAt - now > 0) LockSupport.parkNanos(wakeUpAt - now); } catch (Throwable t) @@ 
-218,7 +225,7 @@ boolean maybeLogFlushLag(long pollStarted, long now) // this is the timestamp by which we should have completed the flush long maxFlushTimestamp = pollStarted + syncIntervalNanos; - if (maxFlushTimestamp > now) + if (maxFlushTimestamp - now > 0) return false; // if we have lagged noticeably, update our lag counter @@ -229,7 +236,7 @@ boolean maybeLogFlushLag(long pollStarted, long now) syncCount = 1; totalSyncDuration = flushDuration; } - syncExceededIntervalBy += now - maxFlushTimestamp; + syncExceededIntervalBy += Math.abs(now - maxFlushTimestamp); lagCount++; if (firstLagAt > 0) @@ -241,7 +248,7 @@ boolean maybeLogFlushLag(long pollStarted, long now) TimeUnit.MINUTES, "Out of {} commit log syncs over the past {}s with average duration of {}ms, {} have exceeded the configured commit interval by an average of {}ms", syncCount, - String.format("%.2f", (now - firstLagAt) * 1e-9d), + String.format("%.2f", Math.abs(now - firstLagAt) * 1e-9d), String.format("%.2f", totalSyncDuration * 1e-6d / syncCount), lagCount, String.format("%.2f", syncExceededIntervalBy * 1e-6d / lagCount)); @@ -292,7 +299,7 @@ public void shutdown() */ public void syncBlocking() { - long requestTime = System.nanoTime(); + long requestTime = clock.now(); requestExtraSync(); awaitSyncAt(requestTime, null); } @@ -302,12 +309,12 @@ void awaitSyncAt(long syncTime, Context context) do { WaitQueue.Signal signal = context != null ? 
syncComplete.register(context) : syncComplete.register(); - if (lastSyncedAt < syncTime) + if (lastSyncedAt - syncTime < 0) signal.awaitUninterruptibly(); else signal.cancel(); } - while (lastSyncedAt < syncTime); + while (lastSyncedAt - syncTime < 0); } public void awaitTermination() throws InterruptedException diff --git a/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java index 78bf30c336c6..e354b925a036 100644 --- a/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java +++ b/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java @@ -17,6 +17,8 @@ */ package org.apache.cassandra.db.commitlog; +import org.apache.cassandra.utils.MonotonicClock; + class BatchCommitLogService extends AbstractCommitLogService { /** @@ -26,9 +28,9 @@ class BatchCommitLogService extends AbstractCommitLogService */ private static final int POLL_TIME_MILLIS = 1000; - public BatchCommitLogService(CommitLog commitLog) + public BatchCommitLogService(CommitLog commitLog, MonotonicClock clock) { - super(commitLog, "COMMIT-LOG-WRITER", POLL_TIME_MILLIS); + super(commitLog, "COMMIT-LOG-WRITER", POLL_TIME_MILLIS, clock); } protected void maybeWaitForSync(CommitLogSegment.Allocation alloc) diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java index 7670c5e1a1f5..a32b8a1030e7 100644 --- a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java +++ b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java @@ -46,6 +46,7 @@ import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.JVMStabilityInspector; import org.apache.cassandra.utils.MBeanWrapper; +import org.apache.cassandra.utils.MonotonicClock; import static org.apache.cassandra.db.commitlog.CommitLogSegment.Allocation; import static org.apache.cassandra.db.commitlog.CommitLogSegment.CommitLogSegmentFileComparator; 
@@ -72,6 +73,9 @@ public class CommitLog implements CommitLogMBean volatile Configuration configuration; private boolean started = false; + @VisibleForTesting + final MonotonicClock clock; + private static CommitLog construct() { CommitLog log = new CommitLog(CommitLogArchiver.construct(), DatabaseDescriptor.getCommitLogSegmentMgrProvider()); @@ -96,16 +100,18 @@ private static CommitLog construct() this.archiver = archiver; metrics = new CommitLogMetrics(); + this.clock = MonotonicClock.preciseTime; + switch (DatabaseDescriptor.getCommitLogSync()) { case periodic: - executor = new PeriodicCommitLogService(this); + executor = new PeriodicCommitLogService(this, clock); break; case batch: - executor = new BatchCommitLogService(this); + executor = new BatchCommitLogService(this, clock); break; case group: - executor = new GroupCommitLogService(this); + executor = new GroupCommitLogService(this, clock); break; default: throw new IllegalArgumentException("Unknown commitlog service type: " + DatabaseDescriptor.getCommitLogSync()); diff --git a/src/java/org/apache/cassandra/db/commitlog/GroupCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/GroupCommitLogService.java index a76923e581e0..056bc6c88cbe 100644 --- a/src/java/org/apache/cassandra/db/commitlog/GroupCommitLogService.java +++ b/src/java/org/apache/cassandra/db/commitlog/GroupCommitLogService.java @@ -19,6 +19,7 @@ package org.apache.cassandra.db.commitlog; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.utils.MonotonicClock; /** * A commitlog service that will block returning an ACK back to the a coordinator/client @@ -26,9 +27,9 @@ */ public class GroupCommitLogService extends AbstractCommitLogService { - public GroupCommitLogService(CommitLog commitLog) + public GroupCommitLogService(CommitLog commitLog, MonotonicClock clock) { - super(commitLog, "GROUP-COMMIT-LOG-WRITER", (int) DatabaseDescriptor.getCommitLogSyncGroupWindow()); + super(commitLog, 
"GROUP-COMMIT-LOG-WRITER", (int) DatabaseDescriptor.getCommitLogSyncGroupWindow(), clock); } protected void maybeWaitForSync(CommitLogSegment.Allocation alloc) diff --git a/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java index e94c616e444f..c33624cde41d 100644 --- a/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java +++ b/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java @@ -20,21 +20,22 @@ import java.util.concurrent.TimeUnit; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.utils.MonotonicClock; class PeriodicCommitLogService extends AbstractCommitLogService { private static final long blockWhenSyncLagsNanos = TimeUnit.MILLISECONDS.toNanos(DatabaseDescriptor.getPeriodicCommitLogSyncBlock()); - public PeriodicCommitLogService(final CommitLog commitLog) + public PeriodicCommitLogService(final CommitLog commitLog, MonotonicClock clock) { - super(commitLog, "PERIODIC-COMMIT-LOG-SYNCER", DatabaseDescriptor.getCommitLogSyncPeriod(), + super(commitLog, "PERIODIC-COMMIT-LOG-SYNCER", DatabaseDescriptor.getCommitLogSyncPeriod(), clock, !(commitLog.configuration.useCompression() || commitLog.configuration.useEncryption())); } protected void maybeWaitForSync(CommitLogSegment.Allocation alloc) { - long expectedSyncTime = System.nanoTime() - blockWhenSyncLagsNanos; - if (lastSyncedAt < expectedSyncTime) + long expectedSyncTime = clock.now() - blockWhenSyncLagsNanos; + if (lastSyncedAt - expectedSyncTime < 0) { pending.incrementAndGet(); awaitSyncAt(expectedSyncTime, commitLog.metrics.waitingOnCommit.time()); diff --git a/src/java/org/apache/cassandra/utils/MonotonicClock.java b/src/java/org/apache/cassandra/utils/MonotonicClock.java index 5a1aa3c0361e..d641ec2abfd3 100644 --- a/src/java/org/apache/cassandra/utils/MonotonicClock.java +++ b/src/java/org/apache/cassandra/utils/MonotonicClock.java @@ 
-259,13 +259,13 @@ public long error() @Override public boolean isAfter(long instant) { - return now() > instant; + return instant - now() < 0; } @Override public boolean isAfter(long now, long instant) { - return now > instant; + return instant - now < 0; } } diff --git a/src/java/org/apache/cassandra/utils/SlidingTimeRate.java b/src/java/org/apache/cassandra/utils/SlidingTimeRate.java deleted file mode 100644 index 0e00054d0205..000000000000 --- a/src/java/org/apache/cassandra/utils/SlidingTimeRate.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.utils; - -import java.util.concurrent.ConcurrentNavigableMap; -import java.util.concurrent.ConcurrentSkipListMap; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.ReadWriteLock; -import java.util.concurrent.locks.ReentrantReadWriteLock; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -/** - * Concurrent rate computation over a sliding time window. - * - * Currently not used in the Cassandra 4.0 code base. 
If you decide to use it, please check CASSANDRA-16713. - * There still might be a bug, flaky test to be fixed before using it again. - */ -public class SlidingTimeRate -{ - private final ConcurrentSkipListMap counters = new ConcurrentSkipListMap<>(); - private final AtomicLong lastCounterTimestamp = new AtomicLong(0); - private final ReadWriteLock pruneLock = new ReentrantReadWriteLock(); - private final long sizeInMillis; - private final long precisionInMillis; - private final TimeSource timeSource; - - /** - * Creates a sliding rate whose time window is of the given size, with the given precision and time unit. - *

- * The precision defines how accurate the rate computation is, as it will be computed over window size +/- - * precision. - *

- */ - public SlidingTimeRate(TimeSource timeSource, long size, long precision, TimeUnit unit) - { - Preconditions.checkArgument(size > precision, "Size should be greater than precision."); - Preconditions.checkArgument(TimeUnit.MILLISECONDS.convert(precision, unit) >= 1, "Precision must be greater than or equal to 1 millisecond."); - this.sizeInMillis = TimeUnit.MILLISECONDS.convert(size, unit); - this.precisionInMillis = TimeUnit.MILLISECONDS.convert(precision, unit); - this.timeSource = timeSource; - } - - /** - * Updates the rate. - */ - public void update(int delta) - { - pruneLock.readLock().lock(); - try - { - while (true) - { - long now = timeSource.currentTimeMillis(); - long lastTimestamp = lastCounterTimestamp.get(); - boolean isWithinPrecisionRange = (now - lastTimestamp) < precisionInMillis; - AtomicInteger lastCounter = counters.get(lastTimestamp); - // If there's a valid counter for the current last timestamp, and we're in the precision range, - // update such counter: - if (lastCounter != null && isWithinPrecisionRange) - { - lastCounter.addAndGet(delta); - - break; - } - // Else if there's no counter or we're past the precision range, try to create a new counter, - // but only the thread updating the last timestamp will create a new counter: - else if (lastCounterTimestamp.compareAndSet(lastTimestamp, now)) - { - AtomicInteger existing = counters.putIfAbsent(now, new AtomicInteger(delta)); - if (existing != null) - { - existing.addAndGet(delta); - } - - break; - } - } - } - finally - { - pruneLock.readLock().unlock(); - } - } - - /** - * Gets the current rate in the given time unit from the beginning of the time window to the - * provided point in time ago. 
- */ - public double get(long toAgo, TimeUnit unit) - { - pruneLock.readLock().lock(); - try - { - long toAgoInMillis = TimeUnit.MILLISECONDS.convert(toAgo, unit); - Preconditions.checkArgument(toAgoInMillis < sizeInMillis, "Cannot get rate in the past!"); - - long now = timeSource.currentTimeMillis(); - long sum = 0; - ConcurrentNavigableMap tailCounters = counters - .tailMap(now - sizeInMillis, true) - .headMap(now - toAgoInMillis, true); - for (AtomicInteger i : tailCounters.values()) - { - sum += i.get(); - } - - double rateInMillis = sum == 0 - ? sum - : sum / (double) Math.max(1000, (now - toAgoInMillis) - tailCounters.firstKey()); - double multiplier = TimeUnit.MILLISECONDS.convert(1, unit); - return rateInMillis * multiplier; - } - finally - { - pruneLock.readLock().unlock(); - } - } - - /** - * Gets the current rate in the given time unit. - */ - public double get(TimeUnit unit) - { - return get(0, unit); - } - - /** - * Prunes the time window of old unused updates. - */ - public void prune() - { - pruneLock.writeLock().lock(); - try - { - long now = timeSource.currentTimeMillis(); - counters.headMap(now - sizeInMillis, false).clear(); - } - finally - { - pruneLock.writeLock().unlock(); - } - } - - @VisibleForTesting - public int size() - { - return counters.values().stream().reduce(new AtomicInteger(), (v1, v2) -> { - v1.addAndGet(v2.get()); - return v1; - }).get(); - } -} diff --git a/src/java/org/apache/cassandra/utils/SystemTimeSource.java b/src/java/org/apache/cassandra/utils/SystemTimeSource.java deleted file mode 100644 index fef525e39c17..000000000000 --- a/src/java/org/apache/cassandra/utils/SystemTimeSource.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.utils; - -import java.util.concurrent.TimeUnit; - -import com.google.common.util.concurrent.Uninterruptibles; - -/** - * Time source backed by JVM clock. - */ -public class SystemTimeSource implements TimeSource -{ - @Override - public long currentTimeMillis() - { - return System.currentTimeMillis(); - } - - @Override - public long nanoTime() - { - return System.nanoTime(); - } - - @Override - public TimeSource sleepUninterruptibly(long sleepFor, TimeUnit unit) - { - Uninterruptibles.sleepUninterruptibly(sleepFor, unit); - return this; - } - - @Override - public TimeSource sleep(long sleepFor, TimeUnit unit) throws InterruptedException - { - TimeUnit.NANOSECONDS.sleep(TimeUnit.NANOSECONDS.convert(sleepFor, unit)); - return this; - } -} diff --git a/src/java/org/apache/cassandra/utils/TimeSource.java b/src/java/org/apache/cassandra/utils/TimeSource.java deleted file mode 100644 index 5d8acec7031a..000000000000 --- a/src/java/org/apache/cassandra/utils/TimeSource.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.utils; - -import java.util.concurrent.TimeUnit; - -public interface TimeSource -{ - /** - * - * @return the current time in milliseconds - */ - long currentTimeMillis(); - - /** - * - * @return Returns the current time value in nanoseconds. - * - *

This method can only be used to measure elapsed time and is - * not related to any other notion of system or wall-clock time. - */ - long nanoTime(); - - /** - * Sleep for the given amount of time uninterruptibly. - * - * @param sleepFor given amout. - * @param unit time unit - * @return The time source itself after the given sleep period. - */ - TimeSource sleepUninterruptibly(long sleepFor, TimeUnit unit); - - /** - * Sleep for the given amount of time. This operation could interrupted. - * Hence after returning from this method, it is not guaranteed - * that the request amount of time has passed. - * - * @param sleepFor given amout. - * @param unit time unit - * @return The time source itself after the given sleep period. - */ - TimeSource sleep(long sleepFor, TimeUnit unit) throws InterruptedException; -} diff --git a/src/java/org/apache/cassandra/utils/concurrent/IntervalLock.java b/src/java/org/apache/cassandra/utils/concurrent/IntervalLock.java deleted file mode 100644 index 382a2dc43669..000000000000 --- a/src/java/org/apache/cassandra/utils/concurrent/IntervalLock.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.cassandra.utils.concurrent; - -import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.ReentrantReadWriteLock; - -import com.google.common.annotations.VisibleForTesting; - -import org.apache.cassandra.utils.TimeSource; - -/** - * This class extends ReentrantReadWriteLock to provide a write lock that can only be acquired at provided intervals. - */ -public class IntervalLock extends ReentrantReadWriteLock -{ - private final AtomicLong lastAcquire = new AtomicLong(); - private final TimeSource timeSource; - - public IntervalLock(TimeSource timeSource) - { - this.timeSource = timeSource; - } - - /** - * Try acquiring a write lock if the given interval is passed since the last call to this method. - * - * @param interval In millis. - * @return True if acquired and locked, false otherwise. - */ - public boolean tryIntervalLock(long interval) - { - long now = timeSource.currentTimeMillis(); - boolean acquired = (now - lastAcquire.get() >= interval) && writeLock().tryLock(); - if (acquired) - lastAcquire.set(now); - - return acquired; - } - - /** - * Release the last acquired interval lock. 
- */ - public void releaseIntervalLock() - { - writeLock().unlock(); - } - - @VisibleForTesting - public long getLastIntervalAcquire() - { - return lastAcquire.get(); - } -} diff --git a/test/unit/org/apache/cassandra/db/commitlog/AbstractCommitLogServiceTest.java b/test/unit/org/apache/cassandra/db/commitlog/AbstractCommitLogServiceTest.java index 741b1454b5c9..f91690cf5d5b 100644 --- a/test/unit/org/apache/cassandra/db/commitlog/AbstractCommitLogServiceTest.java +++ b/test/unit/org/apache/cassandra/db/commitlog/AbstractCommitLogServiceTest.java @@ -29,6 +29,7 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.commitlog.AbstractCommitLogService.SyncRunnable; import org.apache.cassandra.utils.FreeRunningClock; +import org.apache.cassandra.utils.MonotonicClock; import static org.apache.cassandra.db.commitlog.AbstractCommitLogService.DEFAULT_MARKER_INTERVAL_MILLIS; @@ -100,7 +101,7 @@ private static class FakeCommitLogService extends AbstractCommitLogService { FakeCommitLogService(long syncIntervalMillis) { - super(new FakeCommitLog(), "This is not a real commit log", syncIntervalMillis, true); + super(new FakeCommitLog(), "This is not a real commit log", syncIntervalMillis, MonotonicClock.preciseTime, true); lastSyncedAt = 0; } diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogAwaitAsyncAtTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogAwaitAsyncAtTest.java new file mode 100644 index 000000000000..e024954a7958 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogAwaitAsyncAtTest.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.db.commitlog; + +import java.util.concurrent.TimeUnit; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.utils.FreeRunningClock; +import org.apache.cassandra.utils.MonotonicClock; +import org.mockito.Mockito; + +public class CommitLogAwaitAsyncAtTest +{ + @BeforeClass + public static void beforeClass() throws ConfigurationException + { + DatabaseDescriptor.daemonInitialization(); + } + + /** + * syncTime (awaitSyncAt param) is in the past, now value overflowed, awaitSyncAt should not block, + * no clock advance calls. + */ + @Test + public void notBlockIfSyncTimeIsInPast() throws InterruptedException + { + testResumingAwaitSyncAt(Long.MIN_VALUE + 10, + Long.MAX_VALUE - 10, + 0); + } + + /** + * syncTime (awaitSyncAt param) is in the future, awaitSyncAt should block, unblocking is caused by the flush + */ + @Test + public void flushShouldUnblockAwaitSync() throws InterruptedException + { + testResumingAwaitSyncAt(Long.MAX_VALUE - 10, + Long.MAX_VALUE - 5, + 1000); + } + + /** + * Creates a CommitLogService instance and a new thread that calls awaitSyncAt. Awaits for at most a minute + * for the call to return. + * Uses artificial clock to progress through the commit flush. 
One clock advance is performed after the service and + * the thread are started. + * + * @param nowNanos test start time nanoseconds + * @param syncAtNanos awaitSyncAt parameter nanoseconds + * @param advanceMillis clock step in milliseconds + */ + private void testResumingAwaitSyncAt(long nowNanos, long syncAtNanos, long advanceMillis) throws InterruptedException + { + FreeRunningClock clock = new FreeRunningClock(nowNanos); + AbstractCommitLogService service = getCommitLogService(clock); + + Thread awaitForSync = new Thread(CommitLogAwaitAsyncAtTest.class.getSimpleName() + " commit log waiting thread") + { + @Override + public void run() + { + service.awaitSyncAt(syncAtNanos, null); + } + }; + awaitForSync.start(); + + service.start(); + + // move clock once with advance millis + clock.advance(advanceMillis, TimeUnit.MILLISECONDS); + + // wait at most 1 minute for awaitSyncAt to unblock + awaitForSync.join(60 * 1000); + if (awaitForSync.isAlive()) + Assert.fail("awaitSyncAt should be unblocked by now, check commit log code for bugs in nanoseconds" + + "comparisons"); + } + + private AbstractCommitLogService getCommitLogService(MonotonicClock clock) { + CommitLog commitLog = Mockito.mock(CommitLog.class); + return new AbstractCommitLogService(commitLog, "testService", 100, clock) + { + @Override + protected void maybeWaitForSync(CommitLogSegment.Allocation alloc) + { + } + }; + } +} diff --git a/test/unit/org/apache/cassandra/utils/MonotonicClockTest.java b/test/unit/org/apache/cassandra/utils/MonotonicClockTest.java index b2891a9950e2..db12ff2df5cb 100644 --- a/test/unit/org/apache/cassandra/utils/MonotonicClockTest.java +++ b/test/unit/org/apache/cassandra/utils/MonotonicClockTest.java @@ -20,6 +20,7 @@ import static org.apache.cassandra.utils.MonotonicClock.approxTime; import static org.junit.Assert.*; +import org.junit.Assert; import org.junit.Test; public class MonotonicClockTest @@ -53,4 +54,13 @@ public void testTimestampOrdering() throws Exception 
lastConverted = convertedNow; } } + + @Test + public void testTimestampOverflowComparison() + { + MonotonicClock clock = MonotonicClock.preciseTime; + + Assert.assertTrue("Overflown long (now) should be after long close to max", + clock.isAfter(Long.MIN_VALUE + 1, Long.MAX_VALUE)); + } } diff --git a/test/unit/org/apache/cassandra/utils/SlidingTimeRateTest.java b/test/unit/org/apache/cassandra/utils/SlidingTimeRateTest.java deleted file mode 100644 index 8dc4a14d6d76..000000000000 --- a/test/unit/org/apache/cassandra/utils/SlidingTimeRateTest.java +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.utils; - -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; - -import org.junit.Assert; -import org.junit.Ignore; -import org.junit.Test; - -/** - * No objects are created currently from SlidingTimeRate in Cassandra 4.0. - * If you decide to use it, please check CASSANDRA-16713. - * There still might be a bug, flaky test to be fixed before using it again. - * - * Skipping all tests for running now to clean he noise before 4.0 GA release. 
- */ -public class SlidingTimeRateTest -{ - @Ignore - @Test - public void testUpdateAndGet() - { - SlidingTimeRate rate = new SlidingTimeRate(new TestTimeSource(), 10, 1, TimeUnit.SECONDS); - int updates = 100; - for (int i = 0; i < updates; i++) - { - rate.update(1); - } - Assert.assertEquals(updates, rate.get(TimeUnit.SECONDS), 0.0); - } - - @Ignore - @Test - public void testUpdateAndGetBetweenWindows() - { - TestTimeSource time = new TestTimeSource(); - SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS); - int updates = 100; - for (int i = 0; i < updates; i++) - { - rate.update(1); - time.sleep(100, TimeUnit.MILLISECONDS); - } - Assert.assertEquals(10, rate.get(TimeUnit.SECONDS), 0.0); - } - - @Ignore - @Test - public void testUpdateAndGetPastWindowSize() - { - TestTimeSource time = new TestTimeSource(); - SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS); - int updates = 100; - for (int i = 0; i < updates; i++) - { - rate.update(1); - } - - time.sleep(6, TimeUnit.SECONDS); - - Assert.assertEquals(0, rate.get(TimeUnit.SECONDS), 0.0); - } - - @Ignore - @Test - public void testUpdateAndGetToPointInTime() - { - TestTimeSource time = new TestTimeSource(); - SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS); - int updates = 10; - for (int i = 0; i < updates; i++) - { - rate.update(1); - time.sleep(100, TimeUnit.MILLISECONDS); - } - - time.sleep(1, TimeUnit.SECONDS); - - Assert.assertEquals(5, rate.get(TimeUnit.SECONDS), 0.0); - Assert.assertEquals(10, rate.get(1, TimeUnit.SECONDS), 0.0); - } - - @Ignore - @Test - public void testDecay() throws InterruptedException - { - TestTimeSource time = new TestTimeSource(); - SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS); - int updates = 10; - for (int i = 0; i < updates; i++) - { - rate.update(1); - time.sleep(100, TimeUnit.MILLISECONDS); - } - Assert.assertEquals(10, rate.get(TimeUnit.SECONDS), 0.0); - - time.sleep(1, 
TimeUnit.SECONDS); - - Assert.assertEquals(5, rate.get(TimeUnit.SECONDS), 0.0); - - time.sleep(2, TimeUnit.SECONDS); - - Assert.assertEquals(2.5, rate.get(TimeUnit.SECONDS), 0.0); - } - - @Ignore - @Test - public void testPruning() - { - TestTimeSource time = new TestTimeSource(); - SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS); - - rate.update(1); - Assert.assertEquals(1, rate.size()); - - time.sleep(6, TimeUnit.SECONDS); - - rate.prune(); - Assert.assertEquals(0, rate.size()); - } - - @Ignore - @Test - public void testConcurrentUpdateAndGet() throws InterruptedException - { - final ExecutorService executor = Executors.newFixedThreadPool(FBUtilities.getAvailableProcessors()); - final TestTimeSource time = new TestTimeSource(); - final SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS); - int updates = 100000; - for (int i = 0; i < updates; i++) - { - executor.submit(() -> { - time.sleep(1, TimeUnit.MILLISECONDS); - rate.update(1); - }); - } - - executor.shutdown(); - - Assert.assertTrue(executor.awaitTermination(1, TimeUnit.MINUTES)); - Assert.assertEquals(1000, rate.get(TimeUnit.SECONDS), 100.0); - } -} diff --git a/test/unit/org/apache/cassandra/utils/TestTimeSource.java b/test/unit/org/apache/cassandra/utils/TestTimeSource.java deleted file mode 100644 index 4ecd086f38d5..000000000000 --- a/test/unit/org/apache/cassandra/utils/TestTimeSource.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.utils; - -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; - -public class TestTimeSource implements TimeSource -{ - private final AtomicLong timeInMillis = new AtomicLong(System.currentTimeMillis()); - - @Override - public long currentTimeMillis() - { - return timeInMillis.get(); - } - - @Override - public long nanoTime() - { - return timeInMillis.get() * 1_000_000; - } - - @Override - public TimeSource sleep(long sleepFor, TimeUnit unit) - { - long current = timeInMillis.get(); - long sleepInMillis = TimeUnit.MILLISECONDS.convert(sleepFor, unit); - boolean elapsed; - do - { - long newTime = current + sleepInMillis; - elapsed = timeInMillis.compareAndSet(current, newTime); - if (!elapsed) - { - long updated = timeInMillis.get(); - if (updated - current >= sleepInMillis) - { - elapsed = true; - } - else - { - sleepInMillis -= updated - current; - current = updated; - } - } - } - while (!elapsed); - return this; - } - - @Override - public TimeSource sleepUninterruptibly(long sleepFor, TimeUnit unit) - { - return sleep(sleepFor, unit); - } -} From 68984d8f6932c3d5ea56ba5e834213c9cd5dcf11 Mon Sep 17 00:00:00 2001 From: Jaroslaw Grabowski Date: Wed, 2 Jun 2021 11:10:44 +0200 Subject: [PATCH 007/151] STAR-561 selectors should contain elements with same type (#169) * selectors should contain elements with same type Expressions like "SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (t, WRITETIME(t))] FROM %s" can't be deserialized. 
If we try to extract the result by calling getRows we get: org.apache.cassandra.serializers.MarshalException: Expected 8 or 0 byte long This commit makes sure that all collection elements have the same type. Co-authored-by: Zhao Yang * test for Maps.getExactMapTypeIfKnown Maps are a bit different than Lists and Sets. Maps can't be selected without a type hint, which makes it impossible to exercise the query execution path with `getExactMapTypeIfKnown`. That is why tests for "map SELECTs" are missing from TermSelectionTest.java. The previous commit fixed `Maps.getExactMapTypeIfKnown` but it can't be tested in a similar way as for Lists and Sets. That's why this commit adds a separate unit test for `Maps.getExactMapTypeIfKnown`. Co-authored-by: Zhao Yang (cherry picked from commit 78b0117ac1a233de739ed8d17f72d300a81a5fea) --- src/java/org/apache/cassandra/cql3/Lists.java | 26 ++++- src/java/org/apache/cassandra/cql3/Maps.java | 27 ++++-- src/java/org/apache/cassandra/cql3/Sets.java | 4 +- .../cassandra/cql3/selection/Selectable.java | 23 +++++ .../org/apache/cassandra/cql3/CQLTester.java | 6 +- .../org/apache/cassandra/cql3/MapsTest.java | 75 +++++++++++++++ .../cql3/selection/TermSelectionTest.java | 95 ++++++++++++++++++- 7 files changed, 241 insertions(+), 15 deletions(-) create mode 100644 test/unit/org/apache/cassandra/cql3/MapsTest.java diff --git a/src/java/org/apache/cassandra/cql3/Lists.java b/src/java/org/apache/cassandra/cql3/Lists.java index 1d94d697a50e..cd45095ea5c4 100644 --- a/src/java/org/apache/cassandra/cql3/Lists.java +++ b/src/java/org/apache/cassandra/cql3/Lists.java @@ -128,8 +128,30 @@ public static String listToString(Iterable items, java.util.function.Func public static AbstractType getExactListTypeIfKnown(List items, java.util.function.Function> mapper) { - Optional> type = items.stream().map(mapper).filter(Objects::nonNull).findFirst(); - return type.isPresent() ? 
ListType.getInstance(type.get(), false) : null; + AbstractType type = getElementType(items, mapper); + return type != null ? ListType.getInstance(type, false) : null; + } + + protected static AbstractType getElementType(List items, + java.util.function.Function> mapper) + { + AbstractType type = null; + for (T item : items) + { + AbstractType itemType = mapper.apply(item); + if (itemType == null) + continue; + + if (type != null && !itemType.isCompatibleWith(type)) + { + if (type.isCompatibleWith(itemType)) + continue; + + throw new InvalidRequestException("Invalid collection literal: all selectors must have the same CQL type inside collection literals"); + } + type = itemType; + } + return type; } public static class Literal extends Term.Raw diff --git a/src/java/org/apache/cassandra/cql3/Maps.java b/src/java/org/apache/cassandra/cql3/Maps.java index 6e7e07b57601..a4c213c98a3a 100644 --- a/src/java/org/apache/cassandra/cql3/Maps.java +++ b/src/java/org/apache/cassandra/cql3/Maps.java @@ -134,16 +134,31 @@ public static AbstractType getExactMapTypeIfKnown(List> entrie AbstractType valueType = null; for (Pair entry : entries) { - if (keyType == null) - keyType = mapper.apply(entry.left); - if (valueType == null) - valueType = mapper.apply(entry.right); - if (keyType != null && valueType != null) - return MapType.getInstance(keyType, valueType, false); + keyType = selectType(keyType, mapper.apply(entry.left)); + valueType = selectType(valueType, mapper.apply(entry.right)); } + + if (keyType != null && valueType != null) + return MapType.getInstance(keyType, valueType, false); + return null; } + private static AbstractType selectType(AbstractType type, AbstractType otherType) + { + if (otherType == null) + return type; + + if (type != null && !otherType.isCompatibleWith(type)) + { + if (type.isCompatibleWith(otherType)) + return type; + + throw new InvalidRequestException("Invalid collection literal: all selectors must have the same CQL type inside collection 
literals"); + } + return otherType; + } + public static class Literal extends Term.Raw { public final List> entries; diff --git a/src/java/org/apache/cassandra/cql3/Sets.java b/src/java/org/apache/cassandra/cql3/Sets.java index aab4192587fb..e31841a583cc 100644 --- a/src/java/org/apache/cassandra/cql3/Sets.java +++ b/src/java/org/apache/cassandra/cql3/Sets.java @@ -119,8 +119,8 @@ public static String setToString(Iterable items, java.util.function.Funct public static AbstractType getExactSetTypeIfKnown(List items, java.util.function.Function> mapper) { - Optional> type = items.stream().map(mapper).filter(Objects::nonNull).findFirst(); - return type.isPresent() ? SetType.getInstance(type.get(), false) : null; + AbstractType type = Lists.getElementType(items, mapper); + return type != null ? SetType.getInstance(type, false) : null; } public static class Literal extends Term.Raw diff --git a/src/java/org/apache/cassandra/cql3/selection/Selectable.java b/src/java/org/apache/cassandra/cql3/selection/Selectable.java index de5360f52529..66759381ef5a 100644 --- a/src/java/org/apache/cassandra/cql3/selection/Selectable.java +++ b/src/java/org/apache/cassandra/cql3/selection/Selectable.java @@ -114,6 +114,23 @@ default ColumnSpecification specForElementOrSlice(Selectable selected, ColumnSpe } } + /** + * Checks that this {@code Selectable} is or can be converted into the specified type. 
+ * @param table the table schema + * @param type the expected type + * @throws InvalidRequestException if the {@code Selectable} can not be converted into the specified type + */ + default void validateType(TableMetadata table, AbstractType type) + { + ColumnSpecification receiver = new ColumnSpecification(table.keyspace, + table.name, + new ColumnIdentifier(toString(), true), + type); + + if (!testAssignment(table.keyspace, receiver).isAssignable()) + throw invalidRequest("%s is not of the expected type: %s", this, type.asCQL3Type()); + } + public interface Raw { public Selectable prepare(TableMetadata table); @@ -175,6 +192,8 @@ public Selector.Factory newSelectorFactory(TableMetadata table, AbstractType type = expectedType; if (type == null) throw new InvalidRequestException("Cannot infer type for term " + this + " in selection clause (try using a cast to force a type)"); + + validateType(table, type); } // The fact we default the name to "[selection]" inconditionally means that any bind marker in a @@ -624,6 +643,7 @@ public Factory newSelectorFactory(TableMetadata cfm, if (type == null) throw invalidRequest("Cannot infer type for term %s in selection clause (try using a cast to force a type)", this); + validateType(cfm, type); } if (selectables.size() == 1 && !type.isTuple()) @@ -742,6 +762,7 @@ public Factory newSelectorFactory(TableMetadata cfm, if (type == null) throw invalidRequest("Cannot infer type for term %s in selection clause (try using a cast to force a type)", this); + validateType(cfm, type); } ListType listType = (ListType) type; @@ -827,6 +848,7 @@ public Factory newSelectorFactory(TableMetadata cfm, if (type == null) throw invalidRequest("Cannot infer type for term %s in selection clause (try using a cast to force a type)", this); + validateType(cfm, type); } // The parser treats empty Maps as Sets so if the type is a MapType we know that the Map is empty @@ -931,6 +953,7 @@ public Factory newSelectorFactory(TableMetadata cfm, if (type == 
null) throw invalidRequest("Cannot infer type for term %s in selection clause (try using a cast to force a type)", this); + validateType(cfm, type); } if (type.isUDT()) diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java index 2f88f7c5dcb1..1c9150e20981 100644 --- a/test/unit/org/apache/cassandra/cql3/CQLTester.java +++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java @@ -77,6 +77,7 @@ import org.apache.cassandra.schema.IndexMetadata; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.service.ClientState; import org.apache.cassandra.service.QueryState; @@ -1137,10 +1138,13 @@ public static void assertRows(UntypedResultSet result, Object[]... rows) int i = 0; while (iter.hasNext() && i < rows.length) { + if (rows[i] == null) + throw new IllegalArgumentException(String.format("Invalid expected value for row: %d. A row cannot be null.", i)); + Object[] expected = rows[i]; UntypedResultSet.Row actual = iter.next(); - Assert.assertEquals(String.format("Invalid number of (expected) values provided for row %d", i), expected == null ? 1 : expected.length, meta.size()); + Assert.assertEquals(String.format("Invalid number of (expected) values provided for row %d", i), expected.length, meta.size()); for (int j = 0; j < meta.size(); j++) { diff --git a/test/unit/org/apache/cassandra/cql3/MapsTest.java b/test/unit/org/apache/cassandra/cql3/MapsTest.java new file mode 100644 index 000000000000..118485ad9191 --- /dev/null +++ b/test/unit/org/apache/cassandra/cql3/MapsTest.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3; + +import java.util.function.Function; + +import com.google.common.collect.ImmutableList; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.IntegerType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.NumberType; +import org.apache.cassandra.exceptions.InvalidRequestException; +import org.apache.cassandra.utils.Pair; + +public class MapsTest extends CQLTester +{ + private final Function, AbstractType> identityMapper = integerType -> integerType; + + @Rule + public ExpectedException thrown = ExpectedException.none(); + + @Test + public void testGetExactMapTypeIfKnownWithDifferentTypes() + { + thrown.expect(InvalidRequestException.class); + thrown.expectMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals"); + + Maps.getExactMapTypeIfKnown(ImmutableList.of( + Pair.create(Int32Type.instance, Int32Type.instance), + Pair.create(Int32Type.instance, IntegerType.instance) + ), identityMapper); + } + + @Test + public void testGetExactMapTypeIfKnownWithTheSameTypes() + { + AbstractType exactType = Maps.getExactMapTypeIfKnown(ImmutableList.of( + 
Pair.create(Int32Type.instance, Int32Type.instance), + Pair.create(Int32Type.instance, Int32Type.instance) + ), identityMapper); + + AbstractType expected = MapType.getInstance(Int32Type.instance, Int32Type.instance, false).freeze(); + Assert.assertEquals(expected, exactType); + } + + @Test + public void testGetExactMapTypeIfKnownWithoutTypes() + { + AbstractType exactType = Maps.getExactMapTypeIfKnown(ImmutableList.of(), identityMapper); + + Assert.assertNull(exactType); + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/cql3/selection/TermSelectionTest.java b/test/unit/org/apache/cassandra/cql3/selection/TermSelectionTest.java index fb46809ff48b..ae907cf55a7a 100644 --- a/test/unit/org/apache/cassandra/cql3/selection/TermSelectionTest.java +++ b/test/unit/org/apache/cassandra/cql3/selection/TermSelectionTest.java @@ -134,6 +134,10 @@ public void testSelectLiteral() throws Throwable row(list(set(1), set(3))), row(list(set(1), set(2))), row(list(set(1), set(1)))); + assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals", + "SELECT [{pk, t}, {ck}] FROM %s WHERE pk = 1"); + assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals", + "SELECT [{pk}, {t}] FROM %s WHERE pk = 1"); // Test Maps nested within Lists assertRows(execute("SELECT [{}, (map){'min' : min(ck), 'max' : max(ck)}] FROM %s"), @@ -154,10 +158,50 @@ public void testSelectLiteral() throws Throwable row(list(tuple(1, 3, timestampInMicros)))); assertRows(execute("SELECT [(min(ck), max(ck))] FROM %s"), row(list(tuple(1, 3)))); - assertRows(execute("SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (t, WRITETIME(t))] FROM %s"), - row(list(tuple(1L, 1L), tuple("one", timestampInMicros))), - row(list(tuple(1L, 2L), tuple("two", timestampInMicros))), - row(list(tuple(1L, 3L), tuple("three", timestampInMicros)))); + assertInvalidMessage("Invalid collection 
literal: all selectors must have the same CQL type inside collection literals", + "SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (t, writetime(t))] FROM %s"); + + assertRows(execute("SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT), t), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))] FROM %s"), + row(list(tuple(1L, 1L, "one"), tuple(1L, 1L))), + row(list(tuple(1L, 2L, "two"), tuple(1L, 2L))), + row(list(tuple(1L, 3L, "three"), tuple(1L, 3L)))); + + assertRows(execute("SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (CAST(pk AS BIGINT), CAST(ck AS BIGINT), t)] FROM %s"), + row(list(tuple(1L, 1L), tuple(1L, 1L, "one"))), + row(list(tuple(1L, 2L), tuple(1L, 2L, "two"))), + row(list(tuple(1L, 3L), tuple(1L, 3L, "three")))); + assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals", + "SELECT [(CAST(pk AS BIGINT), t, CAST(ck AS BIGINT)), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))] FROM %s"); + + // list of tuples of tuples + assertRows(execute("SELECT [((t,t, t), (t,t,CAST(ck AS BIGINT))), ((t,t), (t,t,CAST(ck AS BIGINT)))] FROM %s"), + row(list(tuple(tuple("one", "one", "one"), tuple("one", "one", 1L)), + tuple(tuple("one", "one"), tuple("one", "one", 1L)))), + row(list(tuple(tuple("two", "two", "two"), tuple("two", "two", 2L)), + tuple(tuple("two", "two"), tuple("two", "two", 2L)))), + row(list(tuple(tuple("three", "three", "three"), tuple("three", "three", 3L)), + tuple(tuple("three", "three"), tuple("three", "three", 3L))))); + + assertRows(execute("SELECT [((t,t), (t,t,CAST(ck AS BIGINT))), ((t,t), (t,t,CAST(ck AS BIGINT),t))] FROM %s"), + row(list(tuple(tuple("one", "one"), tuple("one", "one", 1L)), + tuple(tuple("one", "one"), tuple("one", "one", 1L, "one")))), + row(list(tuple(tuple("two", "two"), tuple("two", "two", 2L)), + tuple(tuple("two", "two"), tuple("two", "two", 2L, "two")))), + row(list(tuple(tuple("three", "three"), tuple("three", "three", 3L)), + tuple(tuple("three", "three"), 
tuple("three", "three", 3L, "three"))))); + + // single element tuple: tuple(t) incompatible with tuple(long, long) + assertInvalidMessage("(t) is not of the expected type: frozen>", + "SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (t)] FROM %s"); + + assertInvalidMessage("(cast(ck as bigint)) is not of the expected type: frozen>", + "SELECT [(t, t), (CAST(ck AS BIGINT))] FROM %s"); + + // single element tuple: tuple(long) compatible with tuple(long, long) + assertRows(execute("SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (CAST(ck AS BIGINT))] FROM %s"), + row(list(tuple(1L, 1L), tuple(1L))), + row(list(tuple(1L, 2L), tuple(2L))), + row(list(tuple(1L, 3L), tuple(3L)))); // Test UDTs nested within Lists String type = createType("CREATE TYPE %s(a int, b int, c bigint)"); @@ -189,6 +233,10 @@ public void testSelectLiteral() throws Throwable row(set(list(1), list(3)))); assertRows(execute("SELECT {([min(ck)]), [max(ck)]} FROM %s"), row(set(list(1), list(3)))); + assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals", + "SELECT {[min(ck), writetime(t)], [max(ck)]} FROM %s"); + assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals", + "SELECT {[writetime(t)], [max(ck)]} FROM %s"); // Test Sets nested within Sets assertRows(execute("SELECT {{}, {min(ck), max(ck)}} FROM %s"), @@ -223,6 +271,45 @@ public void testSelectLiteral() throws Throwable row(set(tuple(1, 3, timestampInMicros)))); assertRows(execute("SELECT {(min(ck), max(ck))} FROM %s"), row(set(tuple(1, 3)))); + assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals", + "SELECT {(min(ck), max(ck)), (t, writetime(t))} FROM %s"); + + assertRows(execute("SELECT {(CAST(pk AS BIGINT), CAST(ck AS BIGINT), t), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))} FROM %s"), + row(set(tuple(1L, 1L, "one"), tuple(1L, 1L))), + 
row(set(tuple(1L, 2L, "two"), tuple(1L, 2L))), + row(set(tuple(1L, 3L, "three"), tuple(1L, 3L)))); + + assertRows(execute("SELECT {(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (CAST(pk AS BIGINT), CAST(ck AS BIGINT), t)} FROM %s"), + row(set(tuple(1L, 1L), tuple(1L, 1L, "one"))), + row(set(tuple(1L, 2L), tuple(1L, 2L, "two"))), + row(set(tuple(1L, 3L), tuple(1L, 3L, "three")))); + + assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals", + "SELECT {(CAST(pk AS BIGINT), t, CAST(ck AS BIGINT)), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))} FROM %s"); + + assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals", + "SELECT {(CAST(pk AS BIGINT), t, CAST(ck AS BIGINT)), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))} FROM %s"); + + // set of tuples of tuples + assertRows(execute("SELECT {((t,t, t), (t,t,CAST(ck AS BIGINT))), ((t,t), (t,t,CAST(ck AS BIGINT)))} FROM %s"), + row(set(tuple(tuple("one", "one", "one"), tuple("one", "one", 1L)), + tuple(tuple("one", "one"), tuple("one", "one", 1L)))), + row(set(tuple(tuple("two", "two", "two"), tuple("two", "two", 2L)), + tuple(tuple("two", "two"), tuple("two", "two", 2L)))), + row(set(tuple(tuple("three", "three", "three"), tuple("three", "three", 3L)), + tuple(tuple("three", "three"), tuple("three", "three", 3L))))); + + assertRows(execute("SELECT {((t,t), (t,t,CAST(ck AS BIGINT))), ((t,t), (t,t,CAST(ck AS BIGINT),t))} FROM %s"), + row(set(tuple(tuple("one", "one"), tuple("one", "one", 1L)), + tuple(tuple("one", "one"), tuple("one", "one", 1L, "one")))), + row(set(tuple(tuple("two", "two"), tuple("two", "two", 2L)), + tuple(tuple("two", "two"), tuple("two", "two", 2L, "two")))), + row(set(tuple(tuple("three", "three"), tuple("three", "three", 3L)), + tuple(tuple("three", "three"), tuple("three", "three", 3L, "three"))))); + + // getExactType for (t) is null + assertInvalidMessage("(t) is not of the expected type: 
frozen>", + "SELECT {(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (t), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))} FROM %s"); // Test UDTs nested within Sets assertRows(execute("SELECT {(" + type + "){a : min(ck), b: max(ck)}} FROM %s"), From 47d1561b532e41c6de8517b8267b0292348e6762 Mon Sep 17 00:00:00 2001 From: Ruslan Fomkin Date: Wed, 2 Jun 2021 12:20:28 +0200 Subject: [PATCH 008/151] STAR-573 Make assassinate more resilient to missing tokens (#178) Co-authored-by: Robert Stupp (cherry picked from commit 4109c7377fe92e55cb8751d66b28b7c2499e4669) --- src/java/org/apache/cassandra/gms/Gossiper.java | 10 ++++++---- .../org/apache/cassandra/locator/TokenMetadata.java | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java b/src/java/org/apache/cassandra/gms/Gossiper.java index 39fe1fe7287c..5dd121831008 100644 --- a/src/java/org/apache/cassandra/gms/Gossiper.java +++ b/src/java/org/apache/cassandra/gms/Gossiper.java @@ -762,7 +762,6 @@ public void assassinateEndpoint(String address) throws UnknownHostException InetAddressAndPort endpoint = InetAddressAndPort.getByName(address); runInGossipStageBlocking(() -> { EndpointState epState = endpointStateMap.get(endpoint); - Collection tokens; logger.warn("Assassinating {} via gossip", endpoint); if (epState == null) @@ -787,6 +786,7 @@ else if (newState.getHeartBeatState().getHeartBeatVersion() != heartbeat) epState.getHeartBeatState().forceNewerGenerationUnsafe(); } + Collection tokens = null; try { tokens = StorageService.instance.getTokenMetadata().getTokens(endpoint); @@ -794,8 +794,10 @@ else if (newState.getHeartBeatState().getHeartBeatVersion() != heartbeat) catch (Throwable th) { JVMStabilityInspector.inspectThrowable(th); - // TODO this is broken - logger.warn("Unable to calculate tokens for {}. 
Will use a random one", address); + } + if (tokens == null || tokens.isEmpty()) + { + logger.warn("Trying to assassinate an endpoint {} that does not have any tokens assigned. This should not have happened, trying to continue with a random token.", address); tokens = Collections.singletonList(StorageService.instance.getTokenMetadata().partitioner.getRandomToken()); } @@ -1014,7 +1016,7 @@ void doStatusCheck() // to make sure that the previous read data was correct logger.info("Race condition marking {} as a FatClient; ignoring", endpoint); return; - } + } removeEndpoint(endpoint); // will put it in justRemovedEndpoints to respect quarantine delay evictFromMembership(endpoint); // can get rid of the state immediately }); diff --git a/src/java/org/apache/cassandra/locator/TokenMetadata.java b/src/java/org/apache/cassandra/locator/TokenMetadata.java index f2bbb9fe71eb..ab210457f0dd 100644 --- a/src/java/org/apache/cassandra/locator/TokenMetadata.java +++ b/src/java/org/apache/cassandra/locator/TokenMetadata.java @@ -564,11 +564,11 @@ public void removeFromMoving(InetAddressAndPort endpoint) public Collection getTokens(InetAddressAndPort endpoint) { assert endpoint != null; - assert isMember(endpoint); // don't want to return nulls lock.readLock().lock(); try { + assert isMember(endpoint); // don't want to return nulls return new ArrayList<>(tokenToEndpointMap.inverse().get(endpoint)); } finally From f90677c7649ea7288fc5bc58e747513f09f815af Mon Sep 17 00:00:00 2001 From: Stefania Date: Tue, 22 Aug 2017 17:41:39 +0800 Subject: [PATCH 009/151] STAR-563: Fix SIGSEGVs on aborted flush If a flush is aborted, e.g. by exception thrown by flushAllNonCFSBackedIndexesBlocking(), this was done by closing the flush writer, potentially concurrently with operations on it. The latter is unsafe and may cause writes to released memory. Fixed by adding an abort mechanism to the flush runnables. Port of DB-962 with an earlier commit. 
patch by Stefania Alborghetti; reviewed by Alex Petrov ported by Branimir Lambov; reviewed by Ruslan Fomkin (cherry picked from commit 8df735dd3b558dddeb927ac1f8b917ba4cab164c) --- .../cassandra/db/ColumnFamilyStore.java | 20 +- .../org/apache/cassandra/db/Memtable.java | 150 +++++++++++---- .../cassandra/io/util/SequentialWriter.java | 7 + .../org/apache/cassandra/db/MemtableTest.java | 172 ++++++++++++++++++ 4 files changed, 310 insertions(+), 39 deletions(-) create mode 100644 test/unit/org/apache/cassandra/db/MemtableTest.java diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java index 113a9164b4d0..6b49855ec2fe 100644 --- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java +++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java @@ -95,6 +95,7 @@ import static org.apache.cassandra.utils.Throwables.maybeFail; import static org.apache.cassandra.utils.Throwables.merge; +import static org.apache.cassandra.utils.Throwables.perform; public class ColumnFamilyStore implements ColumnFamilyStoreMBean { @@ -1132,12 +1133,29 @@ public Collection flushMemtable(Memtable memtable, boolean flushN if (flushNonCf2i) indexManager.flushAllNonCFSBackedIndexesBlocking(); + // It may be worthwhile to add an early abort mechanism here if one of the futures throws. + // In such a case this code will run the other threads to completion and only then abort the operation. 
flushResults = Lists.newArrayList(FBUtilities.waitOnFutures(futures)); } catch (Throwable t) { - t = memtable.abortRunnables(flushRunnables, t); + logger.error("Flushing {} failed with error", memtable.toString(), t); + if (flushRunnables != null) + { + for (Memtable.FlushRunnable runnable : flushRunnables) + t = runnable.abort(t); + } + + // wait for any flush runnables that were submitted (after aborting they should complete immediately) + // this ensures that the writers are aborted by FlushRunnable.writeSortedContents(), in the worst + // case we'll repeat the same exception twice if the initial exception was thrown whilst waiting + // on a future + t = perform(t, () -> FBUtilities.waitOnFutures(futures)); + + //finally abort the transaction t = txn.abort(t); + + // and re-throw throw Throwables.propagate(t); } diff --git a/src/java/org/apache/cassandra/db/Memtable.java b/src/java/org/apache/cassandra/db/Memtable.java index 73c64169f712..3186ffb92aeb 100644 --- a/src/java/org/apache/cassandra/db/Memtable.java +++ b/src/java/org/apache/cassandra/db/Memtable.java @@ -75,6 +75,8 @@ import org.apache.cassandra.utils.memory.NativePool; import org.apache.cassandra.utils.memory.SlabPool; +import static org.apache.cassandra.utils.Throwables.maybeFail; + public class Memtable implements Comparable { private static final Logger logger = LoggerFactory.getLogger(Memtable.class); @@ -319,6 +321,12 @@ private List createFlushRunnables(LifecycleTransaction txn) if (boundaries == null) return Collections.singletonList(new FlushRunnable(txn)); + return createFlushRunnables(boundaries, locations, txn); + } + + @VisibleForTesting + List createFlushRunnables(List boundaries, List locations, LifecycleTransaction txn) + { List runnables = new ArrayList<>(boundaries.size()); PartitionPosition rangeStart = cfs.getPartitioner().getMinimumToken().minKeyBound(); try @@ -333,16 +341,11 @@ private List createFlushRunnables(LifecycleTransaction txn) } catch (Throwable e) { - throw 
Throwables.propagate(abortRunnables(runnables, e)); - } - } + for (Memtable.FlushRunnable runnable : runnables) + e = runnable.abort(e); - public Throwable abortRunnables(List runnables, Throwable t) - { - if (runnables != null) - for (FlushRunnable runnable : runnables) - t = runnable.writer.abort(t); - return t; + throw Throwables.propagate(e); + } } public String toString() @@ -411,6 +414,22 @@ public void makeUnflushable() liveDataSize.addAndGet((long) 1024 * 1024 * 1024 * 1024 * 1024); } + /** + * The valid states for {@link FlushRunnable} writers. The thread writing the contents + * will transition from IDLE -> RUNNING and back to IDLE when finished using the writer + * or from ABORTING -> ABORTED if another thread has transitioned from RUNNING -> ABORTING. + * We can also transition directly from IDLE -> ABORTED. Whichever threads transitions + * to ABORTED is responsible to abort the writer. + */ + @VisibleForTesting + enum FlushRunnableWriterState + { + IDLE, // the runnable is idle, either not yet started or completed but with the writer waiting to be committed + RUNNING, // the runnable is executing, therefore the writer cannot be aborted or else a SEGV may ensue + ABORTING, // an abort request has been issued, this only happens if abort() is called whilst RUNNING + ABORTED // the writer has been aborted, no resources will be leaked + } + class FlushRunnable implements Callable { private final long estimatedSize; @@ -423,6 +442,8 @@ class FlushRunnable implements Callable private final PartitionPosition from; private final PartitionPosition to; + private final AtomicReference state; + FlushRunnable(PartitionPosition from, PartitionPosition to, Directories.DataDirectory flushLocation, LifecycleTransaction txn) { this(partitions.subMap(from, to), flushLocation, from, to, txn); @@ -439,6 +460,8 @@ class FlushRunnable implements Callable this.from = from; this.to = to; long keySize = 0; + state = new AtomicReference<>(FlushRunnableWriterState.IDLE); + for 
(PartitionPosition key : toFlush.keySet()) { // make sure we don't write non-sensical keys @@ -456,7 +479,6 @@ class FlushRunnable implements Callable writer = createFlushWriter(txn, cfs.newSSTableDescriptor(getDirectories().getWriteableLocationAsFile(estimatedSize)), columnsCollector.get(), statsCollector.get()); else writer = createFlushWriter(txn, cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(flushLocation)), columnsCollector.get(), statsCollector.get()); - } protected Directories getDirectories() @@ -466,44 +488,96 @@ protected Directories getDirectories() private void writeSortedContents() { - logger.info("Writing {}, flushed range = ({}, {}]", Memtable.this.toString(), from, to); + if (!state.compareAndSet(FlushRunnableWriterState.IDLE, FlushRunnableWriterState.RUNNING)) + { + logger.debug("Failed to write {}, flushed range = ({}, {}], state: {}", + Memtable.this.toString(), from, to, state); + return; + } + + logger.info("Writing {}, flushed range = ({}, {}], state: {}", + Memtable.this.toString(), from, to, state); - boolean trackContention = logger.isTraceEnabled(); int heavilyContendedRowCount = 0; - // (we can't clear out the map as-we-go to free up memory, - // since the memtable is being used for queries in the "pending flush" category) - for (AtomicBTreePartition partition : toFlush.values()) + try { - // Each batchlog partition is a separate entry in the log. And for an entry, we only do 2 - // operations: 1) we insert the entry and 2) we delete it. Further, BL data is strictly local, - // we don't need to preserve tombstones for repair. So if both operation are in this - // memtable (which will almost always be the case if there is no ongoing failure), we can - // just skip the entry (CASSANDRA-4667). 
- if (isBatchLogTable && !partition.partitionLevelDeletion().isLive() && partition.hasRows()) - continue; - - if (trackContention && partition.useLock()) - heavilyContendedRowCount++; - - if (!partition.isEmpty()) + boolean trackContention = logger.isTraceEnabled(); + // (we can't clear out the map as-we-go to free up memory, + // since the memtable is being used for queries in the "pending flush" category) + for (AtomicBTreePartition partition : toFlush.values()) { - try (UnfilteredRowIterator iter = partition.unfilteredIterator()) + if (state.get() == FlushRunnableWriterState.ABORTING) + break; + + // Each batchlog partition is a separate entry in the log. And for an entry, we only do 2 + // operations: 1) we insert the entry and 2) we delete it. Further, BL data is strictly local, + // we don't need to preserve tombstones for repair. So if both operation are in this + // memtable (which will almost always be the case if there is no ongoing failure), we can + // just skip the entry (CASSANDRA-4667). 
+ if (isBatchLogTable && !partition.partitionLevelDeletion().isLive() && partition.hasRows()) + continue; + + if (trackContention && partition.useLock()) + heavilyContendedRowCount++; + + if (!partition.isEmpty()) { - writer.append(iter); + try (UnfilteredRowIterator iter = partition.unfilteredIterator()) + { + writer.append(iter); + } } } } + finally + { + while (true) + { + if (state.compareAndSet(FlushRunnableWriterState.RUNNING, FlushRunnableWriterState.IDLE)) + { + long bytesFlushed = writer.getFilePointer(); + logger.info("Completed flushing {} ({}) for commitlog position {}", + writer.getFilename(), + FBUtilities.prettyPrintMemory(bytesFlushed), + commitLogUpperBound); + // Update the metrics + cfs.metric.bytesFlushed.inc(bytesFlushed); + + if (heavilyContendedRowCount > 0) + logger.trace("High update contention in {}/{} partitions of {} ", heavilyContendedRowCount, toFlush.size(), Memtable.this); + break; + } + else if (state.compareAndSet(FlushRunnableWriterState.ABORTING, FlushRunnableWriterState.ABORTED)) + { + logger.debug("Flushing of {} aborted", writer.getFilename()); + maybeFail(writer.abort(null)); + break; + } + } + } + } - long bytesFlushed = writer.getFilePointer(); - logger.info("Completed flushing {} ({}) for commitlog position {}", - writer.getFilename(), - FBUtilities.prettyPrintMemory(bytesFlushed), - commitLogUpperBound); - // Update the metrics - cfs.metric.bytesFlushed.inc(bytesFlushed); + public Throwable abort(Throwable throwable) + { + while (true) + { + if (state.compareAndSet(FlushRunnableWriterState.IDLE, FlushRunnableWriterState.ABORTED)) + { + logger.debug("Flushing of {} aborted", writer.getFilename()); + return writer.abort(throwable); + } + else if (state.compareAndSet(FlushRunnableWriterState.RUNNING, FlushRunnableWriterState.ABORTING)) + { + // thread currently executing writeSortedContents() will take care of aborting and throw any exceptions + return throwable; + } + } + } - if (heavilyContendedRowCount > 0) - 
logger.trace("High update contention in {}/{} partitions of {} ", heavilyContendedRowCount, toFlush.size(), Memtable.this); + @VisibleForTesting + FlushRunnableWriterState state() + { + return state.get(); } public SSTableMultiWriter createFlushWriter(LifecycleTransaction txn, diff --git a/src/java/org/apache/cassandra/io/util/SequentialWriter.java b/src/java/org/apache/cassandra/io/util/SequentialWriter.java index 9ad944be3bc0..a17135621786 100644 --- a/src/java/org/apache/cassandra/io/util/SequentialWriter.java +++ b/src/java/org/apache/cassandra/io/util/SequentialWriter.java @@ -384,6 +384,13 @@ public final Throwable commit(Throwable accumulate) return txnProxy.commit(accumulate); } + /** + * Stop the operation after errors, i.e. close and release all held resources. + * + * Do not use this to interrupt a write operation running in another thread. + * This is thread-unsafe, releasing and cleaning the buffer while it is being written can have disastrous + * consequences (e.g. SIGSEGV). + */ public final Throwable abort(Throwable accumulate) { return txnProxy.abort(accumulate); diff --git a/test/unit/org/apache/cassandra/db/MemtableTest.java b/test/unit/org/apache/cassandra/db/MemtableTest.java new file mode 100644 index 000000000000..63b27ed19932 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/MemtableTest.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.Semaphore; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; + +import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.db.commitlog.CommitLog; +import org.apache.cassandra.db.compaction.OperationType; +import org.apache.cassandra.db.lifecycle.LifecycleTransaction; +import org.apache.cassandra.io.sstable.SSTableMultiWriter; +import org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.concurrent.OpOrder; +import org.jboss.byteman.contrib.bmunit.BMRule; +import org.jboss.byteman.contrib.bmunit.BMUnitConfig; +import org.jboss.byteman.contrib.bmunit.BMUnitRunner; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + +@RunWith(BMUnitRunner.class) +@BMUnitConfig(debug = true) +public class MemtableTest extends CQLTester +{ + List ranges; + List locations; + ColumnFamilyStore cfs; + Memtable memtable; + ExecutorService executor; + int nThreads; + + @Before + public void setup() throws Throwable + { + createTable("CREATE TABLE %s (pk int PRIMARY KEY, value int)"); + + for (int i = 0; i < 10000; i++) + execute("INSERT INTO %s (pk, value) VALUES (?, ?)", i, i); + + cfs = 
getCurrentColumnFamilyStore(); + memtable = cfs.getTracker().getView().getCurrentMemtable(); + + OpOrder.Barrier barrier = cfs.keyspace.writeOrder.newBarrier(); + memtable.setDiscarding(barrier, new AtomicReference<>(CommitLog.instance.getCurrentPosition())); + barrier.issue(); + + ranges = new ArrayList<>(); + locations = new ArrayList<>(); + // this determines the number of flush writers created, the FlushRunnable will convert a null location into an sstable location for us + int rangeCount = 24; + for (int i = 0; i < rangeCount; ++i) + { + // split the range to ensure there are partitions to write + ranges.add(cfs.getPartitioner().split(cfs.getPartitioner().getMinimumToken(), + cfs.getPartitioner().getMaximumToken(), + (i+1) * 1.0 / rangeCount).minKeyBound()); + locations.add(null); + } + nThreads = locations.size() / 2; + executor = Executors.newFixedThreadPool(nThreads); + } + + @Test + public void testAbortingFlushRunnablesWithoutStarting() throws Throwable + { + // abort without starting + try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.FLUSH)) + { + List flushRunnables = memtable.createFlushRunnables(ranges, locations, txn); + assertNotNull(flushRunnables); + + for (Memtable.FlushRunnable flushRunnable : flushRunnables) + assertEquals(Memtable.FlushRunnableWriterState.IDLE, flushRunnable.state()); + + for (Memtable.FlushRunnable flushRunnable : flushRunnables) + assertNull(flushRunnable.abort(null)); + + for (Memtable.FlushRunnable flushRunnable : flushRunnables) + assertEquals(Memtable.FlushRunnableWriterState.ABORTED, flushRunnable.state()); + } + } + + static Semaphore stopSignal = null; + static Semaphore continueSignal; + + public static void stopAndWait() throws InterruptedException + { + if (stopSignal != null) + { + stopSignal.release(); + continueSignal.acquire(); + } + } + + @Test + @BMRule(name = "Wait before loop", + targetClass = "Memtable$FlushRunnable", + targetMethod = "writeSortedContents", + targetLocation = "AT 
INVOKE Logger.isTraceEnabled()", + action = "org.apache.cassandra.db.MemtableTest.stopAndWait()") + public void testAbortingFlushRunnablesAfterStarting() throws Throwable + { + // abort after starting + try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.FLUSH)) + { + List flushRunnables = memtable.createFlushRunnables(ranges, locations, txn); + + stopSignal = new Semaphore(0); + continueSignal = new Semaphore(0); + + List> futures = flushRunnables.stream().map(executor::submit).collect(Collectors.toList()); + + stopSignal.acquire(nThreads); + for (Memtable.FlushRunnable flushRunnable : flushRunnables) + assertNull(flushRunnable.abort(null)); + continueSignal.release(flushRunnables.size()); // release all, including the ones that have not started yet + + FBUtilities.waitOnFutures(futures); + + for (Memtable.FlushRunnable flushRunnable : flushRunnables) + assertEquals(Memtable.FlushRunnableWriterState.ABORTED, flushRunnable.state()); + } + } + + @Test + public void testAbortingFlushRunnablesBeforeStarting() throws Throwable + { + // abort before starting + try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.FLUSH)) + { + List flushRunnables = memtable.createFlushRunnables(ranges, locations, txn); + + for (Memtable.FlushRunnable flushRunnable : flushRunnables) + assertNull(flushRunnable.abort(null)); + + List> futures = flushRunnables.stream().map(executor::submit).collect(Collectors.toList()); + + FBUtilities.waitOnFutures(futures); + + for (Memtable.FlushRunnable flushRunnable : flushRunnables) + assertEquals(Memtable.FlushRunnableWriterState.ABORTED, flushRunnable.state()); + } + } +} From 12db5848f0477db8945c79ec23537dcdb9881180 Mon Sep 17 00:00:00 2001 From: Zhao Yang Date: Wed, 4 Oct 2017 05:14:44 -0500 Subject: [PATCH 010/151] STAR-566: Test for wrap-around in estimatedKeysForRanges This ports the up-to-date version of the test introduced by DB-1157 patch by Zhao Yang; reviewed by Branimir Lambov ported by Branimir 
Lambov (cherry picked from commit beec1105e96f5f025d61bbe35088e518820c6e00) --- .../sstable/format/big/BigTableScanner.java | 2 +- .../io/sstable/SSTableReaderTest.java | 86 ++++++++++++++++++- 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java index 20105cd1e14c..6644b3b8cff1 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java +++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java @@ -164,7 +164,7 @@ private static void addRange(SSTableReader sstable, AbstractBounds left, right; left = requested.leftBoundary(); right = requested.rightBoundary(); diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java index 0b64028c4d72..6ba942a7be1d 100644 --- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java @@ -26,6 +26,7 @@ import java.util.concurrent.*; import com.google.common.collect.Sets; +import org.junit.After; import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; @@ -44,6 +45,7 @@ import org.apache.cassandra.db.lifecycle.View; import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators; import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.LocalPartitioner.LocalToken; import org.apache.cassandra.dht.Range; @@ -90,7 +92,9 @@ public static void defineSchema() throws Exception SchemaLoader.createKeyspace(KEYSPACE1, KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD), - SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2), + SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2) + .minIndexInterval(8) + 
.maxIndexInterval(8), // ensure close key count estimation SchemaLoader.standardCFMD(KEYSPACE1, CF_COMPRESSED).compression(CompressionParams.DEFAULT), SchemaLoader.compositeIndexCFMD(KEYSPACE1, CF_INDEXED, true), SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLOWINDEXINTERVAL) @@ -99,6 +103,12 @@ public static void defineSchema() throws Exception .caching(CachingParams.CACHE_NOTHING)); } + @After + public void Cleanup() { + Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD).truncateBlocking(); + Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD2).truncateBlocking(); + } + @Test public void testGetPositionsForRanges() { @@ -140,6 +150,80 @@ public void testGetPositionsForRanges() } } + @Test + public void testEstimatedKeysForRangesAndKeySamples() + { + // prepare data + Keyspace keyspace = Keyspace.open(KEYSPACE1); + ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2"); + partitioner = store.getPartitioner(); + + Random random = new Random(); + List tokens = new ArrayList<>(); + tokens.add(partitioner.getMinimumToken()); + if (partitioner.splitter().isPresent()) + tokens.add(partitioner.getMaximumToken()); + + for (int j = 0; j < 100; j++) + { + Mutation mutation = new RowUpdateBuilder(store.metadata(), j, String.valueOf(random.nextInt())).clustering("0") + .add("val", + ByteBufferUtil.EMPTY_BYTE_BUFFER) + .build(); + if (j % 4 != 0) // skip some keys + mutation.applyUnsafe(); + tokens.add(mutation.key().getToken()); + } + + store.forceBlockingFlush(); + assertEquals(1, store.getLiveSSTables().size()); + SSTableReader sstable = store.getLiveSSTables().iterator().next(); + + // verify any combination of start and end point among the keys we have, which includes empty, full and + // wrap-around ranges + for (int i = 0; i < tokens.size(); i++) + for (int j = 0; j < tokens.size(); j++) + { + verifyEstimatedKeysAndKeySamples(sstable, new Range(tokens.get(i), tokens.get(j))); + } + } + + private void 
verifyEstimatedKeysAndKeySamples(SSTableReader sstable, Range range) + { + List expectedKeys = new ArrayList<>(); + try (ISSTableScanner scanner = sstable.getScanner()) + { + while (scanner.hasNext()) + { + try (UnfilteredRowIterator rowIterator = scanner.next()) + { + if (range.contains(rowIterator.partitionKey().getToken())) + expectedKeys.add(rowIterator.partitionKey()); + } + } + } + + // check estimated key + long estimated = sstable.estimatedKeysForRanges(Collections.singleton(range)); + assertTrue("Range: " + range + " having " + expectedKeys.size() + " partitions, but estimated " + + estimated, closeEstimation(expectedKeys.size(), estimated)); + + // check key samples + List sampledKeys = new ArrayList<>(); + sstable.getKeySamples(range).forEach(sampledKeys::add); + + assertTrue("Range: " + range + " having " + expectedKeys + " keys, but keys sampled: " + + sampledKeys, expectedKeys.containsAll(sampledKeys)); + // no duplicate + assertEquals(expectedKeys.size(), expectedKeys.stream().distinct().count()); + assertEquals(sampledKeys.size(), sampledKeys.stream().distinct().count()); + } + + private boolean closeEstimation(long expected, long estimated) + { + return expected <= estimated + 16 && expected >= estimated - 16; + } + @Test public void testSpannedIndexPositions() throws IOException { From b8eb62df7dbedb58ab4ef5624cb75e2cb507c373 Mon Sep 17 00:00:00 2001 From: Jakub Zytka Date: Wed, 2 Jun 2021 17:28:44 +0200 Subject: [PATCH 011/151] STAR-692: test exposing Date type overflow when using functions (cherry picked from commit 3dcd047cb2f6cd0a39fbe547372a73142d80e71c) --- .../cassandra/cql3/functions/OperationFctsTest.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/unit/org/apache/cassandra/cql3/functions/OperationFctsTest.java b/test/unit/org/apache/cassandra/cql3/functions/OperationFctsTest.java index c8ee9352e944..b8a8e37d75d7 100644 --- a/test/unit/org/apache/cassandra/cql3/functions/OperationFctsTest.java +++ 
b/test/unit/org/apache/cassandra/cql3/functions/OperationFctsTest.java @@ -850,6 +850,16 @@ public void testOperationsWithDuration() throws Throwable "SELECT time / 10m FROM %s WHERE pk = 1"); assertInvalidMessage("the operation 'date - duration' failed: The duration must have a day precision. Was: 10m", "SELECT * FROM %s WHERE pk = 1 AND time > ? - 10m", toDate("2016-10-04")); + + // test overflow errors + assertInvalidMessage("is greater than max supported date", + "INSERT INTO %s (pk, time, v) VALUES (2, '+5881581-01-01', 7)"); + assertInvalidMessage("is greater than max supported date", + "INSERT INTO %s (pk, time, v) VALUES (4, '+5881580-01-01' + 1y, 9)"); + assertInvalidMessage("is less than min supported date", + "INSERT INTO %s (pk, time, v) VALUES (3, '-5877642-01-01', 8)"); + assertInvalidMessage("is less than min supported date", + "INSERT INTO %s (pk, time, v) VALUES (5, '-5877640-01-01' - 2y, 10)"); } private Date toTimestamp(String timestampAsString) From 2a9a89b6e7e7da4a3653da78449daa14ef462e00 Mon Sep 17 00:00:00 2001 From: Jakub Zytka Date: Wed, 2 Jun 2021 17:42:24 +0200 Subject: [PATCH 012/151] STAR-692: protect Date type from overflows regardless of whether it is constructed from date string or from millis since epoch Co-authored-by: Ulises Cervino Beresi (cherry picked from commit 7bcbf8444f853c812fc21fa9187867f17ddd5c4c) --- .../serializers/SimpleDateSerializer.java | 26 +++++++++++++------ .../serializers/SimpleDateSerializerTest.java | 13 ++++++++++ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/java/org/apache/cassandra/serializers/SimpleDateSerializer.java b/src/java/org/apache/cassandra/serializers/SimpleDateSerializer.java index c367705fc7d3..4bcb2f94dbab 100644 --- a/src/java/org/apache/cassandra/serializers/SimpleDateSerializer.java +++ b/src/java/org/apache/cassandra/serializers/SimpleDateSerializer.java @@ -70,14 +70,7 @@ public static int dateStringToDays(String source) throws MarshalException { LocalDate parsed = 
formatter.parse(source, LocalDate::from); long millis = parsed.atStartOfDay(UTC).toInstant().toEpochMilli(); - if (millis < minSupportedDateMillis) - throw new MarshalException(String.format("Input date %s is less than min supported date %s", source, - ZonedDateTime.ofInstant(Instant.ofEpochMilli(minSupportedDateMillis), UTC).toString())); - if (millis > maxSupportedDateMillis) - throw new MarshalException(String.format("Input date %s is greater than max supported date %s", source, - ZonedDateTime.ofInstant(Instant.ofEpochMilli(maxSupportedDateMillis), UTC).toString())); - - return timeInMillisToDay(millis); + return timeInMillisToDay(source, millis); } catch (DateTimeParseException| ArithmeticException e1) { @@ -107,6 +100,23 @@ private static int parseRaw(String source) { public static int timeInMillisToDay(long millis) { + return timeInMillisToDay(null, millis); + } + + private static int timeInMillisToDay(String source, long millis) + { + if (millis < minSupportedDateMillis) + { + throw new MarshalException(String.format("Input date %s is less than min supported date %s", + null == source ? ZonedDateTime.ofInstant(Instant.ofEpochMilli(millis), UTC).toLocalDate() : source, + ZonedDateTime.ofInstant(Instant.ofEpochMilli(minSupportedDateMillis), UTC).toLocalDate())); + } + if (millis > maxSupportedDateMillis) + { + throw new MarshalException(String.format("Input date %s is greater than max supported date %s", + null == source ? 
ZonedDateTime.ofInstant(Instant.ofEpochMilli(millis), UTC).toLocalDate() : source, + ZonedDateTime.ofInstant(Instant.ofEpochMilli(maxSupportedDateMillis), UTC).toLocalDate())); + } return (int) (Duration.ofMillis(millis).toDays() - Integer.MIN_VALUE); } diff --git a/test/unit/org/apache/cassandra/serializers/SimpleDateSerializerTest.java b/test/unit/org/apache/cassandra/serializers/SimpleDateSerializerTest.java index 9c1ef886f96d..8502fbde7659 100644 --- a/test/unit/org/apache/cassandra/serializers/SimpleDateSerializerTest.java +++ b/test/unit/org/apache/cassandra/serializers/SimpleDateSerializerTest.java @@ -26,6 +26,7 @@ import java.text.SimpleDateFormat; import java.time.temporal.ChronoUnit; import java.util.*; +import java.util.concurrent.TimeUnit; public class SimpleDateSerializerTest { @@ -152,4 +153,16 @@ public void testBadDayToMonth() { Integer days = SimpleDateSerializer.dateStringToDays("1000-09-31"); } + + @Test(expected = MarshalException.class) + public void testOutOfBoundsHighMillis() + { + SimpleDateSerializer.timeInMillisToDay(TimeUnit.DAYS.toMillis(Integer.MAX_VALUE) + 1); + } + + @Test(expected = MarshalException.class) + public void testOutOfBoundsLowMillis() + { + SimpleDateSerializer.timeInMillisToDay(TimeUnit.DAYS.toMillis(Integer.MIN_VALUE) - 1L); + } } From 58d49ab93f70d22df6772c8f708803a782d368d5 Mon Sep 17 00:00:00 2001 From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com> Date: Mon, 7 Jun 2021 15:05:36 +0200 Subject: [PATCH 013/151] STAR-749: Fixed ArrayIndexOutOfBoundsException in FunctionResource#fromName (#168) The problematic syntax was a function name with empty argument list. 
Co-authored-by: kamlesh ghoradkar (cherry picked from commit 5d7df12d037759b7410fab182c49e60f6abde03e) --- .../cassandra/auth/FunctionResource.java | 3 +- .../cassandra/auth/FunctionResourceTest.java | 106 ++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 test/unit/org/apache/cassandra/auth/FunctionResourceTest.java diff --git a/src/java/org/apache/cassandra/auth/FunctionResource.java b/src/java/org/apache/cassandra/auth/FunctionResource.java index 61c6a2966694..d47c019cf543 100644 --- a/src/java/org/apache/cassandra/auth/FunctionResource.java +++ b/src/java/org/apache/cassandra/auth/FunctionResource.java @@ -18,6 +18,7 @@ package org.apache.cassandra.auth; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Optional; import java.util.Set; @@ -187,7 +188,7 @@ public static FunctionResource fromName(String name) return keyspace(parts[1]); String[] nameAndArgs = StringUtils.split(parts[2], "[|]"); - return function(parts[1], nameAndArgs[0], argsListFromString(nameAndArgs[1])); + return function(parts[1], nameAndArgs[0], nameAndArgs.length > 1 ? argsListFromString(nameAndArgs[1]) : Collections.emptyList()); } /** diff --git a/test/unit/org/apache/cassandra/auth/FunctionResourceTest.java b/test/unit/org/apache/cassandra/auth/FunctionResourceTest.java new file mode 100644 index 000000000000..89863958b8f8 --- /dev/null +++ b/test/unit/org/apache/cassandra/auth/FunctionResourceTest.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.auth; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.TypeParser; + +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; +import static org.junit.Assert.assertEquals; + +public class FunctionResourceTest +{ + private static final String ks = "fr_ks"; + private static final String func = "functions"; + private static final String name = "concat"; + private static final String varType = "org.apache.cassandra.db.marshal.UTF8Type"; + + @Test + public void testFunction() throws Exception + { + FunctionResource expected = FunctionResource.root(); + FunctionResource actual = FunctionResource.fromName(func); + assertEquals(expected, actual); + assertEquals(expected.getName(), actual.getName()); + } + + @Test + public void testFunctionKeyspace() throws Exception + { + FunctionResource expected = FunctionResource.keyspace(ks); + FunctionResource actual = FunctionResource.fromName(String.format("%s/%s", func, ks)); + assertEquals(expected, actual); + assertEquals(expected.getKeyspace(), actual.getKeyspace()); + } + + @Test + public void testFunctionWithSingleInputParameter() throws Exception + { + List> argTypes = new ArrayList<>(); + argTypes.add(TypeParser.parse(varType)); + FunctionResource expected = FunctionResource.function(ks, name, argTypes); + FunctionResource actual = FunctionResource.fromName(String.format("%s/%s/%s[%s]", func, ks, name, varType)); + 
assertEquals(expected, actual); + assertEquals(expected.getKeyspace(), actual.getKeyspace()); + } + + @Test + public void testFunctionWithMultipleInputParameter() throws Exception + { + List> argTypes = new ArrayList<>(); + argTypes.add(TypeParser.parse(varType)); + argTypes.add(TypeParser.parse(varType)); + FunctionResource expected = FunctionResource.function(ks, name, argTypes); + FunctionResource actual = FunctionResource.fromName(String.format("%s/%s/%s[%s^%s]", func, ks, name, varType, varType)); + assertEquals(expected, actual); + assertEquals(expected.getKeyspace(), actual.getKeyspace()); + } + + @Test + public void testFunctionWithoutInputParameter() throws Exception + { + List> argTypes = new ArrayList<>(); + FunctionResource expected = FunctionResource.function(ks, name, argTypes); + FunctionResource actual = FunctionResource.fromName(String.format("%s/%s/%s[]", func, ks, name)); + assertEquals(expected, actual); + assertEquals(expected.getKeyspace(), actual.getKeyspace()); + } + + @Test + public void testInvalidFunctionName() + { + String expected = "functions_test is not a valid function resource name"; + assertThatExceptionOfType(IllegalArgumentException.class) + .describedAs(expected) + .isThrownBy(() -> FunctionResource.fromName("functions_test")); + } + + @Test + public void testFunctionWithInvalidInput() + { + String expected = String.format("%s/%s/%s[%s]/test is not a valid function resource name", func, ks, name, varType); + assertThatExceptionOfType(IllegalArgumentException.class) + .describedAs(expected) + .isThrownBy(() -> FunctionResource.fromName(String.format("%s/%s/%s[%s]/test", func, ks, name, varType))); + } +} \ No newline at end of file From 90d17f0473d67a3d58fb558e0a731a656472770b Mon Sep 17 00:00:00 2001 From: dan jatnieks Date: Thu, 10 Jun 2021 17:27:39 -0700 Subject: [PATCH 014/151] STAR-578 avoid copying EMPTY_STATIC_ROW to heap with offheap memtable(#523) (#188) patch by Zhao Yang; reviewed by Robert Stupp for DB-1375 
Co-authored-by: Zhao Yang (cherry picked from commit d09bc6624a657b3e0a37ce7f4c1b5e6a0fc82c91) --- src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java b/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java index 71c17934cbd6..6893fb0ac985 100644 --- a/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java +++ b/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java @@ -57,8 +57,9 @@ public DecoratedKey applyToPartitionKey(DecoratedKey key) public Row applyToRow(Row row) { - if (row == null) - return null; + // If current "row" is Rows.EMPTY_STATIC_ROW, don't copy it again, as "copied_empty_static_row" != EMPTY_STATIC_ROW + if (row == null || row == Rows.EMPTY_STATIC_ROW) + return row; return Rows.copy(row, HeapAllocator.instance.cloningBTreeRowBuilder()).build(); } From 18a8ce994d87ec242639fc46d3750198bd94b46a Mon Sep 17 00:00:00 2001 From: Ruslan Fomkin Date: Fri, 11 Jun 2021 11:10:57 +0200 Subject: [PATCH 015/151] STAR-582 avoid assertion when repairing 1 node cluster (#185) Porting patch DB-1511, riptano/apollo#627 Co-authored-by: Zhao Yang (cherry picked from commit 998eca09654c6f571c6479333d8bbef3a74a7707) --- src/java/org/apache/cassandra/repair/RepairRunnable.java | 1 + src/java/org/apache/cassandra/service/StorageService.java | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/cassandra/repair/RepairRunnable.java b/src/java/org/apache/cassandra/repair/RepairRunnable.java index 5ada11661ede..fa64b85c0f2e 100644 --- a/src/java/org/apache/cassandra/repair/RepairRunnable.java +++ b/src/java/org/apache/cassandra/repair/RepairRunnable.java @@ -326,6 +326,7 @@ private NeighborsAndRanges getNeighborsAndRanges() EndpointsForRange neighbors = ActiveRepairService.getNeighbors(keyspace, keyspaceLocalRanges, range, options.getDataCenters(), options.getHosts()); + // local 
RF = 1 or given range is not part of local range, neighbors would be empty. if (neighbors.isEmpty()) { if (options.ignoreUnreplicatedKeyspaces()) diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index a8bbef76c595..92b57d9b186b 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -4001,8 +4001,11 @@ else if (option.isInLocalDCOnly()) Iterables.addAll(option.getRanges(), getLocalReplicas(keyspace).onlyFull().ranges()); } } - if (option.getRanges().isEmpty() || Keyspace.open(keyspace).getReplicationStrategy().getReplicationFactor().allReplicas < 2) + if (option.getRanges().isEmpty() || Keyspace.open(keyspace).getReplicationStrategy().getReplicationFactor().allReplicas < 2 + || tokenMetadata.getAllEndpoints().size() < 2) + { return Pair.create(0, Futures.immediateFuture(null)); + } int cmd = nextRepairCommand.incrementAndGet(); return Pair.create(cmd, ActiveRepairService.repairCommandExecutor().submit(createRepairTask(cmd, keyspace, option, listeners))); From 5d10d16c88458a1ae655e59575658b7cd74c3ea6 Mon Sep 17 00:00:00 2001 From: Jaroslaw Grabowski Date: Mon, 14 Jun 2021 11:12:38 +0200 Subject: [PATCH 016/151] STAR-571 fix *SnichTests (#197) Migrate to commit log initialization implemented by ServerTestUtils that is race free. Files were removed by the cleanup that followed CL init. 
(cherry picked from commit 37e2e3d52aae414c521110efdf5bf1bb20c37fad) --- .../org/apache/cassandra/locator/AlibabaCloudSnitchTest.java | 5 ++--- .../org/apache/cassandra/locator/CloudstackSnitchTest.java | 5 ++--- test/unit/org/apache/cassandra/locator/EC2SnitchTest.java | 5 ++--- .../org/apache/cassandra/locator/GoogleCloudSnitchTest.java | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/test/unit/org/apache/cassandra/locator/AlibabaCloudSnitchTest.java b/test/unit/org/apache/cassandra/locator/AlibabaCloudSnitchTest.java index fb85a23f4732..809037e283d9 100644 --- a/test/unit/org/apache/cassandra/locator/AlibabaCloudSnitchTest.java +++ b/test/unit/org/apache/cassandra/locator/AlibabaCloudSnitchTest.java @@ -25,6 +25,7 @@ import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.commitlog.CommitLog; @@ -47,9 +48,7 @@ public static void setup() throws Exception { System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true"); DatabaseDescriptor.daemonInitialization(); - CommitLog.instance.start(); - mkdirs(); - cleanup(); + ServerTestUtils.cleanupAndLeaveDirs(); Keyspace.setInitialized(); StorageService.instance.initServer(0); } diff --git a/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java b/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java index 9e39c48abde0..7f7a07e3f374 100644 --- a/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java +++ b/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java @@ -26,6 +26,7 @@ import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.commitlog.CommitLog; @@ -48,9 +49,7 @@ public static void setup() throws Exception { 
System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true"); DatabaseDescriptor.daemonInitialization(); - CommitLog.instance.start(); - mkdirs(); - cleanup(); + ServerTestUtils.cleanupAndLeaveDirs(); Keyspace.setInitialized(); StorageService.instance.initServer(0); } diff --git a/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java b/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java index f05f4a274b20..0cc819c609d4 100644 --- a/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java +++ b/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java @@ -31,6 +31,7 @@ import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.commitlog.CommitLog; @@ -62,9 +63,7 @@ public static void setup() throws Exception { System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true"); DatabaseDescriptor.daemonInitialization(); - CommitLog.instance.start(); - mkdirs(); - cleanup(); + ServerTestUtils.cleanupAndLeaveDirs(); Keyspace.setInitialized(); StorageService.instance.initServer(0); } diff --git a/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java b/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java index e524f3a531e6..5a8589022241 100644 --- a/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java +++ b/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java @@ -27,6 +27,7 @@ import org.junit.BeforeClass; import org.junit.Test; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.commitlog.CommitLog; @@ -49,9 +50,7 @@ public static void setup() throws Exception { System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true"); DatabaseDescriptor.daemonInitialization(); - CommitLog.instance.start(); - mkdirs(); - 
cleanup(); + ServerTestUtils.cleanupAndLeaveDirs(); Keyspace.setInitialized(); StorageService.instance.initServer(0); } From a75b01a98120636d051f53301f82b4fff08c9fb1 Mon Sep 17 00:00:00 2001 From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com> Date: Tue, 15 Jun 2021 19:31:44 +0200 Subject: [PATCH 017/151] STAR-745: Recreate BF on fp chance change and limit the total memory usage (#177) * STAR-745: Add BloomFilter global memory limit Added a memory limiter which is a global instance (static field in BloomFilter) which monitors the total memory used by all created (and deserialized) Bloom filters. When the limit is reached we return a dummy always-true filter with a log error message. * STAR-745: Recreate Bloom filter on startup In certain situations BF will be recreated on startup (for example, if FP chance changed more than the defined tolerance). Reviewed by: Daniel Jatnieks Co-authored-by: Stefania Alborghetti (cherry picked from commit c1a2595e556cb325fe915febd6c57b77955a09d8) --- .../cassandra/db/compaction/Verifier.java | 4 +- .../io/sstable/format/SSTableReader.java | 2 +- .../sstable/format/SSTableReaderBuilder.java | 109 +++++++---- .../io/sstable/format/big/BigTableWriter.java | 2 +- .../sstable/metadata/IMetadataSerializer.java | 7 + .../sstable/metadata/MetadataSerializer.java | 8 + .../apache/cassandra/utils/BloomFilter.java | 46 ++++- .../utils/BloomFilterSerializer.java | 37 +++- .../apache/cassandra/utils/FilterFactory.java | 43 +++-- .../cassandra/utils/obs/MemoryLimiter.java | 72 ++++++++ .../cassandra/utils/obs/OffHeapBitSet.java | 51 +++-- .../BloomFilterSerializerBench.java | 4 +- test/unit/org/apache/cassandra/Util.java | 26 ++- .../io/sstable/SSTableReaderTest.java | 148 ++++++++++++++- .../cassandra/utils/BloomFilterTest.java | 174 ++++++++++++++---- .../cassandra/utils/SerializationsTest.java | 8 +- .../utils/obs/OffHeapBitSetTest.java | 25 +-- 17 files changed, 634 insertions(+), 132 deletions(-) create mode 100644 
src/java/org/apache/cassandra/utils/obs/MemoryLimiter.java diff --git a/src/java/org/apache/cassandra/db/compaction/Verifier.java b/src/java/org/apache/cassandra/db/compaction/Verifier.java index 30e74adb4b35..68d5163e4d85 100644 --- a/src/java/org/apache/cassandra/db/compaction/Verifier.java +++ b/src/java/org/apache/cassandra/db/compaction/Verifier.java @@ -43,7 +43,7 @@ import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.StorageService; -import org.apache.cassandra.utils.BloomFilterSerializer; +import org.apache.cassandra.utils.BloomFilter; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.IFilter; @@ -448,7 +448,7 @@ private void deserializeBloomFilter(SSTableReader sstable) throws IOException if (Files.exists(bfPath)) { try (DataInputStream stream = new DataInputStream(new BufferedInputStream(Files.newInputStream(bfPath))); - IFilter bf = BloomFilterSerializer.deserialize(stream, sstable.descriptor.version.hasOldBfFormat())) + IFilter bf = BloomFilter.serializer.deserialize(stream, sstable.descriptor.version.hasOldBfFormat())) { } } diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java index ea40f34f265b..258b004871bf 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java @@ -740,7 +740,7 @@ public static void saveBloomFilter(Descriptor descriptor, IFilter filter) File filterFile = new File(descriptor.filenameFor(Component.FILTER)); try (DataOutputStreamPlus stream = new BufferedDataOutputStreamPlus(new FileOutputStream(filterFile))) { - BloomFilterSerializer.serialize((BloomFilter) filter, stream); + BloomFilter.serializer.serialize((BloomFilter) filter, stream); stream.flush(); } catch (IOException 
e) diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java index 8fe1deff9e4e..e5abcf834e48 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java @@ -25,6 +25,7 @@ import org.apache.cassandra.db.RowIndexEntry; import org.apache.cassandra.db.SerializationHeader; import org.apache.cassandra.io.sstable.*; +import org.apache.cassandra.io.sstable.metadata.MetadataType; import org.apache.cassandra.io.sstable.metadata.StatsMetadata; import org.apache.cassandra.io.sstable.metadata.ValidationMetadata; import org.apache.cassandra.io.util.DiskOptimizationStrategy; @@ -43,10 +44,13 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.file.Files; +import java.nio.file.Path; import java.nio.file.Paths; import java.util.Set; import java.util.concurrent.TimeUnit; +import com.google.common.collect.ImmutableMap; + public abstract class SSTableReaderBuilder { private static final Logger logger = LoggerFactory.getLogger(SSTableReaderBuilder.class); @@ -89,30 +93,6 @@ public SSTableReaderBuilder(Descriptor descriptor, public abstract SSTableReader build(); - public SSTableReaderBuilder dfile(FileHandle dfile) - { - this.dfile = dfile; - return this; - } - - public SSTableReaderBuilder ifile(FileHandle ifile) - { - this.ifile = ifile; - return this; - } - - public SSTableReaderBuilder bf(IFilter bf) - { - this.bf = bf; - return this; - } - - public SSTableReaderBuilder summary(IndexSummary summary) - { - this.summary = summary; - return this; - } - /** * Load index summary, first key and last key from Summary.db file if it exists. * @@ -226,17 +206,29 @@ void buildSummaryAndBloomFilter(boolean recreateBloomFilter, } } - /** - * Load bloom filter from Filter.db file. 
- * - * @throws IOException - */ - IFilter loadBloomFilter() throws IOException + public static IFilter loadBloomFilter(Path path, boolean oldFormat) { - try (DataInputStream stream = new DataInputStream(new BufferedInputStream(Files.newInputStream(Paths.get(descriptor.filenameFor(Component.FILTER)))))) + if (Files.exists(path)) + { + IFilter filter = null; + try (DataInputStream stream = new DataInputStream(new BufferedInputStream(Files.newInputStream(path)))) + { + filter = BloomFilter.serializer.deserialize(stream, oldFormat); + return filter; + } + catch (Throwable t) + { + JVMStabilityInspector.inspectThrowable(t); + logger.error("Failed to deserialize Bloom filter: {}", t.getMessage()); + if (filter != null) + filter.close(); + } + } + else { - return BloomFilterSerializer.deserialize(stream, descriptor.version.hasOldBfFormat()); + logger.error("Bloom filter {} not found", path); } + return null; } public static class ForWriter extends SSTableReaderBuilder @@ -252,6 +244,30 @@ public ForWriter(Descriptor descriptor, super(descriptor, metadataRef, maxDataAge, components, statsMetadata, openReason, header); } + public SSTableReaderBuilder.ForWriter dfile(FileHandle dfile) + { + this.dfile = dfile; + return this; + } + + public SSTableReaderBuilder.ForWriter ifile(FileHandle ifile) + { + this.ifile = ifile; + return this; + } + + public SSTableReaderBuilder.ForWriter bf(IFilter bf) + { + this.bf = bf; + return this; + } + + public SSTableReaderBuilder.ForWriter summary(IndexSummary summary) + { + this.summary = summary; + return this; + } + @Override public SSTableReader build() { @@ -276,6 +292,7 @@ public ForBatch(Descriptor descriptor, @Override public SSTableReader build() { + assert dfile == null && ifile == null && summary == null && bf == null; String dataFilePath = descriptor.filenameFor(Component.DATA); long fileLength = new File(dataFilePath).length(); logger.info("Opening {} ({})", descriptor, FBUtilities.prettyPrintMemory(fileLength)); @@ -346,6 
+363,7 @@ public ForRead(Descriptor descriptor, @Override public SSTableReader build() { + assert dfile == null && ifile == null && summary == null && bf == null; String dataFilePath = descriptor.filenameFor(Component.DATA); long fileLength = new File(dataFilePath).length(); logger.info("Opening {} ({})", descriptor, FBUtilities.prettyPrintMemory(fileLength)); @@ -381,11 +399,10 @@ private void load(ValidationMetadata validation, DiskOptimizationStrategy optimizationStrategy, StatsMetadata statsMetadata) throws IOException { - if (metadata.params.bloomFilterFpChance == 1.0) + if (!BloomFilter.shouldUseBloomFilter(metadata.params.bloomFilterFpChance)) { // bf is disabled. load(false, !isOffline, optimizationStrategy, statsMetadata, components); - bf = FilterFactory.AlwaysPresent; } else if (!components.contains(Component.PRIMARY_INDEX)) // What happens if filter component and primary index is missing? { @@ -397,15 +414,21 @@ else if (!components.contains(Component.FILTER) || validation == null) { // bf is enabled, but filter component is missing. load(!isOffline, !isOffline, optimizationStrategy, statsMetadata, components); - if (isOffline) - bf = FilterFactory.AlwaysPresent; + } + else if (!BloomFilter.isFPChanceDiffNeglectable(metadata.params.bloomFilterFpChance, validationMetadata.bloomFilterFPChance) && BloomFilter.recreateOnFPChanceChange) + { + // bf is enabled, but fp chance changed + load(!isOffline, !isOffline, optimizationStrategy, statsMetadata, components); } else { // bf is enabled and fp chance matches the currently configured value. 
- load(false, !isOffline, optimizationStrategy, statsMetadata, components); - bf = loadBloomFilter(); + bf = loadBloomFilter(Paths.get(descriptor.filenameFor(Component.FILTER)), descriptor.version.hasOldBfFormat()); + load(bf == null, !isOffline, optimizationStrategy, statsMetadata, components); } + // if the filter was neither loaded nor created, or we encountered some problems, we fallback to pass-through filter + if (bf == null) + bf = FilterFactory.AlwaysPresent; } /** @@ -448,7 +471,11 @@ void load(boolean recreateBloomFilter, if (saveSummaryIfCreated) SSTableReader.saveSummary(descriptor, first, last, summary); if (recreateBloomFilter) + { SSTableReader.saveBloomFilter(descriptor, bf); + ValidationMetadata updatedValidationMetadata = new ValidationMetadata(validationMetadata.partitioner, metadata.params.bloomFilterFpChance); + descriptor.getMetadataSerializer().updateSSTableMetadata(descriptor, ImmutableMap.of(MetadataType.VALIDATION, updatedValidationMetadata)); + } } } catch (Throwable t) @@ -463,6 +490,12 @@ void load(boolean recreateBloomFilter, dfile.close(); } + if (bf != null) + { + bf.close(); + bf = null; + } + if (summary != null) { summary.close(); diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java index eeb9153826c5..806a05951118 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java +++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java @@ -539,7 +539,7 @@ void flushBf() DataOutputStreamPlus stream = new BufferedDataOutputStreamPlus(fos)) { // bloom filter - BloomFilterSerializer.serialize((BloomFilter) bf, stream); + BloomFilter.serializer.serialize((BloomFilter) bf, stream); stream.flush(); SyncUtil.sync(fos); } diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java index 
fc1ce422a40a..8c3adb811389 100644 --- a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java +++ b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java @@ -100,4 +100,11 @@ public interface IMetadataSerializer * Replace the sstable metadata file ({@code -Statistics.db}) with the given components. */ void rewriteSSTableMetadata(Descriptor descriptor, Map currentComponents) throws IOException; + + /** + * Updates the sstable metadata components (works similarly to {@link #rewriteSSTableMetadata(Descriptor, Map)} but + * only updates the provided components rather than replacing the whole metadata map). + */ + void updateSSTableMetadata(Descriptor descriptor, Map updatedComponents) throws IOException; + } diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java index 042103e26798..ee9a24aca9f4 100644 --- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java +++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java @@ -279,4 +279,12 @@ public void rewriteSSTableMetadata(Descriptor descriptor, Map updatedComponents) throws IOException + { + Map currentComponents = deserialize(descriptor, EnumSet.allOf(MetadataType.class)); + currentComponents.putAll(updatedComponents); + rewriteSSTableMetadata(descriptor, currentComponents); + } + } diff --git a/src/java/org/apache/cassandra/utils/BloomFilter.java b/src/java/org/apache/cassandra/utils/BloomFilter.java index bf48d4341894..a59a7fb14a9a 100644 --- a/src/java/org/apache/cassandra/utils/BloomFilter.java +++ b/src/java/org/apache/cassandra/utils/BloomFilter.java @@ -21,12 +21,45 @@ import io.netty.util.concurrent.FastThreadLocal; import net.nicoulaj.compilecommand.annotations.Inline; +import org.apache.cassandra.config.Config; import org.apache.cassandra.utils.concurrent.Ref; import 
org.apache.cassandra.utils.concurrent.WrappedSharedCloseable; import org.apache.cassandra.utils.obs.IBitSet; +import org.apache.cassandra.utils.obs.MemoryLimiter; public class BloomFilter extends WrappedSharedCloseable implements IFilter { + /** + * The maximum memory to be used by all loaded bloom filters. If the limit is exceeded, pass-through filter will be + * used until some filters get unloaded. + */ + public final static String MAX_MEMORY_MB_PROP = Config.PROPERTY_PREFIX + "bf.max_memory_mb"; + + /** + * A minimal relative change of the false-positive chance so that it is considered as a reason to recreate the bloom + * filter. If the change is smaller than this, it will be ignored. + */ + public final static String FP_CHANCE_TOLERANCE_PROP = Config.PROPERTY_PREFIX + "bf.fp_chance_tolerance"; + + /** + * If the false-positive chance has changed since the last compaction (for example by alter table statement), and + * the node is restarted - the bloom filter can get rebuilt if this property is set to true. + */ + public final static String RECREATE_ON_FP_CHANCE_CHANGE = Config.PROPERTY_PREFIX + "bf.recreate_on_fp_chance_change"; + + private static final long maxMemory = Long.getLong(MAX_MEMORY_MB_PROP, 0) << 20; + + @VisibleForTesting + public static double fpChanceTolerance = Double.parseDouble(System.getProperty(FP_CHANCE_TOLERANCE_PROP, "0.000001")); + + @VisibleForTesting + public static boolean recreateOnFPChanceChange = Boolean.getBoolean(RECREATE_ON_FP_CHANCE_CHANGE); + + public static final MemoryLimiter memoryLimiter = new MemoryLimiter(maxMemory != 0 ? 
maxMemory : Long.MAX_VALUE, + "Allocating %s for Bloom filter would reach max of %s (current %s)"); + + public final static BloomFilterSerializer serializer = new BloomFilterSerializer(memoryLimiter); + private final static FastThreadLocal reusableIndexes = new FastThreadLocal() { protected long[] initialValue() @@ -54,7 +87,7 @@ private BloomFilter(BloomFilter copy) public long serializedSize() { - return BloomFilterSerializer.serializedSize(this); + return serializer.serializedSize(this); } // Murmur is faster than an SHA-based approach and provides as-good collision @@ -149,4 +182,15 @@ public void addTo(Ref.IdentityCollection identities) super.addTo(identities); bitset.addTo(identities); } + + public static boolean shouldUseBloomFilter(double fpChance) + { + return Math.abs(1 - fpChance) > BloomFilter.fpChanceTolerance; + } + + public static boolean isFPChanceDiffNeglectable(double fpChance1, double fpChance2) + { + return Math.abs(fpChance1 - fpChance2) <= fpChanceTolerance; + } + } diff --git a/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java b/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java index d3c08b53cbed..c677bf26d84b 100644 --- a/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java +++ b/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java @@ -20,40 +20,61 @@ import java.io.DataInputStream; import java.io.IOException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.utils.obs.IBitSet; +import org.apache.cassandra.utils.obs.MemoryLimiter; import org.apache.cassandra.utils.obs.OffHeapBitSet; +import static org.apache.cassandra.utils.FilterFactory.AlwaysPresent; + public final class BloomFilterSerializer { - private BloomFilterSerializer() + private final static Logger logger = LoggerFactory.getLogger(BloomFilterSerializer.class); + + private final MemoryLimiter 
memoryLimiter; + + public BloomFilterSerializer(MemoryLimiter memoryLimiter) { + this.memoryLimiter = memoryLimiter; } - public static void serialize(BloomFilter bf, DataOutputPlus out) throws IOException + public void serialize(BloomFilter bf, DataOutputPlus out) throws IOException { out.writeInt(bf.hashCount); bf.bitset.serialize(out); } @SuppressWarnings("resource") - public static BloomFilter deserialize(DataInputStream in, boolean oldBfFormat) throws IOException + public IFilter deserialize(DataInputStream in, boolean oldBfFormat) throws IOException { int hashes = in.readInt(); - IBitSet bs = OffHeapBitSet.deserialize(in, oldBfFormat); - + IBitSet bs; + try + { + bs = OffHeapBitSet.deserialize(in, oldBfFormat, memoryLimiter); + } + catch (MemoryLimiter.ReachedMemoryLimitException | OutOfMemoryError e) + { + logger.error("Failed to create Bloom filter during deserialization: ({}) - " + + "continuing but this will have severe performance implications, consider increasing FP chance or" + + "lowering number of sstables through compaction", e.getMessage()); + return AlwaysPresent; + } return new BloomFilter(hashes, bs); } /** * Calculates a serialized size of the given Bloom Filter - * @param bf Bloom filter to calculate serialized size - * @see org.apache.cassandra.io.ISerializer#serialize(Object, org.apache.cassandra.io.util.DataOutputPlus) * + * @param bf Bloom filter to calculate serialized size * @return serialized size of the given bloom filter + * @see org.apache.cassandra.io.ISerializer#serialize(Object, org.apache.cassandra.io.util.DataOutputPlus) */ - public static long serializedSize(BloomFilter bf) + public long serializedSize(BloomFilter bf) { int size = TypeSizes.sizeof(bf.hashCount); // hash count size += bf.bitset.serializedSize(); diff --git a/src/java/org/apache/cassandra/utils/FilterFactory.java b/src/java/org/apache/cassandra/utils/FilterFactory.java index 4cf0cbf74d19..63a54e11fe3c 100644 --- 
a/src/java/org/apache/cassandra/utils/FilterFactory.java +++ b/src/java/org/apache/cassandra/utils/FilterFactory.java @@ -21,6 +21,7 @@ import org.slf4j.LoggerFactory; import org.apache.cassandra.utils.obs.IBitSet; +import org.apache.cassandra.utils.obs.MemoryLimiter; import org.apache.cassandra.utils.obs.OffHeapBitSet; public class FilterFactory @@ -35,6 +36,11 @@ public class FilterFactory * probability for the given number of elements. */ public static IFilter getFilter(long numElements, int targetBucketsPerElem) + { + return getFilter(numElements, targetBucketsPerElem, BloomFilter.memoryLimiter); + } + + public static IFilter getFilter(long numElements, int targetBucketsPerElem, MemoryLimiter memoryLimiter) { int maxBucketsPerElement = Math.max(1, BloomCalculations.maxBucketsPerElement(numElements)); int bucketsPerElement = Math.min(targetBucketsPerElem, maxBucketsPerElement); @@ -43,31 +49,46 @@ public static IFilter getFilter(long numElements, int targetBucketsPerElem) logger.warn("Cannot provide an optimal BloomFilter for {} elements ({}/{} buckets per element).", numElements, bucketsPerElement, targetBucketsPerElem); } BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement); - return createFilter(spec.K, numElements, spec.bucketsPerElement); + return createFilter(spec.K, numElements, spec.bucketsPerElement, memoryLimiter); } /** * @return The smallest BloomFilter that can provide the given false - * positive probability rate for the given number of elements. - * - * Asserts that the given probability can be satisfied using this - * filter. + * positive probability rate for the given number of elements. + *

+ * Asserts that the given probability can be satisfied using this + * filter. */ public static IFilter getFilter(long numElements, double maxFalsePosProbability) + { + return getFilter(numElements, maxFalsePosProbability, BloomFilter.memoryLimiter); + } + + public static IFilter getFilter(long numElements, double maxFalsePosProbability, MemoryLimiter memoryLimiter) { assert maxFalsePosProbability <= 1.0 : "Invalid probability"; if (maxFalsePosProbability == 1.0) - return new AlwaysPresentFilter(); + return AlwaysPresent; int bucketsPerElement = BloomCalculations.maxBucketsPerElement(numElements); BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement, maxFalsePosProbability); - return createFilter(spec.K, numElements, spec.bucketsPerElement); + return createFilter(spec.K, numElements, spec.bucketsPerElement, memoryLimiter); } @SuppressWarnings("resource") - private static IFilter createFilter(int hash, long numElements, int bucketsPer) + private static IFilter createFilter(int hash, long numElements, int bucketsPer, MemoryLimiter memoryLimiter) { - long numBits = (numElements * bucketsPer) + BITSET_EXCESS; - IBitSet bitset = new OffHeapBitSet(numBits); - return new BloomFilter(hash, bitset); + try + { + long numBits = (numElements * bucketsPer) + BITSET_EXCESS; + IBitSet bitset = new OffHeapBitSet(numBits, memoryLimiter); + return new BloomFilter(hash, bitset); + } + catch (MemoryLimiter.ReachedMemoryLimitException | OutOfMemoryError e) + { + logger.error("Failed to create new Bloom filter with {} elements: ({}) - " + + "continuing but this will have severe performance implications, consider increasing FP chance or" + + "lowering number of sstables through compaction", numElements, e.getMessage()); + return AlwaysPresent; + } } } diff --git a/src/java/org/apache/cassandra/utils/obs/MemoryLimiter.java b/src/java/org/apache/cassandra/utils/obs/MemoryLimiter.java new file mode 100644 index 000000000000..bb2eb28a341d --- 
/dev/null +++ b/src/java/org/apache/cassandra/utils/obs/MemoryLimiter.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.obs; + +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.cassandra.utils.FBUtilities; + +public class MemoryLimiter +{ + public final long maxMemory; + private final AtomicLong currentMemory; + private final String exceptionFormat; + + public MemoryLimiter(long maxMemory, String exceptionFormat) + { + this.maxMemory = maxMemory; + this.currentMemory = new AtomicLong(); + this.exceptionFormat = exceptionFormat; + } + + public void increment(long bytesCount) throws ReachedMemoryLimitException + { + assert bytesCount >= 0; + long bytesCountAfterAllocation = this.currentMemory.addAndGet(bytesCount); + if (bytesCountAfterAllocation >= maxMemory) + { + this.currentMemory.addAndGet(-bytesCount); + + throw new ReachedMemoryLimitException(String.format(exceptionFormat, + FBUtilities.prettyPrintMemory(bytesCount), + FBUtilities.prettyPrintMemory(maxMemory), + FBUtilities.prettyPrintMemory(bytesCountAfterAllocation - bytesCount))); + } + } + + public void decrement(long bytesCount) + { + assert bytesCount >= 0; + long result = 
this.currentMemory.addAndGet(-bytesCount); + assert result >= 0; + } + + public long memoryAllocated() + { + return currentMemory.get(); + } + + public static class ReachedMemoryLimitException extends Exception + { + public ReachedMemoryLimitException(String message) + { + super(message); + } + } +} diff --git a/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java b/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java index 486ec388d820..fa9dcd07ef0f 100644 --- a/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java +++ b/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java @@ -17,9 +17,7 @@ */ package org.apache.cassandra.utils.obs; -import java.io.DataInput; import java.io.DataInputStream; -import java.io.DataOutput; import java.io.IOException; import com.google.common.annotations.VisibleForTesting; @@ -37,18 +35,25 @@ */ public class OffHeapBitSet implements IBitSet { + /** + * The maximum memory that can be used by bloom filters, in megabytes, overall. + * The default is unlimited, a limit should only be set as a last resort measure. 
+ */ + @VisibleForTesting private final Memory bytes; + private final MemoryLimiter memoryLimiter; - public OffHeapBitSet(long numBits) + public OffHeapBitSet(long numBits, MemoryLimiter memoryLimiter) throws MemoryLimiter.ReachedMemoryLimitException { - /** returns the number of 64 bit words it would take to hold numBits */ + this.memoryLimiter = memoryLimiter; + // returns the number of 64 bit words it would take to hold numBits long wordCount = (((numBits - 1) >>> 6) + 1); if (wordCount > Integer.MAX_VALUE) throw new UnsupportedOperationException("Bloom filter size is > 16GB, reduce the bloom_filter_fp_chance"); try { long byteCount = wordCount * 8L; - bytes = Memory.allocate(byteCount); + bytes = allocate(byteCount, memoryLimiter); } catch (OutOfMemoryError e) { @@ -58,11 +63,33 @@ public OffHeapBitSet(long numBits) clear(); } - private OffHeapBitSet(Memory bytes) + private OffHeapBitSet(Memory bytes, MemoryLimiter memoryLimiter) { + this.memoryLimiter = memoryLimiter; this.bytes = bytes; } + private static Memory allocate(long byteCount, MemoryLimiter memoryLimiter) throws MemoryLimiter.ReachedMemoryLimitException + { + memoryLimiter.increment(byteCount); + try + { + return Memory.allocate(byteCount); + } + catch (OutOfMemoryError e) + { + memoryLimiter.decrement(byteCount); + throw e; + } + } + + private static void release(Memory memory, MemoryLimiter memoryLimiter) + { + long size = memory.size(); + memory.free(); + memoryLimiter.decrement(size); + } + public long capacity() { return bytes.size() * 8; @@ -145,10 +172,10 @@ public long serializedSize() } @SuppressWarnings("resource") - public static OffHeapBitSet deserialize(DataInputStream in, boolean oldBfFormat) throws IOException + public static OffHeapBitSet deserialize(DataInputStream in, boolean oldBfFormat, MemoryLimiter memoryLimiter) throws IOException, MemoryLimiter.ReachedMemoryLimitException { long byteCount = in.readInt() * 8L; - Memory memory = Memory.allocate(byteCount); + Memory memory = 
allocate(byteCount, memoryLimiter); if (oldBfFormat) { for (long i = 0; i < byteCount; ) @@ -168,12 +195,12 @@ public static OffHeapBitSet deserialize(DataInputStream in, boolean oldBfFormat) { FBUtilities.copy(in, new MemoryOutputStream(memory), byteCount); } - return new OffHeapBitSet(memory); + return new OffHeapBitSet(memory, memoryLimiter); } public void close() { - bytes.free(); + release(bytes, memoryLimiter); } @Override @@ -192,7 +219,7 @@ public int hashCode() { // Similar to open bitset. long h = 0; - for (long i = bytes.size(); --i >= 0;) + for (long i = bytes.size(); --i >= 0; ) { h ^= bytes.getByte(i); h = (h << 1) | (h >>> 63); // rotate left @@ -202,6 +229,6 @@ public int hashCode() public String toString() { - return "[OffHeapBitSet]"; + return String.format("[OffHeapBitSet %s]", FBUtilities.prettyPrintMemory(serializedSize())); } } diff --git a/test/microbench/org/apache/cassandra/test/microbench/BloomFilterSerializerBench.java b/test/microbench/org/apache/cassandra/test/microbench/BloomFilterSerializerBench.java index 922281145f52..4cabf0633eba 100644 --- a/test/microbench/org/apache/cassandra/test/microbench/BloomFilterSerializerBench.java +++ b/test/microbench/org/apache/cassandra/test/microbench/BloomFilterSerializerBench.java @@ -81,12 +81,12 @@ public void serializationTest() throws IOException if (oldBfFormat) SerializationsTest.serializeOldBfFormat(filter, out); else - BloomFilterSerializer.serialize(filter, out); + BloomFilter.serializer.serialize(filter, out); out.close(); filter.close(); DataInputStream in = new DataInputStream(new FileInputStream(file)); - BloomFilter filter2 = BloomFilterSerializer.deserialize(in, oldBfFormat); + IFilter filter2 = BloomFilter.serializer.deserialize(in, oldBfFormat); FileUtils.closeQuietly(in); filter2.close(); } diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java index 69c9f1ad8779..cd4e4f442f77 100644 --- a/test/unit/org/apache/cassandra/Util.java +++ 
b/test/unit/org/apache/cassandra/Util.java @@ -19,12 +19,12 @@ * */ -import java.io.Closeable; -import java.io.EOFException; -import java.io.File; -import java.io.IOError; +import java.io.*; +import java.lang.reflect.Field; import java.net.UnknownHostException; import java.nio.ByteBuffer; +import java.nio.file.*; +import java.nio.file.attribute.FileTime; import java.util.*; import java.util.concurrent.Callable; import java.util.concurrent.Future; @@ -95,6 +95,8 @@ public class Util private static List hostIdPool = new ArrayList<>(); + public final static TimeUnit supportedMTimeGranularity = getSupportedMTimeGranularity(); + public static IPartitioner testPartitioner() { return DatabaseDescriptor.getPartitioner(); @@ -818,4 +820,20 @@ public static void setUpgradeFromVersion(String version) VersionedValue.unsafeMakeVersionedValue(version, v + 1)); Gossiper.instance.expireUpgradeFromVersion(); } + + private static TimeUnit getSupportedMTimeGranularity() { + try + { + Path p = Files.createTempFile(Util.class.getSimpleName(), "dummy-file"); + FileTime ft = Files.getLastModifiedTime(p); + Files.deleteIfExists(p); + Field f = FileTime.class.getDeclaredField("unit"); + f.setAccessible(true); + return (TimeUnit) f.get(ft); + } + catch (IOException | NoSuchFieldException | IllegalAccessException e) + { + throw new AssertionError("Failed to read supported file modification time granularity"); + } + } } diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java index 6ba942a7be1d..80828e1b64e3 100644 --- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java @@ -22,11 +22,16 @@ import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.attribute.FileTime; +import java.time.Instant; import java.util.*; import java.util.concurrent.*; 
import com.google.common.collect.Sets; +import com.google.common.util.concurrent.Uninterruptibles; import org.junit.After; +import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; @@ -48,19 +53,28 @@ import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.LocalPartitioner.LocalToken; +import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.index.Index; import org.apache.cassandra.io.FSReadError; import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.sstable.metadata.MetadataComponent; +import org.apache.cassandra.io.sstable.metadata.MetadataType; +import org.apache.cassandra.io.sstable.metadata.ValidationMetadata; import org.apache.cassandra.io.util.FileDataInput; import org.apache.cassandra.io.util.MmappedRegions; import org.apache.cassandra.schema.CachingParams; import org.apache.cassandra.schema.CompressionParams; import org.apache.cassandra.schema.KeyspaceParams; +import org.apache.cassandra.schema.TableMetadata; +import org.apache.cassandra.schema.TableMetadataRef; import org.apache.cassandra.service.CacheService; +import org.apache.cassandra.utils.BloomCalculations; +import org.apache.cassandra.utils.BloomFilter; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FilterFactory; +import org.apache.cassandra.utils.IFilter; import static org.apache.cassandra.cql3.QueryProcessor.executeInternal; import static org.junit.Assert.assertEquals; @@ -77,6 +91,7 @@ public class SSTableReaderTest public static final String CF_COMPRESSED = "Compressed"; public static final String CF_INDEXED = "Indexed1"; public static final String CF_STANDARDLOWINDEXINTERVAL = "StandardLowIndexInterval"; + public static final String CF_STANDARDNOBLOOMFILTER = "StandardNoBloomFilter"; private 
IPartitioner partitioner; @@ -100,13 +115,16 @@ public static void defineSchema() throws Exception SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLOWINDEXINTERVAL) .minIndexInterval(8) .maxIndexInterval(256) - .caching(CachingParams.CACHE_NOTHING)); + .caching(CachingParams.CACHE_NOTHING), + SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDNOBLOOMFILTER) + .bloomFilterFpChance(1)); } @After public void Cleanup() { Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD).truncateBlocking(); Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD2).truncateBlocking(); + BloomFilter.recreateOnFPChanceChange = false; } @Test @@ -834,13 +852,15 @@ public void testMoveAndOpenSSTable() throws IOException } } - - private SSTableReader getNewSSTable(ColumnFamilyStore cfs) { + return getNewSSTable(cfs, 100, 2); + } + private SSTableReader getNewSSTable(ColumnFamilyStore cfs, int numKeys, int step) + { Set before = cfs.getLiveSSTables(); - for (int j = 0; j < 100; j += 2) + for (int j = 0; j < numKeys; j += step) { new RowUpdateBuilder(cfs.metadata(), j, String.valueOf(j)) .clustering("0") @@ -915,6 +935,126 @@ public void testVerifyCompressionInfoExistencePasses() SSTableReader.verifyCompressionInfoExistenceIfApplicable(desc, components); } + @Test + public void testBloomFilterIsCreatedOnLoad() throws IOException + { + BloomFilter.recreateOnFPChanceChange = true; + + final int numKeys = 100; + final Keyspace keyspace = Keyspace.open(KEYSPACE1); + final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARDNOBLOOMFILTER); + + SSTableReader sstable = getNewSSTable(cfs, numKeys, 1); + Assert.assertTrue(sstable.getBloomFilterSerializedSize() == 0); + Assert.assertSame(FilterFactory.AlwaysPresent, sstable.getBloomFilter()); + + // should do nothing + checkSSTableOpenedWithGivenFPChance(sstable, 1, false, numKeys, false); + + // should create BF because the FP has changed + checkSSTableOpenedWithGivenFPChance(sstable, 
BloomCalculations.minSupportedBloomFilterFpChance(), true, numKeys, true); + checkSSTableOpenedWithGivenFPChance(sstable, 0.05, true, numKeys, true); + checkSSTableOpenedWithGivenFPChance(sstable, 0.1, true, numKeys, true); + + // should deserialize the existing BF + checkSSTableOpenedWithGivenFPChance(sstable, 0.1, true, numKeys, false); + // should create BF because the FP has changed + checkSSTableOpenedWithGivenFPChance(sstable, 1 - BloomFilter.fpChanceTolerance, true, numKeys, true); + // should install empty filter without changing file or metadata + checkSSTableOpenedWithGivenFPChance(sstable, 1, false, numKeys, false); + + // corrupted bf file should fail to deserialize and we should fall back to recreating it + Files.write(Paths.get(sstable.descriptor.filenameFor(Component.FILTER)), new byte[] { 0, 0, 0, 0}); + checkSSTableOpenedWithGivenFPChance(sstable, 1 - BloomFilter.fpChanceTolerance, true, numKeys, true); + + // missing primary index file should make BF fail to load and we should install the empty one + new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)).delete(); + checkSSTableOpenedWithGivenFPChance(sstable, 0.05, false, numKeys, false); + } + + private void checkSSTableOpenedWithGivenFPChance(SSTableReader sstable, double fpChance, boolean bfShouldExist, int numKeys, boolean expectRecreated) throws IOException + { + Descriptor desc = sstable.descriptor; + TableMetadata metadata = sstable.metadata.get().unbuild().bloomFilterFpChance(fpChance).build(); + ValidationMetadata prevValidationMetadata = getValidationMetadata(desc); + Assert.assertNotNull(prevValidationMetadata); + File bfFile = new File(desc.filenameFor(Component.FILTER)); + + SSTableReader target = null; + try + { + FileTime bf0Time = bfFile.exists() ? 
Files.getLastModifiedTime(bfFile.toPath()) : FileTime.from(Instant.MIN); + + // make sure we wait enough - some JDK implementations use seconds granularity and we need to wait a bit to actually see the change + Uninterruptibles.sleepUninterruptibly(1, Util.supportedMTimeGranularity); + + target = SSTableReader.open(desc, + SSTableReader.discoverComponentsFor(desc), + TableMetadataRef.forOfflineTools(metadata), + false, + false); + IFilter bloomFilter = target.getBloomFilter(); + ValidationMetadata validationMetadata = getValidationMetadata(desc); + Assert.assertNotNull(validationMetadata); + FileTime bf1Time = bfFile.exists() ? Files.getLastModifiedTime(bfFile.toPath()) : FileTime.from(Instant.MIN); + + if (expectRecreated) + { + Assert.assertTrue(bf0Time.compareTo(bf1Time) < 0); + } + else + { + assertEquals(bf0Time, bf1Time); + } + + if (bfShouldExist) + { + Assert.assertNotEquals(FilterFactory.AlwaysPresent, bloomFilter); + Assert.assertTrue(bloomFilter.serializedSize() > 0); + Assert.assertEquals(fpChance, validationMetadata.bloomFilterFPChance, BloomFilter.fpChanceTolerance); + Assert.assertTrue(bfFile.exists()); + Assert.assertEquals(bloomFilter.serializedSize(), bfFile.length()); + } + else + { + Assert.assertEquals(FilterFactory.AlwaysPresent, sstable.getBloomFilter()); + Assert.assertTrue(sstable.getBloomFilterSerializedSize() == 0); + Assert.assertEquals(prevValidationMetadata.bloomFilterFPChance, validationMetadata.bloomFilterFPChance, BloomFilter.fpChanceTolerance); + Assert.assertEquals(bfFile.exists(), bfFile.exists()); + } + + // verify all keys are present according to the BF + Token token = new Murmur3Partitioner.LongToken(0L); + for (int i = 0; i < numKeys; i++) + { + DecoratedKey key = new BufferDecoratedKey(token, ByteBufferUtil.bytes(String.valueOf(i))); + Assert.assertTrue("Expected key to be in BF: " + i, bloomFilter.isPresent(key)); + } + } + finally + { + if (target != null) + target.selfRef().release(); + } + } + + private static 
ValidationMetadata getValidationMetadata(Descriptor descriptor) + { + EnumSet types = EnumSet.of(MetadataType.VALIDATION); + + Map sstableMetadata; + try + { + sstableMetadata = descriptor.getMetadataSerializer().deserialize(descriptor, types); + } + catch (Throwable t) + { + throw new CorruptSSTableException(t, descriptor.filenameFor(Component.STATS)); + } + + return (ValidationMetadata) sstableMetadata.get(MetadataType.VALIDATION); + } + private Descriptor setUpForTestVerfiyCompressionInfoExistence() { Keyspace keyspace = Keyspace.open(KEYSPACE1); diff --git a/test/unit/org/apache/cassandra/utils/BloomFilterTest.java b/test/unit/org/apache/cassandra/utils/BloomFilterTest.java index 1c3afff2efab..f612a57c8d63 100644 --- a/test/unit/org/apache/cassandra/utils/BloomFilterTest.java +++ b/test/unit/org/apache/cassandra/utils/BloomFilterTest.java @@ -1,31 +1,42 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ package org.apache.cassandra.utils; -import java.io.*; +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; import java.nio.ByteBuffer; +import java.text.NumberFormat; import java.util.HashSet; import java.util.Iterator; +import java.util.Locale; import java.util.Random; import java.util.Set; -import org.junit.*; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Murmur3Partitioner; @@ -36,14 +47,17 @@ import org.apache.cassandra.utils.IFilter.FilterKey; import org.apache.cassandra.utils.KeyGenerator.RandomStringGenerator; import org.apache.cassandra.utils.obs.IBitSet; +import org.apache.cassandra.utils.obs.MemoryLimiter; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; public class BloomFilterTest { public IFilter bfInvHashes; - - + public MemoryLimiter memoryLimiter; public static IFilter testSerialize(IFilter f, boolean oldBfFormat) throws IOException { @@ -55,11 +69,11 @@ public static IFilter testSerialize(IFilter f, boolean oldBfFormat) throws IOExc } else 
{ - BloomFilterSerializer.serialize((BloomFilter) f, out); + BloomFilter.serializer.serialize((BloomFilter) f, out); } ByteArrayInputStream in = new ByteArrayInputStream(out.getData(), 0, out.getLength()); - IFilter f2 = BloomFilterSerializer.deserialize(new DataInputStream(in), oldBfFormat); + IFilter f2 = BloomFilter.serializer.deserialize(new DataInputStream(in), oldBfFormat); assert f2.isPresent(FilterTestHelper.bytes("a")); assert !f2.isPresent(FilterTestHelper.bytes("b")); @@ -76,6 +90,10 @@ static void compare(IBitSet bs, IBitSet newbs) @Before public void setup() { + // Set a high limit so that normal tests won't reach it, but we don't want Long.MAX_VALUE because + // we want to test what happens when we reach it + System.setProperty(BloomFilter.MAX_MEMORY_MB_PROP, Long.toString(128 << 10)); + memoryLimiter = new MemoryLimiter(128L << 30, "Allocating %s for bloom filter would reach max of %s (current %s)"); bfInvHashes = FilterFactory.getFilter(10000L, FilterTestHelper.MAX_FAILURE_RATE); } @@ -83,6 +101,7 @@ public void setup() public void destroy() { bfInvHashes.close(); + assertEquals(0, memoryLimiter.memoryAllocated()); } @Test(expected = UnsupportedOperationException.class) @@ -164,13 +183,13 @@ private static void testManyRandom(Iterator keys) collisions += (MAX_HASH_COUNT - hashes.size()); bf.close(); } - Assert.assertTrue("collisions=" + collisions, collisions <= 100); + assertTrue("collisions=" + collisions, collisions <= 100); } @Test(expected = UnsupportedOperationException.class) public void testOffHeapException() { - long numKeys = ((long)Integer.MAX_VALUE) * 64L + 1L; // approx 128 Billion + long numKeys = ((long) Integer.MAX_VALUE) * 64L + 1L; // approx 128 Billion FilterFactory.getFilter(numKeys, 0.01d).close(); } @@ -202,22 +221,21 @@ public void compareCachedKey() } @Test - @Ignore - public void testHugeBFSerialization() throws IOException + public void testHugeBFSerialization() throws Exception { - ByteBuffer test = ByteBuffer.wrap(new 
byte[] {0, 1}); + ByteBuffer test = ByteBuffer.wrap(new byte[]{ 0, 1 }); File file = FileUtils.createDeletableTempFile("bloomFilterTest-", ".dat"); BloomFilter filter = (BloomFilter) FilterFactory.getFilter(((long) Integer.MAX_VALUE / 8) + 1, 0.01d); filter.add(FilterTestHelper.wrap(test)); DataOutputStreamPlus out = new BufferedDataOutputStreamPlus(new FileOutputStream(file)); - BloomFilterSerializer.serialize(filter, out); + BloomFilter.serializer.serialize(filter, out); out.close(); filter.close(); DataInputStream in = new DataInputStream(new FileInputStream(file)); - BloomFilter filter2 = BloomFilterSerializer.deserialize(in, false); - Assert.assertTrue(filter2.isPresent(FilterTestHelper.wrap(test))); + IFilter filter2 = BloomFilter.serializer.deserialize(in, false); + assertTrue(filter2.isPresent(FilterTestHelper.wrap(test))); FileUtils.closeQuietly(in); filter2.close(); } @@ -243,4 +261,96 @@ public void testMurmur3FilterHash() Assert.assertArrayEquals(expected, actual); } } -} + + @Test + public void testMaxMemoryExceeded() + { + long allocSize = 2L * (1 << 20); + double fpChance = 0.01; + long size; + + try (IFilter filter = FilterFactory.getFilter(allocSize, fpChance, memoryLimiter)) + { + size = filter.offHeapSize(); + } + assertNotEquals(0, size); + + memoryLimiter = new MemoryLimiter(3 * size / 2, "Allocating %s for bloom filter would reach max of %s (current %s)"); + + try (IFilter filter = FilterFactory.getFilter(allocSize, fpChance, memoryLimiter)) + { + assertNotNull(filter); + assertTrue(filter instanceof BloomFilter); + + long memBefore = memoryLimiter.memoryAllocated(); + + try (IFilter blankFilter = FilterFactory.getFilter(allocSize, fpChance, memoryLimiter)) + { + assertNotNull(blankFilter); + assertTrue(blankFilter instanceof AlwaysPresentFilter); + + assertEquals(memBefore, memoryLimiter.memoryAllocated()); + } + } + } + + @Test + public void testMaxMemoryExceededOnDeserialize() throws IOException + { + long allocSize = 2L * (1 << 20); + 
double fpChance = 0.01; + long size; + + DataOutputBuffer out = new DataOutputBuffer(); + try (IFilter filter = FilterFactory.getFilter(allocSize, fpChance, memoryLimiter)) + { + size = filter.offHeapSize(); + BloomFilter.serializer.serialize((BloomFilter) filter, out); + } + assertNotEquals(0, size); + + memoryLimiter = new MemoryLimiter(3 * size / 2, "Allocating %s for bloom filter would reach max of %s (current %s)"); + + try (IFilter filter = FilterFactory.getFilter(allocSize, fpChance, memoryLimiter)) + { + assertNotNull(filter); + assertTrue(filter instanceof BloomFilter); + + long memBefore = memoryLimiter.memoryAllocated(); + + ByteArrayInputStream in = new ByteArrayInputStream(out.getData(), 0, out.getLength()); + try (IFilter blankFilter = new BloomFilterSerializer(memoryLimiter).deserialize(new DataInputStream(in), false)) + { + assertNotNull(blankFilter); + assertTrue(blankFilter instanceof AlwaysPresentFilter); + assertEquals(memBefore, memoryLimiter.memoryAllocated()); + } + } + } + + @Test + @Ignore // this is a test that can be used to print out the sizes of BFs + public void testBloomFilterSize() + { + int[] nks = new int[]{ + 100_000, 500_000, + 1_000_000, 5_000_000, + 10_000_000, 50_000_000, + 100_000_000, 500_000_000 }; + + //double[] fps = new double[] { 0.01, 0.05, 0.1, 0.2, 0.25 }; + double[] fps = new double[]{ 0.01, 0.1 }; + + for (int nk : nks) + { + for (double fp : fps) + { + IFilter filter = FilterFactory.getFilter(nk, fp); + System.out.println(String.format("%s keys %s FP chance => %s", + NumberFormat.getNumberInstance(Locale.US).format(nk), + NumberFormat.getNumberInstance(Locale.US).format(fp), + FBUtilities.prettyPrintMemory(filter.serializedSize()))); + } + } + } +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/utils/SerializationsTest.java b/test/unit/org/apache/cassandra/utils/SerializationsTest.java index 6597f3bb562d..ba84e5f79731 100644 --- 
a/test/unit/org/apache/cassandra/utils/SerializationsTest.java +++ b/test/unit/org/apache/cassandra/utils/SerializationsTest.java @@ -66,7 +66,7 @@ private static void testBloomFilterWrite1000(boolean oldBfFormat) throws IOExcep if (oldBfFormat) serializeOldBfFormat((BloomFilter) bf, out); else - BloomFilterSerializer.serialize((BloomFilter) bf, out); + BloomFilter.serializer.serialize((BloomFilter) bf, out); } } } @@ -81,7 +81,7 @@ public void testBloomFilterRead1000() throws IOException } try (DataInputStream in = getInput("4.0", "utils.BloomFilter1000.bin"); - IFilter filter = BloomFilterSerializer.deserialize(in, false)) + IFilter filter = BloomFilter.serializer.deserialize(in, false)) { boolean present; for (int i = 0 ; i < 1000 ; i++) @@ -97,7 +97,7 @@ public void testBloomFilterRead1000() throws IOException } try (DataInputStream in = getInput("3.0", "utils.BloomFilter1000.bin"); - IFilter filter = BloomFilterSerializer.deserialize(in, true)) + IFilter filter = BloomFilter.serializer.deserialize(in, true)) { boolean present; for (int i = 0 ; i < 1000 ; i++) @@ -124,7 +124,7 @@ private static void testBloomFilterTable(String file, boolean oldBfFormat) throw Murmur3Partitioner partitioner = new Murmur3Partitioner(); try (DataInputStream in = new DataInputStream(new FileInputStream(new File(file))); - IFilter filter = BloomFilterSerializer.deserialize(in, oldBfFormat)) + IFilter filter = BloomFilter.serializer.deserialize(in, oldBfFormat)) { for (int i = 1; i <= 10; i++) { diff --git a/test/unit/org/apache/cassandra/utils/obs/OffHeapBitSetTest.java b/test/unit/org/apache/cassandra/utils/obs/OffHeapBitSetTest.java index 49b4c94dd387..87dbd192f7bf 100644 --- a/test/unit/org/apache/cassandra/utils/obs/OffHeapBitSetTest.java +++ b/test/unit/org/apache/cassandra/utils/obs/OffHeapBitSetTest.java @@ -20,15 +20,15 @@ import java.io.ByteArrayInputStream; import java.io.DataInputStream; -import java.io.IOException; import java.util.List; import java.util.Random; import 
com.google.common.collect.Lists; -import org.apache.cassandra.io.util.DataOutputBuffer; import org.junit.Assert; import org.junit.Test; +import org.apache.cassandra.io.util.DataOutputBuffer; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -36,6 +36,7 @@ public class OffHeapBitSetTest { private static final Random random = new Random(); + private static final MemoryLimiter memoryLimiter = new MemoryLimiter(1L << 30, "Allocating %s for bloom filter would reach max of %s (current %s)"); static void compare(IBitSet bs, IBitSet newbs) { @@ -44,9 +45,9 @@ static void compare(IBitSet bs, IBitSet newbs) Assert.assertEquals(bs.get(i), newbs.get(i)); } - private void testOffHeapSerialization(boolean oldBfFormat) throws IOException + private void testOffHeapSerialization(boolean oldBfFormat) throws Exception { - try (OffHeapBitSet bs = new OffHeapBitSet(100000)) + try (OffHeapBitSet bs = new OffHeapBitSet(100000, memoryLimiter)) { for (long i = 0; i < bs.capacity(); i++) if (random.nextBoolean()) @@ -59,7 +60,7 @@ private void testOffHeapSerialization(boolean oldBfFormat) throws IOException bs.serialize(out); DataInputStream in = new DataInputStream(new ByteArrayInputStream(out.getData())); - try (OffHeapBitSet newbs = OffHeapBitSet.deserialize(in, oldBfFormat)) + try (OffHeapBitSet newbs = OffHeapBitSet.deserialize(in, oldBfFormat, memoryLimiter)) { compare(bs, newbs); } @@ -67,17 +68,17 @@ private void testOffHeapSerialization(boolean oldBfFormat) throws IOException } @Test - public void testSerialization() throws IOException + public void testSerialization() throws Exception { testOffHeapSerialization(true); testOffHeapSerialization(false); } @Test - public void testBitSetGetClear() + public void testBitSetGetClear() throws Exception { int size = Integer.MAX_VALUE / 4000; - try (OffHeapBitSet bs = new OffHeapBitSet(size)) + try (OffHeapBitSet bs = new OffHeapBitSet(size, memoryLimiter)) { 
List randomBits = Lists.newArrayList(); for (int i = 0; i < 10; i++) @@ -98,16 +99,16 @@ public void testBitSetGetClear() } @Test(expected = UnsupportedOperationException.class) - public void testUnsupportedLargeSize() + public void testUnsupportedLargeSize() throws Exception { long size = 64L * Integer.MAX_VALUE + 1; // Max size 16G * 8 bits - OffHeapBitSet bs = new OffHeapBitSet(size); + OffHeapBitSet bs = new OffHeapBitSet(size, memoryLimiter); } @Test - public void testInvalidIndex() + public void testInvalidIndex() throws Exception { - OffHeapBitSet bs = new OffHeapBitSet(10); + OffHeapBitSet bs = new OffHeapBitSet(10, memoryLimiter); int invalidIdx[] = {-1, 64, 1000}; for (int i : invalidIdx) From b777dbebfbdf3006141da5825acd0a0c1b542840 Mon Sep 17 00:00:00 2001 From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com> Date: Wed, 16 Jun 2021 10:56:25 +0200 Subject: [PATCH 018/151] STAR-748: Fix Scrubber so that it can work in case of broken index (#193) Fix scrubber so that it can work with broken indexes. If the index is broken, but the data file is ok, we can continue scrubbing and the index will be rebuilt. With this patch it will work even if we cannot access the initial position in the index. Also, the corrupted file will be obsoleted before finishing the rewriter so that it will not be attempted to move starts (and thus fail at the end as move starts requires correct indexes). 
There are also some explanations added to how canonical set of sstables is constructed and why Co-authored-by: Branimir Lambov (cherry picked from commit c0be3ae5cdd89fae0afcbb277322a466ece4de00) --- .../cassandra/db/compaction/Scrubber.java | 50 +++-- .../apache/cassandra/db/lifecycle/View.java | 23 ++- .../cassandra/io/sstable/SSTableRewriter.java | 13 +- .../org/apache/cassandra/db/ScrubTest.java | 171 ++++++++++++++---- .../io/sstable/SSTableRewriterTest.java | 88 ++++++--- 5 files changed, 252 insertions(+), 93 deletions(-) diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java index afbfe3d27a61..5884f989e008 100644 --- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java +++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java @@ -167,12 +167,23 @@ public void scrub() try (SSTableRewriter writer = SSTableRewriter.construct(cfs, transaction, false, sstable.maxDataAge); Refs refs = Refs.ref(Collections.singleton(sstable))) { - nextIndexKey = indexAvailable() ? ByteBufferUtil.readWithShortLength(indexFile) : null; - if (indexAvailable()) + try + { + nextIndexKey = indexAvailable() ? 
ByteBufferUtil.readWithShortLength(indexFile) : null; + if (indexAvailable()) + { + // throw away variable so we don't have a side effect in the assert + long firstRowPositionFromIndex = rowIndexEntrySerializer.deserializePositionAndSkip(indexFile); + assert firstRowPositionFromIndex == 0 : firstRowPositionFromIndex; + } + } + catch (Throwable ex) { - // throw away variable so we don't have a side effect in the assert - long firstRowPositionFromIndex = rowIndexEntrySerializer.deserializePositionAndSkip(indexFile); - assert firstRowPositionFromIndex == 0 : firstRowPositionFromIndex; + throwIfFatal(ex); + nextIndexKey = null; + nextRowPositionFromIndex = dataFile.length(); + if (indexFile != null) + indexFile.seek(indexFile.length()); } StatsMetadata metadata = sstable.getSSTableMetadata(); @@ -199,18 +210,22 @@ public void scrub() // check for null key below } - updateIndexKey(); - - long dataStart = dataFile.getFilePointer(); - long dataStartFromIndex = -1; long dataSizeFromIndex = -1; - if (currentIndexKey != null) + + updateIndexKey(); + + if (indexAvailable()) { - dataStartFromIndex = currentRowPositionFromIndex + 2 + currentIndexKey.remaining(); - dataSizeFromIndex = nextRowPositionFromIndex - dataStartFromIndex; + if (currentIndexKey != null) + { + dataStartFromIndex = currentRowPositionFromIndex + 2 + currentIndexKey.remaining(); + dataSizeFromIndex = nextRowPositionFromIndex - dataStartFromIndex; + } } + long dataStart = dataFile.getFilePointer(); + // avoid an NPE if key is null String keyName = key == null ? "(unreadable key)" : ByteBufferUtil.bytesToHex(key.getKey()); outputHandler.debug(String.format("row %s is %s", keyName, FBUtilities.prettyPrintMemory(dataSizeFromIndex))); @@ -295,13 +310,10 @@ public void scrub() } // finish obsoletes the old sstable + transaction.obsoleteOriginals(); finished.addAll(writer.setRepairedAt(badRows > 0 ? 
ActiveRepairService.UNREPAIRED_SSTABLE : sstable.getSSTableMetadata().repairedAt).finish()); completed = true; } - catch (IOException e) - { - throw Throwables.propagate(e); - } finally { if (transaction.isOffline()) @@ -379,8 +391,8 @@ private void updateIndexKey() nextIndexKey = !indexAvailable() ? null : ByteBufferUtil.readWithShortLength(indexFile); nextRowPositionFromIndex = !indexAvailable() - ? dataFile.length() - : rowIndexEntrySerializer.deserializePositionAndSkip(indexFile); + ? dataFile.length() + : rowIndexEntrySerializer.deserializePositionAndSkip(indexFile); } catch (Throwable th) { @@ -388,6 +400,8 @@ private void updateIndexKey() outputHandler.warn("Error reading index file", th); nextIndexKey = null; nextRowPositionFromIndex = dataFile.length(); + if (indexFile != null) + indexFile.seek(indexFile.length()); } } diff --git a/src/java/org/apache/cassandra/db/lifecycle/View.java b/src/java/org/apache/cassandra/db/lifecycle/View.java index b26426de63ae..e2f09b7791d4 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/View.java +++ b/src/java/org/apache/cassandra/db/lifecycle/View.java @@ -136,13 +136,30 @@ public Iterable select(SSTableSet sstableSet) case NONCOMPACTING: return filter(sstables, (s) -> !compacting.contains(s)); case CANONICAL: + // When early open is not in play, the LIVE and CANONICAL sets are the same. + // However, when we do have early-open sstables, we will have some unfinished sources in the live set. + // For these sources we need to extract the originals, in their non-moved-start versions, from the + // compacting set. 
+ // This creates a problem when the compaction completes, as then both: + // - the source is in the compacting set + // - the result is in the live set + // This currently causes the CANONICAL set to return both source and result when early-open is disabled, + // and is otherwise worked around by opening early the last sstable in the result set (which pushes it + // in the compacting set with EARLY openReason) and the !compacting.contains(sstable) check in the + // second loop below. + // Unfortunately there does not appear to be a way to avoid this workaround. Filtering the compacting + // set through having an early-open version in live does not work because sources are fully removed from + // the live set when they are completely exhausted. + + // Add the compacting versions first because they will be the canonical versions of compaction sources. Set canonicalSSTables = new HashSet<>(); for (SSTableReader sstable : compacting) if (sstable.openReason != SSTableReader.OpenReason.EARLY) canonicalSSTables.add(sstable); - // reason for checking if compacting contains the sstable is that if compacting has an EARLY version - // of a NORMAL sstable, we still have the canonical version of that sstable in sstables. - // note that the EARLY version is equal, but not == since it is a different instance of the same sstable. + // Add anything that is not compacting, removing any compaction result where we still have the + // compaction sources. + // note that the EARLY version is equal to the original, i.e. the set itself can guarantee early-open + // versions of sstables in compacting won't be added, but we also want to remove the results. 
for (SSTableReader sstable : sstables) if (!compacting.contains(sstable) && sstable.openReason != SSTableReader.OpenReason.EARLY) canonicalSSTables.add(sstable); diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java index a3d5ae9a2bab..92548b26aea4 100644 --- a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java +++ b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java @@ -255,10 +255,13 @@ private void moveStarts(SSTableReader newReader, DecoratedKey lowerbound) continue; } - DecoratedKey newStart = latest.firstKeyBeyond(lowerbound); - assert newStart != null; - SSTableReader replacement = latest.cloneWithNewStart(newStart, runOnClose); - transaction.update(replacement, true); + if (!transaction.isObsolete(latest)) + { + DecoratedKey newStart = latest.firstKeyBeyond(lowerbound); + assert newStart != null; + SSTableReader replacement = latest.cloneWithNewStart(newStart, runOnClose); + transaction.update(replacement, true); + } } } @@ -310,6 +313,8 @@ public void switchWriter(SSTableWriter newWriter) return; } + // Open fully completed sstables early. This is also required for the final sstable in a set (where newWriter + // is null) to permit the compilation of a canonical set of sstables (see View.select). 
if (preemptiveOpenInterval != Long.MAX_VALUE) { // we leave it as a tmp file, but we open it and add it to the Tracker diff --git a/test/unit/org/apache/cassandra/db/ScrubTest.java b/test/unit/org/apache/cassandra/db/ScrubTest.java index 1b425051f859..e17d202a9835 100644 --- a/test/unit/org/apache/cassandra/db/ScrubTest.java +++ b/test/unit/org/apache/cassandra/db/ScrubTest.java @@ -28,22 +28,26 @@ import java.nio.file.Paths; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; +import java.util.SortedSet; import java.util.UUID; import java.util.concurrent.ExecutionException; import java.util.concurrent.atomic.AtomicInteger; -import org.apache.commons.lang3.StringUtils; - +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; +import org.apache.commons.lang3.ArrayUtils; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; +import net.openhft.chronicle.core.util.ThrowingBiConsumer; import org.apache.cassandra.OrderedJUnit4ClassRunner; import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.UpdateBuilder; @@ -59,7 +63,6 @@ import org.apache.cassandra.db.lifecycle.LifecycleNewTracker; import org.apache.cassandra.db.lifecycle.LifecycleTransaction; import org.apache.cassandra.db.marshal.Int32Type; -import org.apache.cassandra.db.marshal.LongType; import org.apache.cassandra.db.marshal.UUIDType; import org.apache.cassandra.db.partitions.Partition; import org.apache.cassandra.db.partitions.PartitionUpdate; @@ -199,7 +202,8 @@ public void testScrubCorruptedCounterRow() throws IOException, WriteTimeoutExcep scrubber.scrub(); fail("Expected a CorruptSSTableException to be thrown"); } - catch (IOError err) { + catch (IOError err) + { assertTrue(err.getCause() instanceof CorruptSSTableException); } @@ -224,7 +228,7 
@@ public void testScrubCorruptedCounterRow() throws IOException, WriteTimeoutExcep else { assertEquals(1, scrubResult.badRows); - assertEquals(numPartitions-1, scrubResult.goodRows); + assertEquals(numPartitions - 1, scrubResult.goodRows); } assertEquals(1, cfs.getLiveSSTables().size()); @@ -232,36 +236,98 @@ public void testScrubCorruptedCounterRow() throws IOException, WriteTimeoutExcep } @Test - public void testScrubCorruptedRowInSmallFile() throws IOException, WriteTimeoutException + public void testScrubCorruptedRowInSmallFile() throws Throwable + { + // overwrite one row with garbage + testCorruptionInSmallFile((sstable, keys) -> + overrideWithGarbage(sstable, + ByteBufferUtil.bytes(keys[0]), + ByteBufferUtil.bytes(keys[1]), + (byte) 0x7A), + false, + 4); + } + + + @Test + public void testScrubCorruptedIndex() throws Throwable + { + // overwrite a part of the index with garbage + testCorruptionInSmallFile((sstable, keys) -> + overrideWithGarbage(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX), + 5, + 6, + (byte) 0x7A), + true, + 5); + } + + @Test + public void testScrubCorruptedIndexOnOpen() throws Throwable + { + // overwrite the whole index with garbage + testCorruptionInSmallFile((sstable, keys) -> + overrideWithGarbage(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX), + 0, + 60, + (byte) 0x7A), + true, + 5); + } + + @Test + public void testScrubCorruptedRowCorruptedIndex() throws Throwable + { + // overwrite one row, and the index with garbage + testCorruptionInSmallFile((sstable, keys) -> + { + overrideWithGarbage(sstable, + ByteBufferUtil.bytes(keys[2]), + ByteBufferUtil.bytes(keys[3]), + (byte) 0x7A); + overrideWithGarbage(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX), + 5, + 6, + (byte) 0x7A); + }, + false, + 2); // corrupt after the second partition, no way to resync + } + + public void testCorruptionInSmallFile(ThrowingBiConsumer corrupt, boolean isFullyRecoverable, int expectedPartitions) throws Throwable { // cannot test 
this with compression assumeTrue(!Boolean.parseBoolean(System.getProperty("cassandra.test.compression", "false"))); CompactionManager.instance.disableAutoCompaction(); ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(COUNTER_CF); + cfs.clearUnsafe(); - fillCounterCF(cfs, 2); + String[] keys = fillCounterCF(cfs, 5); - assertOrderedAll(cfs, 2); + assertOrderedAll(cfs, 5); SSTableReader sstable = cfs.getLiveSSTables().iterator().next(); // overwrite one row with garbage - overrideWithGarbage(sstable, ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("1")); + corrupt.accept(sstable, keys); // with skipCorrupted == false, the scrub is expected to fail - try (LifecycleTransaction txn = cfs.getTracker().tryModify(Collections.singletonList(sstable), OperationType.SCRUB); - Scrubber scrubber = new Scrubber(cfs, txn, false, true)) + if (!isFullyRecoverable) { - // with skipCorrupted == true, the corrupt row will be skipped - scrubber.scrub(); - fail("Expected a CorruptSSTableException to be thrown"); - } - catch (IOError err) { - assertTrue(err.getCause() instanceof CorruptSSTableException); + try (LifecycleTransaction txn = cfs.getTracker().tryModify(Arrays.asList(sstable), OperationType.SCRUB); + Scrubber scrubber = new Scrubber(cfs, txn, false, true)) + { + // with skipCorrupted == true, the corrupt row will be skipped + scrubber.scrub(); + fail("Expected a CorruptSSTableException to be thrown"); + } + catch (IOError err) + { + } } - try (LifecycleTransaction txn = cfs.getTracker().tryModify(Collections.singletonList(sstable), OperationType.SCRUB); + try (LifecycleTransaction txn = cfs.getTracker().tryModify(ImmutableList.of(sstable), OperationType.SCRUB); Scrubber scrubber = new Scrubber(cfs, txn, true, true)) { // with skipCorrupted == true, the corrupt row will be skipped @@ -269,8 +335,8 @@ public void testScrubCorruptedRowInSmallFile() throws IOException, WriteTimeoutE } assertEquals(1, cfs.getLiveSSTables().size()); - // verify that we can read all of the rows, 
and there is now one less row - assertOrderedAll(cfs, 1); + // verify that we can read all of the rows, and there is now the expected number of rows + assertOrderedAll(cfs, expectedPartitions); } @Test @@ -346,14 +412,14 @@ public void testScrubNoIndex() throws ExecutionException, InterruptedException, } @Test - public void testScrubOutOfOrder() + public void testScrubOutOfOrder() throws IOException { // This test assumes ByteOrderPartitioner to create out-of-order SSTable IPartitioner oldPartitioner = DatabaseDescriptor.getPartitioner(); DatabaseDescriptor.setPartitionerUnsafe(new ByteOrderedPartitioner()); // Create out-of-order SSTable - File tempDir = FileUtils.createTempFile("ScrubTest.testScrubOutOfOrder", "").getParentFile(); + File tempDir = Files.createTempDirectory("ScrubTest.testScrubOutOfOrder").toFile(); // create ks/cf directory File tempDataDir = new File(tempDir, String.join(File.separator, ksName, CF)); assertTrue(tempDataDir.mkdirs()); @@ -420,6 +486,11 @@ public void testScrubOutOfOrder() } private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuffer key2) throws IOException + { + overrideWithGarbage(sstable, key1, key2, (byte) 'z'); + } + + private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuffer key2, byte junk) throws IOException { boolean compression = Boolean.parseBoolean(System.getProperty("cassandra.test.compression", "false")); long startPosition, endPosition; @@ -429,9 +500,9 @@ private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuf CompressionMetadata compData = CompressionMetadata.create(sstable.getFilename()); CompressionMetadata.Chunk chunk1 = compData.chunkFor( - sstable.getPosition(PartitionPosition.ForKey.get(key1, sstable.getPartitioner()), SSTableReader.Operator.EQ).position); + sstable.getPosition(PartitionPosition.ForKey.get(key1, sstable.getPartitioner()), SSTableReader.Operator.EQ).position); CompressionMetadata.Chunk chunk2 = compData.chunkFor( - 
sstable.getPosition(PartitionPosition.ForKey.get(key2, sstable.getPartitioner()), SSTableReader.Operator.EQ).position); + sstable.getPosition(PartitionPosition.ForKey.get(key2, sstable.getPartitioner()), SSTableReader.Operator.EQ).position); startPosition = Math.min(chunk1.offset, chunk2.offset); endPosition = Math.max(chunk1.offset + chunk1.length, chunk2.offset + chunk2.length); @@ -446,18 +517,31 @@ private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuf endPosition = Math.max(row0Start, row1Start); } - overrideWithGarbage(sstable, startPosition, endPosition); + overrideWithGarbage(sstable, startPosition, endPosition, junk); } private void overrideWithGarbage(SSTableReader sstable, long startPosition, long endPosition) throws IOException { - try (RandomAccessFile file = new RandomAccessFile(sstable.getFilename(), "rw")) + overrideWithGarbage(sstable, startPosition, endPosition, (byte) 'z'); + } + + private void overrideWithGarbage(SSTableReader sstable, long startPosition, long endPosition, byte junk) throws IOException + { + overrideWithGarbage(sstable.getFilename(), startPosition, endPosition, junk); + } + + private void overrideWithGarbage(String path, long startPosition, long endPosition, byte junk) throws IOException + { + try (RandomAccessFile file = new RandomAccessFile(path, "rw")) { file.seek(startPosition); - file.writeBytes(StringUtils.repeat('z', (int) (endPosition - startPosition))); + int length = (int) (endPosition - startPosition); + byte[] buff = new byte[length]; + Arrays.fill(buff, junk); + file.write(buff, 0, length); } if (ChunkCache.instance != null) - ChunkCache.instance.invalidateFile(sstable.getFilename()); + ChunkCache.instance.invalidateFile(path); } private static void assertOrderedAll(ColumnFamilyStore cfs, int expectedSize) @@ -494,10 +578,10 @@ protected void fillCF(ColumnFamilyStore cfs, int partitionsPerSSTable) cfs.forceBlockingFlush(); } - public static void fillIndexCF(ColumnFamilyStore cfs, boolean 
composite, long ... values) + public static void fillIndexCF(ColumnFamilyStore cfs, boolean composite, long... values) { assertEquals(0, values.length % 2); - for (int i = 0; i < values.length; i +=2) + for (int i = 0; i < values.length; i += 2) { UpdateBuilder builder = UpdateBuilder.create(cfs.metadata(), String.valueOf(i)); if (composite) @@ -518,17 +602,23 @@ public static void fillIndexCF(ColumnFamilyStore cfs, boolean composite, long .. cfs.forceBlockingFlush(); } - protected void fillCounterCF(ColumnFamilyStore cfs, int partitionsPerSSTable) throws WriteTimeoutException + protected String[] fillCounterCF(ColumnFamilyStore cfs, int partitionsPerSSTable) throws WriteTimeoutException { + SortedSet tokenSorted = Sets.newTreeSet(Comparator.comparing(a -> cfs.getPartitioner() + .decorateKey(ByteBufferUtil.bytes(a)))); + for (int i = 0; i < partitionsPerSSTable; i++) { PartitionUpdate update = UpdateBuilder.create(cfs.metadata(), String.valueOf(i)) .newRow("r1").add("val", 100L) .build(); + tokenSorted.add(String.valueOf(i)); new CounterMutation(new Mutation(update), ConsistencyLevel.ONE).apply(); } cfs.forceBlockingFlush(); + + return tokenSorted.toArray(ArrayUtils.EMPTY_STRING_ARRAY); } @Test @@ -624,18 +714,18 @@ public void testScrubTwice() throws IOException, ExecutionException, Interrupted } @SuppressWarnings("SameParameterValue") - private void testScrubIndex(String cfName, String colName, boolean composite, boolean ... scrubs) - throws IOException, ExecutionException, InterruptedException + private void testScrubIndex(String cfName, String colName, boolean composite, boolean... 
scrubs) + throws IOException, ExecutionException, InterruptedException { CompactionManager.instance.disableAutoCompaction(); ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName); int numRows = 1000; - long[] colValues = new long [numRows * 2]; // each row has two columns - for (int i = 0; i < colValues.length; i+=2) + long[] colValues = new long[numRows * 2]; // each row has two columns + for (int i = 0; i < colValues.length; i += 2) { colValues[i] = (i % 4 == 0 ? 1L : 2L); // index column - colValues[i+1] = 3L; //other column + colValues[i + 1] = 3L; //other column } fillIndexCF(cfs, composite, colValues); @@ -646,7 +736,7 @@ private void testScrubIndex(String cfName, String colName, boolean composite, bo // scrub index Set indexCfss = cfs.indexManager.getAllIndexColumnFamilyStores(); assertEquals(1, indexCfss.size()); - for(ColumnFamilyStore indexCfs : indexCfss) + for (ColumnFamilyStore indexCfs : indexCfss) { for (int i = 0; i < scrubs.length; i++) { @@ -655,11 +745,11 @@ private void testScrubIndex(String cfName, String colName, boolean composite, bo { //make sure the next scrub fails overrideWithGarbage(indexCfs.getLiveSSTables().iterator().next(), ByteBufferUtil.bytes(1L), ByteBufferUtil.bytes(2L)); } - CompactionManager.AllSSTableOpStatus result = indexCfs.scrub(false, false, false, true, false,0); + CompactionManager.AllSSTableOpStatus result = indexCfs.scrub(false, false, false, true, false, 0); assertEquals(failure ? 
CompactionManager.AllSSTableOpStatus.ABORTED : CompactionManager.AllSSTableOpStatus.SUCCESSFUL, - result); + result); } } @@ -804,7 +894,8 @@ public void testNoSkipScrubCorruptedCounterRowWithTool() throws IOException, Wri ToolRunner.invokeClass(StandaloneScrubber.class, ksName, COUNTER_CF); fail("Expected a CorruptSSTableException to be thrown"); } - catch (IOError err) { + catch (IOError err) + { assertTrue(err.getCause() instanceof CorruptSSTableException); } } diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java index 1895653ccd0b..30b1d5ca3221 100644 --- a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java @@ -23,14 +23,17 @@ import java.util.*; import java.util.concurrent.ExecutionException; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import com.google.common.collect.Iterables; import com.google.common.collect.Sets; +import org.junit.Ignore; import org.junit.Test; import org.apache.cassandra.Util; import org.apache.cassandra.UpdateBuilder; import org.apache.cassandra.concurrent.NamedThreadFactory; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.DeletionTime; @@ -620,7 +623,7 @@ private void testAbortHelper(boolean earlyException, boolean offline) CompactionController controller = new CompactionController(cfs, compacting, 0); LifecycleTransaction txn = offline ? 
LifecycleTransaction.offline(OperationType.UNKNOWN, compacting) : cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN); - SSTableRewriter rewriter = new SSTableRewriter(txn, 100, 10000000, false, true); + SSTableRewriter rewriter = new SSTableRewriter(txn, 100, 10000000, offline, true); CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID()) ) { @@ -812,42 +815,71 @@ public void testTwoWriters() } @Test - public void testCanonicalSSTables() throws ExecutionException, InterruptedException + public void testCanonicalSSTablesWithEarlyOpen() throws ExecutionException, InterruptedException { - Keyspace keyspace = Keyspace.open(KEYSPACE); - final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF); - truncate(cfs); + testCanonicalSSTables(1); + } - cfs.addSSTable(writeFile(cfs, 100)); - Collection allSSTables = cfs.getLiveSSTables(); - assertEquals(1, allSSTables.size()); - final AtomicBoolean done = new AtomicBoolean(false); - final AtomicBoolean failed = new AtomicBoolean(false); - Runnable r = () -> { - while (!done.get()) - { - Iterable sstables = cfs.getSSTables(SSTableSet.CANONICAL); - if (Iterables.size(sstables) != 1) + @Test + public void testCanonicalSSTablesWithFinalEarlyOpen() throws ExecutionException, InterruptedException + { + testCanonicalSSTables(1000000); + } + + @Test + @Ignore // This does not currently work. See View.select. 
+ public void testCanonicalSSTablesNoEarlyOpen() throws ExecutionException, InterruptedException + { + testCanonicalSSTables(-1); + } + + + public void testCanonicalSSTables(int preemptiveOpenInterval) throws ExecutionException, InterruptedException + { + int prevPreemptiveOpenInterval = DatabaseDescriptor.getSSTablePreemptiveOpenIntervalInMB(); + try + { + DatabaseDescriptor.setSSTablePreemptiveOpenIntervalInMB(preemptiveOpenInterval); + Keyspace keyspace = Keyspace.open(KEYSPACE); + final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF); + truncate(cfs); + + cfs.addSSTable(writeFile(cfs, 2000)); + Collection allSSTables = cfs.getLiveSSTables(); + assertEquals(1, allSSTables.size()); + final AtomicBoolean done = new AtomicBoolean(false); + final AtomicBoolean gotZero = new AtomicBoolean(false); + final AtomicInteger maxValue = new AtomicInteger(0); + Runnable r = () -> { + while (!done.get()) { - failed.set(true); - return; + Iterable sstables = cfs.getSSTables(SSTableSet.CANONICAL); + int sstablesCount = Iterables.size(sstables); + if (sstablesCount == 0) + gotZero.set(true); + else + maxValue.updateAndGet(prev -> Math.max(prev, sstablesCount)); } + }; + Thread t = NamedThreadFactory.createThread(r); + try + { + t.start(); + cfs.forceMajorCompaction(); } - }; - Thread t = NamedThreadFactory.createThread(r); - try - { - t.start(); - cfs.forceMajorCompaction(); + finally + { + done.set(true); + t.join(20); + } + // Note: the checks below can falsely succeed. Flaky failures should be treated as genuine problems. 
+ assertFalse("No sstables", gotZero.get()); + assertEquals("Too many sstables", 1, maxValue.get()); } finally { - done.set(true); - t.join(20); + DatabaseDescriptor.setSSTablePreemptiveOpenIntervalInMB(prevPreemptiveOpenInterval); } - assertFalse(failed.get()); - - } /** From 6e7c1287a80e2484f8c4b09dd9775ed39edd1dde Mon Sep 17 00:00:00 2001 From: dan jatnieks Date: Thu, 17 Jun 2021 03:03:36 -0700 Subject: [PATCH 019/151] =?UTF-8?q?STAR-583=20Fix=20infinite=20loop=20when?= =?UTF-8?q?=20replaying=20a=20truncated=20commit=20log=20file=E2=80=A6=20(?= =?UTF-8?q?#194)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit STAR-583 Fix infinite loop when replaying a truncated commit log file and truncation is tolerated Co-authored-by: Massimiliano Tomassi (cherry picked from commit 2bf6dd4e31c48a3567cb3cd4eb515da8005d8e10) --- .../cassandra/db/commitlog/CommitLog.java | 6 +-- .../db/commitlog/CommitLogSegmentReader.java | 9 +++- .../cassandra/db/commitlog/CommitLogTest.java | 43 +++++++++++++++++-- 3 files changed, 50 insertions(+), 8 deletions(-) diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java index a32b8a1030e7..3fdfaf44c127 100644 --- a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java +++ b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java @@ -200,10 +200,10 @@ public int recoverFiles(File... 
clogs) throws IOException return replayer.blockForWrites(); } - public void recoverPath(String path) throws IOException + public void recoverPath(String path, boolean tolerateTruncation) throws IOException { CommitLogReplayer replayer = CommitLogReplayer.construct(this, getLocalHostId()); - replayer.replayPath(new File(path), false); + replayer.replayPath(new File(path), tolerateTruncation); replayer.blockForWrites(); } @@ -217,7 +217,7 @@ private static UUID getLocalHostId() */ public void recover(String path) throws IOException { - recoverPath(path); + recoverPath(path, false); } /** diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentReader.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentReader.java index e23a915ba355..de4f135583ac 100644 --- a/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentReader.java +++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentReader.java @@ -87,9 +87,9 @@ protected SyncSegment computeNext() { while (true) { + final int currentStart = end; try { - final int currentStart = end; end = readSyncMarker(descriptor, currentStart, reader); if (end == -1) { @@ -133,6 +133,13 @@ protected SyncSegment computeNext() throw new RuntimeException(ioe); } } + + // if we've not been able to read the sync marker, or the file is truncated, + // then return end of data, otherwise continue the loop + if (currentStart == end) + { + return endOfData(); + } } } } diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java index da3b83ee6ce5..86a47febc74b 100644 --- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java +++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java @@ -74,8 +74,10 @@ import org.junit.After; import static org.apache.cassandra.db.commitlog.CommitLogSegment.ENTRY_OVERHEAD_SIZE; +import static org.apache.cassandra.db.commitlog.CommitLogSegment.SYNC_MARKER_SIZE; import static 
org.apache.cassandra.utils.ByteBufferUtil.bytes; +import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -262,6 +264,28 @@ public void testRecoveryWithShortSize() throws Exception }, CommitLogReplayException.class); } + @Test + public void testRecoveryWithTruncatedFileAndTruncationToleration() throws Exception + { + CommitLogDescriptor desc = new CommitLogDescriptor(CommitLogDescriptor.current_version, + CommitLogSegment.getNextId(), + DatabaseDescriptor.getCommitLogCompression(), + DatabaseDescriptor.getEncryptionContext()); + + byte[] randomData = new byte[100]; + (new java.util.Random()).nextBytes(randomData); + + // Simulates a truncated log segment section by writing a segment section marker with a section end offset + // that is greater than the log file size. + // + // This is achieved by using a data length greater than the actual data contents, which will be used when + // writing the segment marker. + int dataLength = randomData.length * 2; + + // Recovery should succeed when truncation toleration is specified + testRecovery(desc, randomData, dataLength, true); + } + @Test public void testRecoveryWithShortMutationSize() throws Exception { @@ -595,20 +619,31 @@ protected Void testRecovery(byte[] logData, int version) throws Exception return null; } - protected Void testRecovery(CommitLogDescriptor desc, byte[] logData) throws Exception + protected Void testRecovery(CommitLogDescriptor desc, byte[] logData, int dataLength, boolean tolerateTruncation) throws Exception { File logFile = tmpFile(desc.version); CommitLogDescriptor fromFile = CommitLogDescriptor.fromFileName(logFile.getName()); // Change id to match file. 
desc = new CommitLogDescriptor(desc.version, fromFile.id, desc.compression, desc.getEncryptionContext()); + ByteBuffer buf = ByteBuffer.allocate(1024); CommitLogDescriptor.writeHeader(buf, desc, getAdditionalHeaders(desc.getEncryptionContext())); + + // Write a section marker using the given data length + CommitLogSegment.writeSyncMarker(fromFile.id, buf, buf.position(), buf.position(), buf.position() + SYNC_MARKER_SIZE + dataLength); + + // Update buffer position for sync marker + buf.position(buf.position() + SYNC_MARKER_SIZE); + + // Add data to byte buffer + buf.put(logData); + try (OutputStream lout = new FileOutputStream(logFile)) { lout.write(buf.array(), 0, buf.position()); - lout.write(logData); + //statics make it annoying to test things correctly - CommitLog.instance.recover(logFile.getPath()); //CASSANDRA-1119 / CASSANDRA-1179 throw on failure*/ + CommitLog.instance.recoverPath(logFile.getPath(), tolerateTruncation); //CASSANDRA-1119 / CASSANDRA-1179 throw on failure } return null; } @@ -636,7 +671,7 @@ public void testRecoveryWithBadCompressor() throws Exception { CommitLogDescriptor desc = new CommitLogDescriptor(4, new ParameterizedClass("UnknownCompressor", null), EncryptionContextGenerator.createDisabledContext()); runExpecting(() -> { - testRecovery(desc, new byte[0]); + testRecovery(desc, new byte[0], 0, false); return null; }, CommitLogReplayException.class); } From 5d5368c719ed39b47d49ca3ef1ef0810e80a7e15 Mon Sep 17 00:00:00 2001 From: Stefania Alborghetti Date: Tue, 20 Feb 2018 11:41:22 +0800 Subject: [PATCH 020/151] STAR-593 Harden txn log files against exceptions Harden txn log files against exceptions when adding records and improve log messages Port of riptano/apollo@83b93bc434dc2f07371e848a9b24854403bf740e (cherry picked from commit 8175eb82b5f7c31ee77dfe13ac57eb40a647c775) --- .../cassandra/db/lifecycle/LogReplicaSet.java | 21 ++++++++++++++++--- .../db/lifecycle/LogTransaction.java | 4 +++- 2 files changed, 21 insertions(+), 4 
deletions(-) diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogReplicaSet.java b/src/java/org/apache/cassandra/db/lifecycle/LogReplicaSet.java index 0295357e8f0f..6a07392217b7 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/LogReplicaSet.java +++ b/src/java/org/apache/cassandra/db/lifecycle/LogReplicaSet.java @@ -222,13 +222,28 @@ void printContentsWithAnyErrors(StringBuilder str) */ void append(LogRecord record) { - Throwable err = Throwables.perform(null, replicas().stream().map(r -> () -> r.append(record))); + Throwable err = null; + int failed = 0; + for (LogReplica replica : replicas()) + { + try + { + replica.append(record); + } + catch (Throwable t) + { + logger.warn("Failed to add record to a replica: {}", t.getMessage()); + err = Throwables.merge(err, t); + failed++; + } + } + if (err != null) { - if (!record.isFinal() || err.getSuppressed().length == replicas().size() -1) + if (!record.isFinal() || failed == replicas().size()) Throwables.maybeFail(err); - logger.error("Failed to add record '{}' to some replicas '{}'", record, this); + logger.error("Failed to add record '{}' to some replicas '{}'", record, this, err); } } diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java index 85df4d64e04f..fd916864a879 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java +++ b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java @@ -548,7 +548,9 @@ static boolean removeUnfinishedLeftovers(Map.Entry> entry) } else { - logger.error("Unexpected disk state: failed to read transaction log {}", txn.toString(true)); + logger.error("Unexpected disk state: failed to read transaction log {}, " + + "check logs before last shutdown for any errors, and ensure txn log files were not edited manually.", + txn.toString(true)); return false; } } From 07666e00d0c9c76bae051a66c766592dc30f8d9b Mon Sep 17 00:00:00 2001 From: Ruslan Fomkin Date: Wed, 16 
Jun 2021 15:19:53 +0200 Subject: [PATCH 021/151] STAR-593 test patched LogReplicaSet.append The ported bug fix patch changes the implementation of LogReplicaSet.append in the error case, however no tests exist. This commit adds tests to cover the error path. It also changes the version of JUnit to use assertThrows. (cherry picked from commit ca66e38f764963d9bf4c1ea8018cb8e4cdad0cef) --- build.xml | 2 +- relocate-dependencies.pom | 2 +- .../db/lifecycle/LogReplicationSetTest.java | 78 +++++++++++++++++++ 3 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java diff --git a/build.xml b/build.xml index 95b237ee16c3..884d9cd60e51 100644 --- a/build.xml +++ b/build.xml @@ -523,7 +523,7 @@ - + diff --git a/relocate-dependencies.pom b/relocate-dependencies.pom index 07728dd405ae..d7c9150dda7d 100644 --- a/relocate-dependencies.pom +++ b/relocate-dependencies.pom @@ -36,7 +36,7 @@ UTF-8 UTF-8 - 4.12 + 4.13 ${java.version} ${java.version} 4.0.0-SNAPSHOT diff --git a/test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java b/test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java new file mode 100644 index 000000000000..6baccd6af1d1 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java @@ -0,0 +1,78 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.lifecycle; + +import java.io.File; +import java.util.ArrayList; + +import org.junit.Assert; +import org.junit.Test; + +import org.apache.cassandra.io.util.FileUtils; +import org.mockito.Mockito; + +public class LogReplicationSetTest +{ + @Test + public void shouldThrowIfAppendFailedToAllReplicas() throws Throwable + { + int nrReplicas = 2; + LogReplicaSet replicas = new LogReplicaSet(); + ArrayList spyFiles = getSpyFiles("testAppendFailedToAll", nrReplicas); + + replicas.addReplicas(spyFiles); + spyFiles.forEach(f -> Mockito.when(f.exists()).thenThrow(new RuntimeException())); + + Assert.assertThrows(RuntimeException.class, + () -> + replicas.append(LogRecord.makeAbort(System.currentTimeMillis()))); + } + + @Test + public void shouldNotThrowIfAppendFailedToSomeReplicas() throws Throwable + { + int nrReplicas = 2; + LogReplicaSet replicas = new LogReplicaSet(); + ArrayList spyFiles = getSpyFiles("testAppendFailedToSome", nrReplicas); + + replicas.addReplicas(spyFiles); + Mockito.when(spyFiles.get(0).exists()).thenThrow(new RuntimeException()); + } + + private ArrayList getSpyFiles(String testName, int nrReplicas) + { + ArrayList files = new ArrayList<>(nrReplicas); + for (int i = 0; i < nrReplicas; i++) + { + files.add(Mockito.spy(createTempFile(testName, i))); + } + return files; + } + + private static File createTempFile(String testName, int id) + { + String prefix = String.format("%s_%d", testName, id); + File dir = new File(FileUtils.getTempDir(), prefix); + + FileUtils.createDirectory(dir); + File file = FileUtils.createTempFile(prefix, "tmp", dir); + + file.deleteOnExit(); + dir.deleteOnExit(); + return file; + } +} From 06d7704df6182355dce4b53f1f390f74804f77b9 Mon Sep 17 00:00:00 2001 From: jacek-lewandowski Date: Fri, 6 Nov 2020 14:52:05 +0100 Subject: [PATCH 022/151] CORE-92: Remove component number argument from getComparatorSize That parameter was misleading, those methods do not use it at all Also refactored a bit 
DynamicCompositeTypeTest (cherry picked from commit b0dd03ebc8371f67a7098b46c5de60443e4db5fe) (cherry picked from commit 0779d149067647f6e57edf83bdd192caeba11254) --- .../db/marshal/AbstractCompositeType.java | 13 +++--- .../cassandra/db/marshal/CompositeType.java | 2 +- .../db/marshal/DynamicCompositeType.java | 3 +- .../db/marshal/DynamicCompositeTypeTest.java | 41 ++++++++----------- 4 files changed, 26 insertions(+), 33 deletions(-) diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java b/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java index 24d283457e99..86ac00f2fe73 100644 --- a/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java +++ b/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java @@ -62,8 +62,8 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right while (!accessorL.isEmptyFromOffset(left, offsetL) && !accessorR.isEmptyFromOffset(right, offsetL)) { AbstractType comparator = getComparator(i, left, accessorL, right, accessorR, offsetL, offsetR); - offsetL += getComparatorSize(i, left, accessorL, offsetL); - offsetR += getComparatorSize(i, right, accessorR, offsetR); + offsetL += getComparatorSize(left, accessorL, offsetL); + offsetR += getComparatorSize(right, accessorR, offsetR); VL value1 = accessorL.sliceWithShortLength(left, offsetL); offsetL += accessorL.sizeWithShortLength(value1); @@ -106,10 +106,9 @@ public ByteBuffer[] split(ByteBuffer bb) boolean isStatic = readIsStatic(bb, ByteBufferAccessor.instance); int offset = startingOffset(isStatic); - int i = 0; while (!ByteBufferAccessor.instance.isEmptyFromOffset(bb, offset)) { - offset += getComparatorSize(i++, bb, ByteBufferAccessor.instance, offset); + offset += getComparatorSize(bb, ByteBufferAccessor.instance, offset); ByteBuffer value = ByteBufferAccessor.instance.sliceWithShortLength(bb, offset); offset += ByteBufferAccessor.instance.sizeWithShortLength(value); l.add(value); @@ -188,7 +187,7 @@ public 
String getString(V input, ValueAccessor accessor) sb.append(":"); AbstractType comparator = getAndAppendComparator(i, input, accessor, sb, offset); - offset += getComparatorSize(i, input, accessor, offset); + offset += getComparatorSize(input, accessor, offset); V value = accessor.sliceWithShortLength(input, offset); offset += accessor.sizeWithShortLength(value); @@ -285,7 +284,7 @@ public void validate(V input, ValueAccessor accessor) while (!accessor.isEmptyFromOffset(input, offset)) { AbstractType comparator = validateComparator(i, input, accessor, offset); - offset += getComparatorSize(i, input, accessor, offset); + offset += getComparatorSize(input, accessor, offset); if (accessor.sizeFromOffset(input, offset) < 2) throw new MarshalException("Not enough bytes to read value size of component " + i); @@ -317,7 +316,7 @@ public TypeSerializer getSerializer() return BytesSerializer.instance; } - abstract protected int getComparatorSize(int i, V value, ValueAccessor accessor, int offset); + abstract protected int getComparatorSize(V value, ValueAccessor accessor, int offset); /** * @return the comparator for the given component. 
static CompositeType will consult * @param i DynamicCompositeType will read the type information from @param bb diff --git a/src/java/org/apache/cassandra/db/marshal/CompositeType.java b/src/java/org/apache/cassandra/db/marshal/CompositeType.java index bf5e914a9d9e..d8e0ac7b79e2 100644 --- a/src/java/org/apache/cassandra/db/marshal/CompositeType.java +++ b/src/java/org/apache/cassandra/db/marshal/CompositeType.java @@ -177,7 +177,7 @@ protected AbstractType validateComparator(int i, V value, ValueAccessor int getComparatorSize(int i, V value, ValueAccessor accessor, int offset) + protected int getComparatorSize(V value, ValueAccessor accessor, int offset) { return 0; } diff --git a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java index 5df36009956e..e0377fd5396b 100644 --- a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java +++ b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java @@ -94,7 +94,7 @@ protected int startingOffset(boolean isStatic) return 0; } - protected int getComparatorSize(int i, V value, ValueAccessor accessor, int offset) + protected int getComparatorSize(V value, ValueAccessor accessor, int offset) { int header = accessor.getShort(value, offset); if ((header & 0x8000) == 0) @@ -114,7 +114,6 @@ private AbstractType getComparator(V value, ValueAccessor accessor, in int header = accessor.getShort(value, offset); if ((header & 0x8000) == 0) { - String name = accessor.toString(accessor.slice(value, offset + 2, header)); return TypeParser.parse(name); } diff --git a/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java index 9f8eec3c21c7..1de4f20f3c95 100644 --- a/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java +++ b/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java @@ -20,11 +20,13 @@ import 
java.nio.ByteBuffer; import java.nio.charset.CharacterCodingException; -import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.UUID; +import java.util.stream.Stream; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableMap; import org.junit.BeforeClass; import org.junit.Test; import static org.junit.Assert.fail; @@ -45,25 +47,18 @@ public class DynamicCompositeTypeTest { private static final String KEYSPACE1 = "DynamicCompositeType"; private static final String CF_STANDARDDYNCOMPOSITE = "StandardDynamicComposite"; - private static Map> aliases = new HashMap<>(); - private static final DynamicCompositeType comparator; - static - { - aliases.put((byte)'b', BytesType.instance); - aliases.put((byte)'B', ReversedType.getInstance(BytesType.instance)); - aliases.put((byte)'t', TimeUUIDType.instance); - aliases.put((byte)'T', ReversedType.getInstance(TimeUUIDType.instance)); - comparator = DynamicCompositeType.getInstance(aliases); - } + public final static Map> aliases = ImmutableMap.>builder() + .put((byte) 'b', BytesType.instance) + .put((byte) 'B', ReversedType.getInstance(BytesType.instance)) + .put((byte) 't', TimeUUIDType.instance) + .put((byte) 'T', ReversedType.getInstance(TimeUUIDType.instance)) + .build(); - private static final int UUID_COUNT = 3; - private static final UUID[] uuids = new UUID[UUID_COUNT]; - static - { - for (int i = 0; i < UUID_COUNT; ++i) - uuids[i] = UUIDGen.getTimeUUID(); - } + public static final DynamicCompositeType comparator = DynamicCompositeType.getInstance(aliases); + + public static final int UUID_COUNT = 3; + public static final UUID[] uuids = Stream.generate(UUIDGen::getTimeUUID).limit(UUID_COUNT).toArray(UUID[]::new); @BeforeClass public static void defineSchema() throws ConfigurationException @@ -323,13 +318,13 @@ private ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean return createDynamicCompositeKey(s, uuid, i, 
lastIsOne, false); } - private ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean lastIsOne, - final boolean reversed) + @VisibleForTesting + public static ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean lastIsOne, boolean reversed) { String intType = (reversed ? "ReversedType(IntegerType)" : "IntegerType"); - ByteBuffer bytes = ByteBufferUtil.bytes(s); + ByteBuffer bytes = s != null ? ByteBufferUtil.bytes(s) : null; int totalSize = 0; - if (s != null) + if (bytes != null) { totalSize += 2 + 2 + bytes.remaining() + 1; if (uuid != null) @@ -344,7 +339,7 @@ private ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean ByteBuffer bb = ByteBuffer.allocate(totalSize); - if (s != null) + if (bytes != null) { bb.putShort((short)(0x8000 | (reversed ? 'B' : 'b'))); bb.putShort((short) bytes.remaining()); From 9b9775bcc46bc243728c874db6d84a0ff9764739 Mon Sep 17 00:00:00 2001 From: jacek-lewandowski Date: Fri, 6 Nov 2020 14:59:56 +0100 Subject: [PATCH 023/151] CORE-92: Remove misleading method readCollectionSize from CollectionSerializer That method is misleading because it accepts a byte buffer as an input source but unlike any other byte buffer reading method it does not shift the buffer position (cherry picked from commit 1ffd9284de0925e753c5a6ac45c4348699cf5f01) (cherry picked from commit 2be1e6051d094cfc6ec307de5f6853ff70655a64) --- src/java/org/apache/cassandra/cql3/CQL3Type.java | 2 +- src/java/org/apache/cassandra/db/marshal/ListType.java | 2 +- src/java/org/apache/cassandra/db/marshal/MapType.java | 2 +- .../apache/cassandra/serializers/CollectionSerializer.java | 5 ----- src/java/org/apache/cassandra/serializers/MapSerializer.java | 4 ++-- src/java/org/apache/cassandra/serializers/SetSerializer.java | 4 ++-- 6 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/java/org/apache/cassandra/cql3/CQL3Type.java b/src/java/org/apache/cassandra/cql3/CQL3Type.java index 5059104446e5..421b44684e6d 
100644 --- a/src/java/org/apache/cassandra/cql3/CQL3Type.java +++ b/src/java/org/apache/cassandra/cql3/CQL3Type.java @@ -196,7 +196,7 @@ public String toCQLLiteral(ByteBuffer buffer, ProtocolVersion version) StringBuilder target = new StringBuilder(); buffer = buffer.duplicate(); - int size = CollectionSerializer.readCollectionSize(buffer, version); + int size = CollectionSerializer.readCollectionSize(buffer, ByteBufferAccessor.instance, version); buffer.position(buffer.position() + CollectionSerializer.sizeOfCollectionSize(size, version)); switch (type.kind) diff --git a/src/java/org/apache/cassandra/db/marshal/ListType.java b/src/java/org/apache/cassandra/db/marshal/ListType.java index cc6393751bcc..cee3cd2c4c77 100644 --- a/src/java/org/apache/cassandra/db/marshal/ListType.java +++ b/src/java/org/apache/cassandra/db/marshal/ListType.java @@ -245,7 +245,7 @@ public static String setOrListToJsonString(ByteBuffer buffer, AbstractType eleme { ByteBuffer value = buffer.duplicate(); StringBuilder sb = new StringBuilder("["); - int size = CollectionSerializer.readCollectionSize(value, protocolVersion); + int size = CollectionSerializer.readCollectionSize(value, ByteBufferAccessor.instance, protocolVersion); int offset = CollectionSerializer.sizeOfCollectionSize(size, protocolVersion); for (int i = 0; i < size; i++) { diff --git a/src/java/org/apache/cassandra/db/marshal/MapType.java b/src/java/org/apache/cassandra/db/marshal/MapType.java index 9473e2913618..991ae08048a6 100644 --- a/src/java/org/apache/cassandra/db/marshal/MapType.java +++ b/src/java/org/apache/cassandra/db/marshal/MapType.java @@ -286,7 +286,7 @@ public String toJSONString(ByteBuffer buffer, ProtocolVersion protocolVersion) { ByteBuffer value = buffer.duplicate(); StringBuilder sb = new StringBuilder("{"); - int size = CollectionSerializer.readCollectionSize(value, protocolVersion); + int size = CollectionSerializer.readCollectionSize(value, ByteBufferAccessor.instance, protocolVersion); int offset = 
CollectionSerializer.sizeOfCollectionSize(size, protocolVersion); for (int i = 0; i < size; i++) { diff --git a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java index eb2991b8d78c..204261d46fd7 100644 --- a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java +++ b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java @@ -91,11 +91,6 @@ protected static void writeCollectionSize(ByteBuffer output, int elements, Proto output.putInt(elements); } - public static int readCollectionSize(ByteBuffer input, ProtocolVersion version) - { - return readCollectionSize(input, ByteBufferAccessor.instance, version); - } - public static int readCollectionSize(V value, ValueAccessor accessor, ProtocolVersion version) { return accessor.toInt(value); diff --git a/src/java/org/apache/cassandra/serializers/MapSerializer.java b/src/java/org/apache/cassandra/serializers/MapSerializer.java index 9eae598003ba..867308404336 100644 --- a/src/java/org/apache/cassandra/serializers/MapSerializer.java +++ b/src/java/org/apache/cassandra/serializers/MapSerializer.java @@ -149,7 +149,7 @@ public ByteBuffer getSerializedValue(ByteBuffer collection, ByteBuffer key, Abst try { ByteBuffer input = collection.duplicate(); - int n = readCollectionSize(input, ProtocolVersion.V3); + int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); int offset = sizeOfCollectionSize(n, ProtocolVersion.V3); for (int i = 0; i < n; i++) { @@ -185,7 +185,7 @@ public ByteBuffer getSliceFromSerialized(ByteBuffer collection, try { ByteBuffer input = collection.duplicate(); - int n = readCollectionSize(input, ProtocolVersion.V3); + int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); input.position(input.position() + sizeOfCollectionSize(n, ProtocolVersion.V3)); int startPos = input.position(); int count = 0; diff --git 
a/src/java/org/apache/cassandra/serializers/SetSerializer.java b/src/java/org/apache/cassandra/serializers/SetSerializer.java index 0b7a2a5fa2ec..aae78d861ec7 100644 --- a/src/java/org/apache/cassandra/serializers/SetSerializer.java +++ b/src/java/org/apache/cassandra/serializers/SetSerializer.java @@ -157,7 +157,7 @@ public ByteBuffer getSerializedValue(ByteBuffer input, ByteBuffer key, AbstractT { try { - int n = readCollectionSize(input, ProtocolVersion.V3); + int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); int offset = sizeOfCollectionSize(n, ProtocolVersion.V3); for (int i = 0; i < n; i++) @@ -193,7 +193,7 @@ public ByteBuffer getSliceFromSerialized(ByteBuffer collection, try { ByteBuffer input = collection.duplicate(); - int n = readCollectionSize(input, ProtocolVersion.V3); + int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); input.position(input.position() + sizeOfCollectionSize(n, ProtocolVersion.V3)); int startPos = input.position(); int count = 0; From 8b066116dc7591888859b436b2dd5b7621c05d71 Mon Sep 17 00:00:00 2001 From: jacek-lewandowski Date: Fri, 13 Nov 2020 12:05:02 +0100 Subject: [PATCH 024/151] CORE-93: Add ByteComparable, ByteSource and related stuff (cherry picked from commit c76194ed765811b14c8e85bb577a9945d41cc1bd) (cherry picked from commit 3cdb4c0148db34587708f306e67bf9277054bd07) --- .../cassandra/utils/ByteComparable.java | 166 +++++ .../apache/cassandra/utils/ByteSource.java | 699 ++++++++++++++++++ 2 files changed, 865 insertions(+) create mode 100644 src/java/org/apache/cassandra/utils/ByteComparable.java create mode 100644 src/java/org/apache/cassandra/utils/ByteSource.java diff --git a/src/java/org/apache/cassandra/utils/ByteComparable.java b/src/java/org/apache/cassandra/utils/ByteComparable.java new file mode 100644 index 000000000000..05e53e682460 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/ByteComparable.java @@ -0,0 +1,166 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils; + +import java.nio.ByteBuffer; + +import static org.apache.cassandra.utils.ByteSource.END_OF_STREAM; + +/** + * Interface indicating a value can be represented/identified by a comparable {@link ByteSource}. + */ +public interface ByteComparable +{ + /** + * Returns a source that generates the byte-comparable representation of the value byte by byte. + */ + ByteSource asComparableBytes(Version version); + + default ByteSource.Peekable asPeekableBytes(Version version) + { + return ByteSource.peekable(asComparableBytes(version)); + } + + enum Version + { + LEGACY, + OSS41, // CASSANDRA 4.1 encoding, used in trie-based indices + } + + ByteComparable EMPTY = (Version version) -> ByteSource.EMPTY; + + /** + * Construct a human-readable string from the byte-comparable representation. Used for debugging. 
+ */ + default String byteComparableAsString(Version version) + { + StringBuilder builder = new StringBuilder(); + ByteSource stream = asComparableBytes(version); + if (stream == null) + return "null"; + for (int b = stream.next(); b != END_OF_STREAM; b = stream.next()) + builder.append(Integer.toHexString((b >> 4) & 0xF)).append(Integer.toHexString(b & 0xF)); + return builder.toString(); + } + + // Simple factories used for testing + + static ByteComparable of(String s) + { + return v -> ByteSource.of(s, v); + } + + static ByteComparable of(long value) + { + return v -> ByteSource.of(value); + } + + static ByteComparable of(int value) + { + return v -> ByteSource.of(value); + } + + static ByteComparable fixedLength(ByteBuffer bytes) + { + return v -> ByteSource.fixedLength(bytes); + } + + static ByteComparable fixedLength(byte[] bytes) + { + return v -> ByteSource.fixedLength(bytes); + } + + /** + * Returns a separator for two byte sources, i.e. something that is definitely > prevMax, and <= currMin, assuming + * prevMax < currMin. + * This returns the shortest prefix of currMin that is greater than prevMax. + */ + static ByteComparable separatorPrefix(ByteComparable prevMax, ByteComparable currMin) + { + return version -> ByteSource.separatorPrefix(prevMax.asComparableBytes(version), currMin.asComparableBytes(version)); + } + + /** + * Returns a separator for two byte comparable, i.e. something that is definitely > prevMax, and <= currMin, assuming + * prevMax < currMin. + * This is a stream of length 1 longer than the common prefix of the two streams, with last byte one higher than the + * prevMax stream. 
+ */ + static ByteComparable separatorGt(ByteComparable prevMax, ByteComparable currMin) + { + return version -> ByteSource.separatorGt(prevMax.asComparableBytes(version), currMin.asComparableBytes(version)); + } + + static ByteComparable cut(ByteComparable src, int cutoff) + { + return version -> ByteSource.cut(src.asComparableBytes(version), cutoff); + } + + /** + * Return the length of a byte comparable, not including the terminator byte. + */ + static int length(ByteComparable src, Version version) + { + int l = 0; + ByteSource s = src.asComparableBytes(version); + while (s.next() != END_OF_STREAM) + ++l; + return l; + } + + /** + * Compare two byte-comparable values by their byte-comparable representation. Used for tests. + * + * @return the result of the lexicographic unsigned byte comparison of the byte-comparable representations of the + * two arguments + */ + static int compare(ByteComparable bytes1, ByteComparable bytes2, Version version) + { + ByteSource s1 = bytes1.asComparableBytes(version); + ByteSource s2 = bytes2.asComparableBytes(version); + + if (s1 == null || s2 == null) + return Boolean.compare(s1 != null, s2 != null); + + while (true) + { + int b1 = s1.next(); + int b2 = s2.next(); + int cmp = Integer.compare(b1, b2); + if (cmp != 0) + return cmp; + if (b1 == ByteSource.END_OF_STREAM) + return 0; + } + } + + /** + * Returns the length of the minimum prefix that differentiates the two given byte-comparable representations. 
+ */ + static int diffPoint(ByteComparable bytes1, ByteComparable bytes2, Version version) + { + ByteSource s1 = bytes1.asComparableBytes(version); + ByteSource s2 = bytes2.asComparableBytes(version); + int pos = 1; + int b; + while ((b = s1.next()) == s2.next() && b != END_OF_STREAM) + ++pos; + return pos; + } +} diff --git a/src/java/org/apache/cassandra/utils/ByteSource.java b/src/java/org/apache/cassandra/utils/ByteSource.java new file mode 100644 index 000000000000..6326861b28f4 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/ByteSource.java @@ -0,0 +1,699 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.utils; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; + +import org.apache.cassandra.utils.ByteComparable.Version; + +import static com.google.common.base.Preconditions.checkArgument; + +/** + * A stream of bytes, used for byte-order-comparable representations of data. + */ +public interface ByteSource +{ + /** Get the next byte, unsigned. Must be between 0 and 255, or END_OF_STREAM if there are no more bytes. */ + int next(); + + /** Value returned if at the end of the stream. 
*/ + int END_OF_STREAM = -1; + + ByteSource EMPTY = () -> END_OF_STREAM; + + /** + * Escape value. Used, among other things, to mark the end of subcomponents (so that shorter compares before anything longer). + * Actual zeros in input need to be escaped if this is in use (see BufferReinterpreter). + */ + int ESCAPE = 0x00; + + // Zeros are encoded as a sequence of ESCAPE, 0 or more of ESCAPED_0_CONT, ESCAPED_0_DONE so zeroed spaces only grow by 1 byte + int ESCAPED_0_CONT = 0xFE; + int ESCAPED_0_DONE = 0xFF; + + // All separators must be within these bounds + int MIN_SEPARATOR = 0x10; + int MAX_SEPARATOR = 0xEF; + + // Next component marker. + int NEXT_COMPONENT = 0x40; + int NEXT_COMPONENT_NULL = 0x3F; + int NEXT_COMPONENT_NULL_REVERSED = 0x41; + // Default terminator byte in sequences. Smaller than NEXT_COMPONENT_NULL, but larger than LT_NEXT_COMPONENT to + // ensure lexicographic compares go in the correct direction + int TERMINATOR = 0x38; + // These are special endings, for exclusive/inclusive bounds (i.e. smaller than anything with more components, bigger than anything with more components) + int LT_NEXT_COMPONENT = 0x20; + int GT_NEXT_COMPONENT = 0x60; + + /** + * Reinterprets a byte buffer as a byte-comparable source that has 0s escaped and finishes in an escape. + * This provides a weakly-prefix-free byte-comparable version of the content to use in sequences. + * (See ByteSource.BufferReinterpreter/Multi for explanation.) + */ + static ByteSource of(ByteBuffer buf, Version version) + { + return new BufferReinterpreter(buf, version); + } + + /** + * Reinterprets a byte array as a byte-comparable source that has 0s escaped and finishes in an escape. + * This provides a weakly-prefix-free byte-comparable version of the content to use in sequences. + * (See ByteSource.BufferReinterpreter/Multi for explanation.) 
+ */ + static ByteSource of(byte[] buf, Version version) + { + return new ReinterpreterArray(buf, version); + } + + /** + * Combines a chain of sources, turning their weak-prefix-free byte-comparable representation into the combination's + * prefix-free byte-comparable representation, with the included terminator character. + * For correctness, the terminator must be within MIN-MAX_SEPARATOR and different from NEXT_COMPONENT+/-1. + * Typically TERMINATOR, or LT/GT_NEXT_COMPONENT if used for partially specified bounds. + */ + static ByteSource withTerminator(int terminator, ByteSource... srcs) + { + return new Multi(srcs, terminator); + } + + static ByteSource of(String s, Version version) + { + return new ReinterpreterArray(s.getBytes(StandardCharsets.UTF_8), version); + } + + static ByteSource of(long value) + { + return new Number(value ^ (1L<<63), 8); + } + + static ByteSource of(int value) + { + return new Number(value ^ (1L<<31), 4); + } + + /** + * Produce a source for a signed fixed-length number, also translating empty to null. + * The first byte has its sign bit inverted, and the rest are passed unchanged. + * Presumes that the length of the buffer is always either 0 or constant for the type, which permits decoding and + * ensures the representation is prefix-free. + */ + static ByteSource optionalSignedFixedLengthNumber(ByteBuffer b) + { + return b.hasRemaining() ? signedFixedLengthNumber(b) : null; + } + + /** + * Produce a source for a signed fixed-length number. + * The first byte has its sign bit inverted, and the rest are passed unchanged. + * Presumes that the length of the buffer is always constant for the type. + */ + static ByteSource signedFixedLengthNumber(ByteBuffer b) + { + return new SignedFixedLengthNumber(b); + } + + /** + * Produce a source for a signed fixed-length floating-point number, also translating empty to null. + * If sign bit is on, returns negated bytes. If not, add the sign bit value. 
+ * (Sign of IEEE floats is the highest bit, the rest can be compared in magnitude by byte comparison.) + * Presumes that the length of the buffer is always either 0 or constant for the type, which permits decoding and + * ensures the representation is prefix-free. + */ + static ByteSource optionalSignedFixedLengthFloat(ByteBuffer b) + { + return b.hasRemaining() ? signedFixedLengthFloat(b) : null; + } + + /** + * Produce a source for a signed fixed-length floating-point number. + * If sign bit is on, returns negated bytes. If not, add the sign bit value. + * (Sign of IEEE floats is the highest bit, the rest can be compared in magnitude by byte comparison.) + * Presumes that the length of the buffer is always constant for the type. + */ + static ByteSource signedFixedLengthFloat(ByteBuffer b) + { + return new SignedFixedLengthFloat(b); + } + + /** + * Returns a separator for two byte sources, i.e. something that is definitely > prevMax, and <= currMin, assuming + * prevMax < currMin. + * This returns the shortest prefix of currMin that is greater than prevMax. + */ + public static ByteSource separatorPrefix(ByteSource prevMax, ByteSource currMin) + { + return new Separator(prevMax, currMin, true); + } + + /** + * Returns a separator for two byte sources, i.e. something that is definitely > prevMax, and <= currMin, assuming + * prevMax < currMin. + * This is a source of length 1 longer than the common prefix of the two sources, with last byte one higher than the + * prevMax source. 
+ */ + public static ByteSource separatorGt(ByteSource prevMax, ByteSource currMin) + { + return new Separator(prevMax, currMin, false); + } + + public static ByteSource oneByte(int i) + { + assert i >= 0 && i <= 0xFF; + return new ByteSource() + { + boolean given = false; + public int next() + { + if (given) + return END_OF_STREAM; + given = true; + return i; + } + }; + } + + public static ByteSource cut(ByteSource src, int cutoff) + { + return new ByteSource() + { + int pos = 0; + + @Override + public int next() + { + return pos++ < cutoff ? src.next() : END_OF_STREAM; + } + }; + } + + /** + * Wrap a ByteSource in a length-fixing facade. + * + * If the length of {@code src} is less than {@code cutoff}, then pad it on the right with {@code padding} until + * the overall length equals {@code cutoff}. If the length of {@code src} is greater than {@code cutoff}, then + * truncate {@code src} to that size. Effectively a noop if {@code src} happens to have length {@code cutoff}. + * + * @param src the input source to wrap + * @param cutoff the size of the source returned + * @param padding the padding value, returned as-is for every padded position (no 0xFF mask is applied, so it should be in the range 0-255) + * @return a source that yields {@code src} truncated or right-padded to {@code cutoff} bytes + */ + public static ByteSource cutOrRightPad(ByteSource src, int cutoff, int padding) + { + return new ByteSource() + { + int pos = 0; + + @Override + public int next() + { + if (pos++ >= cutoff) + { + return END_OF_STREAM; + } + int next = src.next(); + return next == END_OF_STREAM ? padding : next; + } + }; + } + + + static ByteSource MAX = new ByteSource() + { + public int next() + { + return 0xFF; + } + + public String toString() + { + return "MAX"; + } + }; + + /** + * Returns a maximal ByteSource, i.e. something that compares greater than any other byte source. + * This is an infinite sequence of 0xFF. + * + * Note that since the sequence is infinite, trying to calculate this item's length, copying it, trying + * to store it in a trie, or comparing it to another max will result in an infinite loop. 
+ */ + public static ByteSource max() + { + return MAX; + } + + /** + * Variable-length encoding. Escapes 0s as ESCAPE + zero or more ESCAPED_0_CONT + ESCAPED_0_DONE. + * Finishes with an escape value (to which Multi will add non-zero component separator) + * E.g. A00B translates to 4100FEFF4200 + * A0B0 4100FF4200FE (+00 for {@link Version#LEGACY}) + * A0 4100FE (+00 for {@link Version#LEGACY}) + * + * If in a single byte source, the bytes could be simply passed unchanged, but this would not allow us to + * combine components. This translation preserves order, and since the encoding for 0 is higher than the separator + * also makes sure shorter components are treated as smaller. + * + * The encoding is not prefix-free, since e.g. the encoding of "A" (4100) is a prefix of the encoding of "A0" + * (4100FE), but the byte following the prefix is guaranteed to be FE or FF, which makes the encoding weakly + * prefix-free. Additionally, any such prefix sequence will compare smaller than the value to which it is a prefix, + * because any permitted separator byte will be smaller than the byte following the prefix. 
+ */ + static abstract class AbstractReinterpreter implements ByteSource + { + final Version version; + int bufpos; + boolean escaped; + + AbstractReinterpreter(int position, Version version) + { + this.bufpos = position; + this.version = version; + } + + public final int next() + { + if (bufpos >= limit()) + { + if (bufpos > limit()) + return END_OF_STREAM; + + ++bufpos; + if (escaped) + { + escaped = false; + if (version == Version.LEGACY) + --bufpos; // place an ESCAPE at the end of sequence ending in ESCAPE + return ESCAPED_0_CONT; + } + return ESCAPE; + } + + int index = bufpos++; + int b = get(index) & 0xFF; + if (!escaped) + { + if (b == ESCAPE) + escaped = true; + return b; + } + else + { + if (b == ESCAPE) + return ESCAPED_0_CONT; + --bufpos; + escaped = false; + return ESCAPED_0_DONE; + } + } + + protected abstract byte get(int index); + + protected abstract int limit(); + } + + static class BufferReinterpreter extends AbstractReinterpreter + { + final ByteBuffer buf; + + private BufferReinterpreter(ByteBuffer buf, Version version) + { + super(buf.position(), version); + this.buf = buf; + } + + protected int limit() + { + return buf.limit(); + } + + protected byte get(int index) + { + return buf.get(index); + } + } + + static class ReinterpreterArray extends AbstractReinterpreter + { + final byte[] buf; + + private ReinterpreterArray(byte[] buf, Version version) + { + super(0, version); + this.buf = buf; + } + + @Override + protected byte get(int index) + { + return buf[index]; + } + + @Override + protected int limit() + { + return buf.length; + } + } + + /** + * Fixed length signed number encoding. Inverts first bit (so that neg < pos), then just posts all bytes from the + * buffer. Assumes buffer is of correct length. 
+ */ + static class SignedFixedLengthNumber implements ByteSource + { + ByteBuffer buf; + int bufpos; + + public SignedFixedLengthNumber(ByteBuffer buf) + { + this.buf = buf; + bufpos = buf.position(); + } + + public int next() + { + if (bufpos >= buf.limit()) + return END_OF_STREAM; + int v = buf.get(bufpos) & 0xFF; + if (bufpos == buf.position()) + v ^= 0x80; + ++bufpos; + return v; + } + } + + static class Number implements ByteSource + { + final long value; + int pos; + + public Number(long value, int length) + { + this.value = value; + this.pos = length; + } + + public int next() + { + if (pos == 0) + return END_OF_STREAM; + return (int) ((value >> (--pos * 8)) & 0xFF); + } + } + + /** + * Fixed length signed floating point number encoding. First bit is sign. If positive, add sign bit value to make + * greater than all negatives. If not, invert all content to make negatives with bigger magnitude smaller. + */ + static class SignedFixedLengthFloat implements ByteSource + { + final ByteBuffer buf; + int bufpos; + boolean invert; + + public SignedFixedLengthFloat(ByteBuffer buf) + { + this.buf = buf; + this.bufpos = buf.position(); + } + + public int next() + { + if (bufpos >= buf.limit()) + return END_OF_STREAM; + int v = buf.get(bufpos) & 0xFF; + if (bufpos == buf.position()) + { + invert = v >= 0x80; + v |= 0x80; + } + if (invert) + v = v ^ 0xFF; + ++bufpos; + return v; + } + } + + /** + * Combination of multiple byte sources. Adds NEXT_COMPONENT before sources, or NEXT_COMPONENT_NULL if next is null. 
+ */ + static class Multi implements ByteSource + { + final ByteSource[] srcs; + int srcnum = -1; + int terminator; + + Multi(ByteSource[] srcs, int terminator) + { + this.srcs = srcs; + this.terminator = terminator; + } + + public int next() + { + if (srcnum == srcs.length) + return END_OF_STREAM; + + int b = END_OF_STREAM; + if (srcnum >= 0 && srcs[srcnum] != null) + b = srcs[srcnum].next(); + if (b > END_OF_STREAM) + return b; + + ++srcnum; + if (srcnum == srcs.length) + return terminator; + if (srcs[srcnum] == null) + return NEXT_COMPONENT_NULL; + return NEXT_COMPONENT; + } + } + + /** + * Construct the shortest byte stream that separates prevMax and currMin: their common prefix followed by one distinguishing byte. + * If {@code useCurr == true} the last byte of the returned stream comes from {@code currMin} and is the first + * byte which is greater than the byte at the corresponding position of {@code prevMax}. + * Otherwise, the last byte of the returned stream comes from {@code prevMax} and is incremented by one, still + * guaranteeing that it is <= the byte at the corresponding position of {@code currMin}. + */ + static class Separator implements ByteSource + { + final ByteSource prev; + final ByteSource curr; + boolean done = false; + final boolean useCurr; + + Separator(ByteSource prevMax, ByteSource currMin, boolean useCurr) + { + this.prev = prevMax; + this.curr = currMin; + this.useCurr = useCurr; + } + + public int next() + { + if (done) + return END_OF_STREAM; + int p = prev.next(); + int c = curr.next(); + assert p <= c : prev + " not less than " + curr; + if (p == c) + return c; + done = true; + return useCurr ? c : p + 1; + } + } + + static ByteSource optionalFixedLength(ByteBuffer b) + { + return b.hasRemaining() ? fixedLength(b) : null; + } + + /** + * A byte source of the given bytes without any encoding. + * The resulting source is only guaranteed to give correct comparison results and be prefix-free if the + * underlying type has a fixed length. 
+ * In tests, this method is also used to generate non-escaped test cases. + */ + public static ByteSource fixedLength(ByteBuffer b) + { + return new ByteSource() + { + int pos = b.position() - 1; + + @Override + public int next() + { + return ++pos < b.limit() ? b.get(pos) & 0xFF : -1; + } + }; + } + + /** + * A byte source of the given bytes without any encoding. + * If used in a sequence, the resulting source is only guaranteed to give correct comparison results if the + * underlying type has a fixed length. + * In tests, this method is also used to generate non-escaped test cases. + */ + public static ByteSource fixedLength(byte[] b) + { + return fixedLength(b, 0, b.length); + } + + public static ByteSource fixedLength(byte[] b, int offset, int length) + { + checkArgument(offset >= 0 && offset <= b.length); + checkArgument(length >= 0 && offset + length <= b.length); + + return new ByteSource() + { + int pos = offset - 1; + + @Override + public int next() + { + return ++pos < offset + length ? b[pos] & 0xFF : END_OF_STREAM; + } + }; + } + + public static ByteSource fourBit(ByteSource s) + { + return new ByteSource() + { + int pos = 0; + int v = 0; + + @Override + public int next() + { + if ((pos++ & 1) == 0) + { + v = s.next(); + if (v == END_OF_STREAM) + return END_OF_STREAM; + return (v >> 4) & 0xF; + } + else + return v & 0xF; + } + }; + } + + /** + * Splits each byte into portions of bitCount bits. + * @param s source + * @param bitCount number of bits to issue at a time, 1-4 make sense + */ + public static ByteSource splitBytes(ByteSource s, int bitCount) + { + return new ByteSource() + { + int pos = 8; + int v = 0; + int mask = (1 << bitCount) - 1; + + @Override + public int next() + { + if ((pos += bitCount) >= 8) + { + pos = 0; + v = s.next(); + if (v == END_OF_STREAM) + return END_OF_STREAM; + } + v <<= bitCount; + return (v >> 8) & mask; + } + }; + } + + /** + * Returns the key that is immediately after src in the topology. 
+ * @param src + * @return src with added 00 byte at the end + */ + public static ByteSource nextKey(ByteSource src) + { + return new ByteSource() + { + boolean done = false; + + @Override + public int next() + { + if (done) + return END_OF_STREAM; + int n = src.next(); + if (n != END_OF_STREAM) + return n; + + done = true; + return 0; + } + }; + } + + public class Peekable implements ByteSource + { + static final int NONE = Integer.MIN_VALUE; + + final ByteSource wrapped; + int peeked = NONE; + + public Peekable(ByteSource wrapped) + { + this.wrapped = wrapped; + } + + public int next() + { + if (peeked != NONE) + { + int val = peeked; + peeked = NONE; + return val; + } + else + return wrapped.next(); + } + + public int peek() + { + if (peeked == NONE) + peeked = wrapped.next(); + return peeked; + } + } + + public static Peekable peekable(ByteSource p) + { + // When given a null source, we're better off not wrapping it and just returning null. This way existing + // code that doesn't know about ByteSource.Peekable, but handles correctly null ByteSources won't be thrown + // off by a non-null instance that semantically should have been null. + if (p == null) + return null; + return (p instanceof Peekable) + ? 
(Peekable) p + : new Peekable(p); + } +} From 6b5704a982eeff7b5dde6ddfddc5b14dc184ead7 Mon Sep 17 00:00:00 2001 From: jacek-lewandowski Date: Fri, 6 Nov 2020 16:17:54 +0100 Subject: [PATCH 025/151] CORE-93: Add isValueLengthFixed method to AbstractType (cherry picked from commit d4f1de75d0e65fd52323c70fa1f8e85fe4ff8e16) (cherry picked from commit 3ea8c7a1f34775639b169d117c5459f10feb295d) --- .../cassandra/db/marshal/AbstractType.java | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractType.java b/src/java/org/apache/cassandra/db/marshal/AbstractType.java index 19cf849dba06..4c886f31ef53 100644 --- a/src/java/org/apache/cassandra/db/marshal/AbstractType.java +++ b/src/java/org/apache/cassandra/db/marshal/AbstractType.java @@ -54,6 +54,8 @@ @Unmetered public abstract class AbstractType implements Comparator, AssignmentTestable { + private final static int VARIABLE_LENGTH = -1; + public final Comparator reverseComparator; public enum ComparisonType @@ -419,11 +421,28 @@ public List> getComponents() } /** - * The length of values for this type if all values are of fixed length, -1 otherwise. + * The length of values for this type if all values are of fixed length, -1 otherwise. This has an impact on + * serialization. + * + *

  • see {@link #writeValue}
  • + *
  • see {@link #read}
  • + *
  • see {@link #writtenLength}
  • + *
  • see {@link #skipValue}
  • + * */ public int valueLengthIfFixed() { - return -1; + return VARIABLE_LENGTH; + } + + /** + * Checks if all values are of fixed length. + * + * @return {@code true} if all values are of fixed length, {@code false} otherwise. + */ + public final boolean isValueLengthFixed() + { + return valueLengthIfFixed() != VARIABLE_LENGTH; } // This assumes that no empty values are passed From cfd23c81f94d253fbffe880fd8c643fc6cdd9dcd Mon Sep 17 00:00:00 2001 From: jacek-lewandowski Date: Fri, 6 Nov 2020 16:26:06 +0100 Subject: [PATCH 026/151] CORE-93: Add implementation of asComparableBytes to types (cherry picked from commit 783f1c1f1f5fb6739d61f099c19f571543701d19) (cherry picked from commit 851bd6ac2abbd8c11353616b5027a8cd75c8687a) --- .../cassandra/db/marshal/AbstractType.java | 34 ++++++ .../cassandra/db/marshal/BooleanType.java | 13 +++ .../apache/cassandra/db/marshal/ByteType.java | 11 ++ .../cassandra/db/marshal/CompositeType.java | 59 ++++++++++ .../apache/cassandra/db/marshal/DateType.java | 9 ++ .../cassandra/db/marshal/DecimalType.java | 109 ++++++++++++++++++ .../cassandra/db/marshal/DoubleType.java | 8 ++ .../db/marshal/DynamicCompositeType.java | 108 +++++++++++++++++ .../cassandra/db/marshal/EmptyType.java | 8 ++ .../cassandra/db/marshal/FloatType.java | 8 ++ .../cassandra/db/marshal/Int32Type.java | 8 ++ .../cassandra/db/marshal/IntegerType.java | 73 ++++++++++++ .../cassandra/db/marshal/LexicalUUIDType.java | 28 +++++ .../apache/cassandra/db/marshal/ListType.java | 33 +++++- .../apache/cassandra/db/marshal/LongType.java | 8 ++ .../apache/cassandra/db/marshal/MapType.java | 33 +++++- .../db/marshal/PartitionerDefinedOrder.java | 19 +++ .../cassandra/db/marshal/ReversedType.java | 56 +++++++++ .../apache/cassandra/db/marshal/SetType.java | 8 ++ .../cassandra/db/marshal/ShortType.java | 10 ++ .../cassandra/db/marshal/SimpleDateType.java | 13 ++- .../apache/cassandra/db/marshal/TimeType.java | 13 ++- .../cassandra/db/marshal/TimeUUIDType.java | 19 ++- 
.../cassandra/db/marshal/TimestampType.java | 9 +- .../cassandra/db/marshal/TupleType.java | 17 ++- .../apache/cassandra/db/marshal/UUIDType.java | 24 ++++ .../cassandra/utils/ByteBufferUtil.java | 20 ++++ 27 files changed, 751 insertions(+), 7 deletions(-) diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractType.java b/src/java/org/apache/cassandra/db/marshal/AbstractType.java index 4c886f31ef53..edd4de79e9d8 100644 --- a/src/java/org/apache/cassandra/db/marshal/AbstractType.java +++ b/src/java/org/apache/cassandra/db/marshal/AbstractType.java @@ -39,8 +39,11 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import org.github.jamm.Unmetered; +import static org.apache.cassandra.db.marshal.AbstractType.ComparisonType.BYTE_ORDER; import static org.apache.cassandra.db.marshal.AbstractType.ComparisonType.CUSTOM; /** @@ -587,6 +590,37 @@ public AssignmentTestable.TestResult testAssignment(AbstractType receiverType return AssignmentTestable.TestResult.NOT_ASSIGNABLE; } + /** + * Produce a byte-comparable representation of the given value, i.e. a sequence of bytes that compares the same way + * using lexicographical unsigned byte comparison as the original value using the type's comparator. + * + * We use a slightly stronger requirement to be able to use the types in tuples. Precisely, for any pair x, y of + * non-equal valid values of this type and any bytes b1, b2 between 0x10 and 0xEF, + * (+ stands for concatenation) + * compare(x, y) == compareLexicographicallyUnsigned(asByteComparable(x)+b1, asByteComparable(y)+b2) + * (i.e. the values compare like the original type, and an added 0x10-0xEF byte at the end does not change that) and: + * asByteComparable(x)+b1 is not a prefix of asByteComparable(y) (weakly prefix free) + * (i.e. 
a valid representation of a value may be a prefix of another valid representation of a value only if the + * following byte in the latter is smaller than 0x10 or larger than 0xEF). These properties are trivially true if + * the encoding compares correctly and is prefix free, but also permits a little more freedom that enables somewhat + * more efficient encoding of arbitrary-length byte-comparable blobs. + * + * Depending on the type, this method can be called for null or empty input, in which case the output is allowed to + * be null (the clustering/tuple encoding will accept and handle it). + */ + public ByteSource asComparableBytes(ByteBuffer byteBuffer, ByteComparable.Version version) + { + if (comparisonType == BYTE_ORDER) + { + // When a type is byte-ordered on its own, we only need to escape it, so that we can include it in + // multi-component types and make the encoding weakly-prefix-free. + return ByteSource.of(byteBuffer, version); + } + else + // default is only good for byte-comparables + throw new UnsupportedOperationException(getClass().getSimpleName() + " does not implement asComparableBytes"); + } + /** * This must be overriden by subclasses if necessary so that for any * AbstractType, this == TypeParser.parse(toString()). 
diff --git a/src/java/org/apache/cassandra/db/marshal/BooleanType.java b/src/java/org/apache/cassandra/db/marshal/BooleanType.java index 4ef5f95b0bfc..fff72203f657 100644 --- a/src/java/org/apache/cassandra/db/marshal/BooleanType.java +++ b/src/java/org/apache/cassandra/db/marshal/BooleanType.java @@ -26,6 +26,8 @@ import org.apache.cassandra.serializers.BooleanSerializer; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -54,6 +56,17 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return v1 - v2; } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version) + { + if (!buf.hasRemaining()) + return null; + byte b = buf.get(buf.position()); + if (b != 0) + b = 1; + return ByteSource.oneByte(b); + } + public ByteBuffer fromString(String source) throws MarshalException { diff --git a/src/java/org/apache/cassandra/db/marshal/ByteType.java b/src/java/org/apache/cassandra/db/marshal/ByteType.java index f94f4bb01cc5..e57b479ccb15 100644 --- a/src/java/org/apache/cassandra/db/marshal/ByteType.java +++ b/src/java/org/apache/cassandra/db/marshal/ByteType.java @@ -27,6 +27,9 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteComparable.Version; +import org.apache.cassandra.utils.ByteSource; public class ByteType extends NumberType { @@ -42,6 +45,14 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return accessorL.getByte(left, 0) - accessorR.getByte(right, 0); } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, Version version) + { + return version == 
Version.LEGACY + ? ByteSource.signedFixedLengthNumber(buf) + : ByteSource.optionalSignedFixedLengthNumber(buf); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. diff --git a/src/java/org/apache/cassandra/db/marshal/CompositeType.java b/src/java/org/apache/cassandra/db/marshal/CompositeType.java index d8e0ac7b79e2..dc4fdcc7112a 100644 --- a/src/java/org/apache/cassandra/db/marshal/CompositeType.java +++ b/src/java/org/apache/cassandra/db/marshal/CompositeType.java @@ -31,6 +31,8 @@ import org.apache.cassandra.exceptions.SyntaxException; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable.Version; +import org.apache.cassandra.utils.ByteSource; import static com.google.common.collect.Iterables.any; import static com.google.common.collect.Iterables.transform; @@ -165,6 +167,39 @@ protected AbstractType getAndAppendComparator(int i, V value, ValueAccess return types.get(i); } + @Override + public ByteSource asComparableBytes(ByteBuffer byteBuffer, Version version) + { + if (byteBuffer == null || byteBuffer.remaining() == 0) + return null; + + ByteSource[] srcs = new ByteSource[types.size() * 2 + 1]; + ByteBuffer bb = byteBuffer.duplicate(); + + // statics go first + boolean isStatic = readStatic(bb); + srcs[0] = isStatic ? null : ByteSource.EMPTY; + + int i = 0; + byte lastEoc = 0; + while (bb.remaining() > 0) + { + // Only the end-of-component byte of the last component of this composite can be non-zero, so the + // component before can't have a non-zero end-of-component byte. 
+ assert lastEoc == 0 : lastEoc; + + srcs[i * 2 + 1] = types.get(i).asComparableBytes(ByteBufferUtil.readBytesWithShortLength(bb), version); + lastEoc = bb.get(); + srcs[i * 2 + 2] = ByteSource.oneByte(lastEoc & 0xFF ^ 0x80); // end-of-component also takes part in comparison as signed byte + ++i; + } + if (i * 2 + 1 < srcs.length) + srcs = Arrays.copyOfRange(srcs, 0, i * 2 + 1); + + return ByteSource.withTerminator(version == Version.LEGACY ? ByteSource.END_OF_STREAM : ByteSource.TERMINATOR, + srcs); + } + protected ParsedComparator parseComparator(int i, String part) { return new StaticParsedComparator(types.get(i), part); @@ -389,4 +424,28 @@ public static V build(ValueAccessor accessor, boolean isStatic, V... valu out.flip(); return accessor.valueOf(out); } + + public static ByteBuffer build(boolean isStatic, ByteBuffer[] buffers, byte lastEoc) + { + int totalLength = isStatic ? 2 : 0; + for (ByteBuffer bb : buffers) + totalLength += 2 + bb.remaining() + 1; + + ByteBuffer out = ByteBuffer.allocate(totalLength); + + if (isStatic) + out.putShort((short)STATIC_MARKER); + + for (int i = 0; i < buffers.length; ++i) + { + ByteBuffer bb = buffers[i]; + ByteBufferUtil.writeShortLength(out, bb.remaining()); + int toCopy = bb.remaining(); + ByteBufferUtil.arrayCopy(bb, bb.position(), out, out.position(), toCopy); + out.position(out.position() + toCopy); + out.put(i != buffers.length - 1 ? 
(byte) 0 : lastEoc); + } + out.flip(); + return out; + } } diff --git a/src/java/org/apache/cassandra/db/marshal/DateType.java b/src/java/org/apache/cassandra/db/marshal/DateType.java index 473cedf40795..4e6aa5a27704 100644 --- a/src/java/org/apache/cassandra/db/marshal/DateType.java +++ b/src/java/org/apache/cassandra/db/marshal/DateType.java @@ -31,6 +31,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; /** * This is the old version of TimestampType, but has been replaced as it wasn't comparing pre-epoch timestamps @@ -50,6 +52,13 @@ public boolean isEmptyValueMeaningless() return true; } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version) + { + // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient. + return ByteSource.optionalFixedLength(buf); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
diff --git a/src/java/org/apache/cassandra/db/marshal/DecimalType.java b/src/java/org/apache/cassandra/db/marshal/DecimalType.java index 5740fdcc0fcb..d6d47d8e0f59 100644 --- a/src/java/org/apache/cassandra/db/marshal/DecimalType.java +++ b/src/java/org/apache/cassandra/db/marshal/DecimalType.java @@ -24,6 +24,8 @@ import java.nio.ByteBuffer; import java.util.Objects; +import com.google.common.primitives.Ints; + import org.apache.cassandra.cql3.CQL3Type; import org.apache.cassandra.cql3.Constants; import org.apache.cassandra.cql3.Term; @@ -32,6 +34,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; public class DecimalType extends NumberType { @@ -41,6 +45,12 @@ public class DecimalType extends NumberType private static final int MAX_SCALE = 1000; private static final MathContext MAX_PRECISION = new MathContext(10000); + // Constants or escaping values needed to encode/decode variable-length floating point numbers (decimals) in our + // custom byte-ordered encoding scheme. + private static final int POSITIVE_DECIMAL_HEADER_MASK = 0x80; + private static final int NEGATIVE_DECIMAL_HEADER_MASK = 0x00; + private static final int DECIMAL_EXPONENT_LENGTH_HEADER_MASK = 0x40; + DecimalType() {super(ComparisonType.CUSTOM);} // singleton public boolean isEmptyValueMeaningless() @@ -59,6 +69,105 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return compareComposed(left, accessorL, right, accessorR, this); } + /** + * Constructs a byte-comparable representation. + * This is rather difficult and involves reconstructing the decimal. + * + * To compare, we need a normalized value, i.e. one with a sign, exponent and (0,1) mantissa. To avoid + * loss of precision, both exponent and mantissa need to be base-100. 
We can't get this directly off the serialized + * bytes, as they have base-10 scale and base-256 unscaled part. + * + * We store: + * - sign bit inverted * 0x80 + 0x40 + signed exponent length, where exponent is negated if value is negative + * - zero or more exponent bytes (as given by length) + * - 0x80 + first pair of decimal digits, negative if value is negative, rounded to -inf + * - zero or more 0x80 + pair of decimal digits, always positive + * - trailing 0x00 + * Zero is special-cased as 0x80. + * + * Because the trailing 00 cannot be produced from a pair of decimal digits (positive or not), no value can be + * a prefix of another. + * + * Encoding examples: + * 1.1 as c1 = 0x80 (positive number) + 0x40 + (positive exponent) 0x01 (exp length 1) + * 01 = exponent 1 (100^1) + * 81 = 0x80 + 01 (0.01) + * 8a = 0x80 + 10 (....10) 0.0110e2 + * 00 + * -1 as 3f = 0x00 (negative number) + 0x40 - (negative exponent) 0x01 (exp length 1) + * ff = exponent -1. negative number, thus 100^1 + * 7f = 0x80 - 01 (-0.01) -0.01e2 + * 00 + * -99.9 as 3f = 0x00 (negative number) + 0x40 - (negative exponent) 0x01 (exp length 1) + * ff = exponent -1. negative number, thus 100^1 + * 1c = 0x80 - 100 (-1.00) + * 8a = 0x80 + 10 (+....10) -0.999e2 + * 00 + * + */ + @Override + public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version) + { + BigDecimal value = compose(buf); + if (value == null) + return null; + if (value.compareTo(BigDecimal.ZERO) == 0) // Note: 0.equals(0.0) returns false! + return ByteSource.oneByte(POSITIVE_DECIMAL_HEADER_MASK); + long scale = (((long) value.scale()) - value.precision()) & ~1; + boolean negative = value.signum() < 0; + final int negmul = negative ? 
-1 : 1; + // This should always fit into an int + final long exponent = (-scale * negmul) / 2; + // We should never have scale > Integer.MAX_VALUE, as we're always subtracting the non-negative precision of + // the encoded BigDecimal, and furthermore we're rounding to negative infinity. + if (scale > Integer.MAX_VALUE || scale < Integer.MIN_VALUE) + { + // We are practically out of range here, but let's handle that anyway + int mv = Long.signum(scale) * Integer.MAX_VALUE; + value = value.scaleByPowerOfTen(mv); + scale -= mv; + } + final BigDecimal mantissa = value.scaleByPowerOfTen(Ints.checkedCast(scale)).stripTrailingZeros(); + assert mantissa.abs().compareTo(BigDecimal.ONE) < 0; + + return new ByteSource() + { + int posInExp = 0; + BigDecimal current = mantissa; + + @Override + public int next() + { + if (posInExp < 5) + { + if (posInExp == 0) + { + int absexp = (int) (exponent < 0 ? -exponent : exponent); + while (posInExp < 5 && absexp >> (32 - ++posInExp * 8) == 0) {} + int explen = DECIMAL_EXPONENT_LENGTH_HEADER_MASK + (exponent < 0 ? -1 : 1) * (5 - posInExp); + return explen + (negative ? NEGATIVE_DECIMAL_HEADER_MASK : POSITIVE_DECIMAL_HEADER_MASK); + } + else + return (int) ((exponent >> (32 - posInExp++ * 8))) & 0xFF; + } + if (current == null) + return END_OF_STREAM; + if (current.compareTo(BigDecimal.ZERO) == 0) + { + current = null; + return 0x00; + } + else + { + BigDecimal v = current.scaleByPowerOfTen(2); + BigDecimal floor = v.setScale(0, BigDecimal.ROUND_FLOOR); + current = v.subtract(floor); + return floor.byteValueExact() + 0x80; + } + } + }; + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
diff --git a/src/java/org/apache/cassandra/db/marshal/DoubleType.java b/src/java/org/apache/cassandra/db/marshal/DoubleType.java index 570d420a75bb..d68bc4ca90a4 100644 --- a/src/java/org/apache/cassandra/db/marshal/DoubleType.java +++ b/src/java/org/apache/cassandra/db/marshal/DoubleType.java @@ -27,6 +27,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; public class DoubleType extends NumberType { @@ -50,6 +52,12 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return compareComposed(left, accessorL, right, accessorR, this); } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version) + { + return ByteSource.optionalSignedFixedLengthFloat(buf); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
diff --git a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java index e0377fd5396b..a293f11b2ce1 100644 --- a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java +++ b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java @@ -19,9 +19,13 @@ import java.nio.ByteBuffer; import java.nio.charset.CharacterCodingException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Maps; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,6 +38,8 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable.Version; +import org.apache.cassandra.utils.ByteSource; import static com.google.common.collect.Iterables.any; @@ -60,6 +66,9 @@ public class DynamicCompositeType extends AbstractCompositeType { private static final Logger logger = LoggerFactory.getLogger(DynamicCompositeType.class); + private static final ByteSource[] EMPTY_BYTE_SOURCE_ARRAY = new ByteSource[0]; + private static final String REVERSED_TYPE = ReversedType.class.getSimpleName(); + private final Map> aliases; // interning instances @@ -196,6 +205,105 @@ protected AbstractType getAndAppendComparator(int i, V value, ValueAccess } } + @Override + public ByteSource asComparableBytes(ByteBuffer byteBuffer, Version version) + { + List srcs = new ArrayList<>(); + ByteBuffer bb = byteBuffer.duplicate(); + + // statics go first + boolean isStatic = readIsStatic(bb, ByteBufferAccessor.instance); + srcs.add(isStatic ? 
null : ByteSource.EMPTY); + bb.position(bb.position() + startingOffset(isStatic)); + + byte lastEoc = 0; + while (bb.remaining() > 0) + { + // Only the end-of-component byte of the last component of this composite can be non-zero, so the + // component before can't have a non-zero end-of-component byte. + assert lastEoc == 0 : lastEoc; + + AbstractType comp = getComparator(bb, ByteBufferAccessor.instance, 0); + bb.position(bb.position() + getComparatorSize(bb, ByteBufferAccessor.instance, 0)); + // The comparable bytes for the component need to ensure comparisons consistent with + // AbstractCompositeType.compareCustom(ByteBuffer, ByteBuffer) and + // DynamicCompositeType.getComparator(int, ByteBuffer, ByteBuffer): + if (version == Version.LEGACY || !(comp instanceof ReversedType)) + { + // ...most often that means just adding the short name of the type, followed by the full name of the type. + srcs.add(ByteSource.of(comp.getClass().getSimpleName(), version)); + srcs.add(ByteSource.of(comp.getClass().getName(), version)); + } + else + { + // ...however some times the component uses a complex type (currently the only supported complex type + // is ReversedType - we can't have elements that are of MapType, CompositeType, TupleType, etc.)... + ReversedType reversedComp = (ReversedType) comp; + // ...in this case, we need to add the short name of ReversedType before the short name of the base + // type, to ensure consistency with DynamicCompositeType.getComparator(int, ByteBuffer, ByteBuffer). + srcs.add(ByteSource.of(REVERSED_TYPE, version)); + srcs.add(ByteSource.of(reversedComp.baseType.getClass().getSimpleName(), version)); + srcs.add(ByteSource.of(reversedComp.baseType.getClass().getName(), version)); + } + // Only then the payload of the component gets encoded. + srcs.add(comp.asComparableBytes(ByteBufferUtil.readBytesWithShortLength(bb), version)); + // The end-of-component byte also takes part in the comparison, and therefore needs to be encoded. 
+ lastEoc = bb.get(); + srcs.add(ByteSource.oneByte(version == Version.LEGACY ? lastEoc : lastEoc & 0xFF ^ 0x80)); + } + + return ByteSource.withTerminator(version == Version.LEGACY ? ByteSource.END_OF_STREAM : ByteSource.TERMINATOR, + srcs.toArray(EMPTY_BYTE_SOURCE_ARRAY)); + } + + public static ByteBuffer build(List types, List values) + { + return build(types, values, (byte) 0); + } + + @VisibleForTesting + public static ByteBuffer build(List types, List values, byte lastEoc) + { + assert types.size() == values.size(); + + int numComponents = types.size(); + // Compute the total number of bytes that we'll need to store the types and their payloads. + int totalLength = 0; + for (int i = 0; i < numComponents; ++i) + { + int typeNameLength = types.get(i).getBytes(StandardCharsets.UTF_8).length; + // The type data will be stored by means of the type's fully qualified name, not by aliasing, so: + // 1. The type data header should be the fully qualified name length in bytes. + // 2. The length should be small enough so that it fits in 15 bits (2 bytes with the first bit zero). + assert typeNameLength <= 0x7FFF; + int valueLength = values.get(i).remaining(); + // The value length should also expect its first bit to be 0, as the length should be stored as a signed + // 2-byte value (short). + assert valueLength <= 0x7FFF; + totalLength += 2 + typeNameLength + 2 + valueLength + 1; + } + + ByteBuffer result = ByteBuffer.allocate(totalLength); + for (int i = 0; i < numComponents; ++i) + { + // Write the type data (2-byte length header + the fully qualified type name in UTF-8). + byte[] typeNameBytes = types.get(i).getBytes(StandardCharsets.UTF_8); + ByteBufferUtil.writeShortLength(result, typeNameBytes.length); + result.put(ByteBuffer.wrap(typeNameBytes)); + + // Write the type payload data (2-byte length header + the payload). 
+ ByteBuffer value = values.get(i); + int bytesToCopy = value.remaining(); + ByteBufferUtil.writeShortLength(result, bytesToCopy); + ByteBufferUtil.arrayCopy(value, value.position(), result, result.position(), bytesToCopy); + result.position(result.position() + bytesToCopy); + + // Write the end-of-component byte. + result.put(i != numComponents - 1 ? (byte) 0 : lastEoc); + } + return result; + } + protected ParsedComparator parseComparator(int i, String part) { return new DynamicParsedComparator(part); diff --git a/src/java/org/apache/cassandra/db/marshal/EmptyType.java b/src/java/org/apache/cassandra/db/marshal/EmptyType.java index 357b6e85ad15..80f8950e7c59 100644 --- a/src/java/org/apache/cassandra/db/marshal/EmptyType.java +++ b/src/java/org/apache/cassandra/db/marshal/EmptyType.java @@ -33,6 +33,8 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.NoSpamLogger; /** @@ -68,6 +70,12 @@ private static NonEmptyWriteBehavior parseNonEmptyWriteBehavior() private EmptyType() {super(ComparisonType.CUSTOM);} // singleton + @Override + public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version) + { + return null; + } + public int compareCustom(VL left, ValueAccessor accessorL, VR right, ValueAccessor accessorR) { return 0; diff --git a/src/java/org/apache/cassandra/db/marshal/FloatType.java b/src/java/org/apache/cassandra/db/marshal/FloatType.java index 35abee0f98ed..8618325f1e45 100644 --- a/src/java/org/apache/cassandra/db/marshal/FloatType.java +++ b/src/java/org/apache/cassandra/db/marshal/FloatType.java @@ -27,6 +27,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import 
org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; public class FloatType extends NumberType @@ -51,6 +53,12 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return compareComposed(left, accessorL, right, accessorR, this); } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version) + { + return ByteSource.optionalSignedFixedLengthFloat(buf); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. diff --git a/src/java/org/apache/cassandra/db/marshal/Int32Type.java b/src/java/org/apache/cassandra/db/marshal/Int32Type.java index 98f4c83cf64c..7c644633270c 100644 --- a/src/java/org/apache/cassandra/db/marshal/Int32Type.java +++ b/src/java/org/apache/cassandra/db/marshal/Int32Type.java @@ -28,6 +28,8 @@ import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; public class Int32Type extends NumberType { @@ -55,6 +57,12 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return ValueAccessor.compare(left, accessorL, right, accessorR); } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version) + { + return ByteSource.optionalSignedFixedLengthNumber(buf); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
diff --git a/src/java/org/apache/cassandra/db/marshal/IntegerType.java b/src/java/org/apache/cassandra/db/marshal/IntegerType.java index 4c913d50afee..fed7e672c268 100644 --- a/src/java/org/apache/cassandra/db/marshal/IntegerType.java +++ b/src/java/org/apache/cassandra/db/marshal/IntegerType.java @@ -30,11 +30,19 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; public final class IntegerType extends NumberType { public static final IntegerType instance = new IntegerType(); + // Constants or escaping values needed to encode/decode variable-length integers in our custom byte-ordered + // encoding scheme. + private static final int POSITIVE_VARINT_HEADER = 0x80; + private static final int NEGATIVE_VARINT_LENGTH_HEADER = 0x00; + private static final int POSITIVE_VARINT_LENGTH_HEADER = 0xFF; + private static int findMostSignificantByte(V value, ValueAccessor accessor) { int len = accessor.size(value) - 1; @@ -131,6 +139,71 @@ public static int compareIntegers(VL lhs, ValueAccessor accessorL, return 0; } + /** + * Constructs a byte-comparable representation of the number. + * We represent it as + * [zero or more length_bytes, each covering 128 bytes of length] [length_byte] [first_significant_byte] [zero or more bytes] + * where a length_byte is: + * - 0x80 + (length - 1) for positive numbers (so that longer length sorts bigger) + * - 0x7F - (length - 1) for negative numbers (so that longer length sorts smaller) + * we don't need to sign-invert the first significant byte as the order there is already determined by the length + * byte. + * + * The representations are prefix-free, because representations of different length always have length bytes that + * differ. 
+ * + * Examples: + * 0 as 8000 + * 1 as 8001 + * 127 as 807F + * 255 as 80FF + * 2^32-1 as 837FFFFFFF + * 2^32 as 8380000000 + * 2^33 as 840100000000 + */ + @Override + public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version) + { + int p = buf.position(); + final int limit = buf.limit(); + if (p == limit) + return null; + + // skip padding + final byte signbyte = buf.get(p); + if (signbyte == (byte) POSITIVE_VARINT_LENGTH_HEADER || signbyte == (byte) NEGATIVE_VARINT_LENGTH_HEADER) + while (p + 1 < limit && buf.get(++p) == signbyte) {} + final int startpos = p; + + return new ByteSource() + { + int pos = startpos; + int sizeToReport = limit - startpos; + boolean sizeReported = false; + + public int next() + { + if (!sizeReported) + { + int v = sizeToReport; + if (v >= 128) + v = 128; + else + sizeReported = true; + + sizeToReport -= v; + return signbyte >= 0 + ? POSITIVE_VARINT_HEADER + (v - 1) + : POSITIVE_VARINT_HEADER - v; + } + if (pos == limit) + return END_OF_STREAM; + + return buf.get(pos++) & 0xFF; + } + }; + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
diff --git a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java index 6dd41616f04d..c0d099dee174 100644 --- a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java +++ b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java @@ -26,6 +26,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.UUIDSerializer; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; public class LexicalUUIDType extends AbstractType { @@ -48,6 +50,32 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return accessorL.toUUID(left).compareTo(accessorR.toUUID(right)); } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version) + { + if (buf == null || buf.remaining() == 0) + return null; + + // fixed-length (hence prefix-free) representation, but + // we have to sign-flip the highest bytes of the two longs + final int bufstart = buf.position(); + return new ByteSource() + { + int bufpos = 0; + + public int next() + { + if (bufpos + bufstart >= buf.limit()) + return END_OF_STREAM; + int v = buf.get(bufpos + bufstart) & 0xFF; + if (bufpos == 0 || bufpos == 8) + v ^= 0x80; + ++bufpos; + return v; + } + }; + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
diff --git a/src/java/org/apache/cassandra/db/marshal/ListType.java b/src/java/org/apache/cassandra/db/marshal/ListType.java index cee3cd2c4c77..ada7bc198d4c 100644 --- a/src/java/org/apache/cassandra/db/marshal/ListType.java +++ b/src/java/org/apache/cassandra/db/marshal/ListType.java @@ -18,7 +18,11 @@ package org.apache.cassandra.db.marshal; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import org.apache.cassandra.cql3.Json; @@ -32,6 +36,8 @@ import org.apache.cassandra.serializers.ListSerializer; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteComparable.Version; +import org.apache.cassandra.utils.ByteSource; public class ListType extends CollectionType> { @@ -195,6 +201,31 @@ static int compareListOrSet(AbstractType elementsComparator, VL left return sizeL == sizeR ? 0 : (sizeL < sizeR ? 
-1 : 1); } + @Override + public ByteSource asComparableBytes(ByteBuffer b, Version version) + { + return asComparableBytesListOrSet(getElementsType(), b, version); + } + + static ByteSource asComparableBytesListOrSet(AbstractType elementsComparator, ByteBuffer b, Version version) + { + if (!b.hasRemaining()) + return null; + + b = b.duplicate(); + int offset = 0; + int size = CollectionSerializer.readCollectionSize(b, ByteBufferAccessor.instance, ProtocolVersion.V3); + offset += CollectionSerializer.sizeOfCollectionSize(size, ProtocolVersion.V3); + ByteSource[] srcs = new ByteSource[size]; + for (int i = 0; i < size; ++i) + { + ByteBuffer v = CollectionSerializer.readValue(b, ByteBufferAccessor.instance, offset, ProtocolVersion.V3); + offset += CollectionSerializer.sizeOfValue(v, ByteBufferAccessor.instance, ProtocolVersion.V3); + srcs[i] = elementsComparator.asComparableBytes(v, version); + } + return ByteSource.withTerminator(version == Version.LEGACY ? 0x00 : ByteSource.TERMINATOR, srcs); + } + @Override public String toString(boolean ignoreFreezing) { diff --git a/src/java/org/apache/cassandra/db/marshal/LongType.java b/src/java/org/apache/cassandra/db/marshal/LongType.java index ad539f70de70..e8db323731af 100644 --- a/src/java/org/apache/cassandra/db/marshal/LongType.java +++ b/src/java/org/apache/cassandra/db/marshal/LongType.java @@ -28,6 +28,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; public class LongType extends NumberType { @@ -57,6 +59,12 @@ public static int compareLongs(VL left, ValueAccessor accessorL, VR return ValueAccessor.compare(left, accessorL, right, accessorR); } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version) + { + return ByteSource.optionalSignedFixedLengthNumber(buf); + } 
+ public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. diff --git a/src/java/org/apache/cassandra/db/marshal/MapType.java b/src/java/org/apache/cassandra/db/marshal/MapType.java index 991ae08048a6..a6e59ad2fcc9 100644 --- a/src/java/org/apache/cassandra/db/marshal/MapType.java +++ b/src/java/org/apache/cassandra/db/marshal/MapType.java @@ -28,9 +28,11 @@ import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.exceptions.SyntaxException; import org.apache.cassandra.serializers.CollectionSerializer; -import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.MapSerializer; +import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteComparable.Version; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.Pair; public class MapType extends CollectionType> @@ -218,6 +220,35 @@ public static int compareMaps(AbstractType keysComparator, AbstractT return sizeL == sizeR ? 0 : (sizeL < sizeR ? 
-1 : 1); } + @Override + public ByteSource asComparableBytes(ByteBuffer b, Version version) + { + return asComparableBytesMap(getKeysType(), getValuesType(), b, version); + } + + static ByteSource asComparableBytesMap(AbstractType keysComparator, AbstractType valuesComparator, ByteBuffer b, Version version) + { + if (!b.hasRemaining()) + return null; + + b = b.duplicate(); + ProtocolVersion protocolVersion = ProtocolVersion.V3; + int offset = 0; + int size = CollectionSerializer.readCollectionSize(b, ByteBufferAccessor.instance, protocolVersion); + offset += CollectionSerializer.sizeOfCollectionSize(size, protocolVersion); + ByteSource[] srcs = new ByteSource[size * 2]; + for (int i = 0; i < size; ++i) + { + ByteBuffer k = CollectionSerializer.readValue(b, ByteBufferAccessor.instance, offset, protocolVersion); + offset += CollectionSerializer.sizeOfValue(k, ByteBufferAccessor.instance, protocolVersion); + srcs[i * 2 + 0] = keysComparator.asComparableBytes(k, version); + ByteBuffer v = CollectionSerializer.readValue(b, ByteBufferAccessor.instance, offset, protocolVersion); + offset += CollectionSerializer.sizeOfValue(v, ByteBufferAccessor.instance, protocolVersion); + srcs[i * 2 + 1] = valuesComparator.asComparableBytes(v, version); + } + return ByteSource.withTerminator(version == Version.LEGACY ? 
0x00 : ByteSource.TERMINATOR, srcs); + } + @Override public MapSerializer getSerializer() { diff --git a/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java b/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java index 89241b416bb4..d72969267ab1 100644 --- a/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java +++ b/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java @@ -27,6 +27,10 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteComparable.Version; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.FBUtilities; /** for sorting columns representing row keys in the row ordering as determined by a partitioner. @@ -93,6 +97,21 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return PartitionPosition.ForKey.get(accessorL.toBuffer(left), partitioner).compareTo(PartitionPosition.ForKey.get(accessorR.toBuffer(right), partitioner)); } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, Version version) + { + if (version != Version.LEGACY) + { + // For ByteComparable.Version.OSS41 and above we encode an empty key with a null byte source. This + // way we avoid the need to special-handle a sentinel value when we decode the byte source for such a key + // (e.g. for ByteComparable.Version.LEGACY we use the minimum key bound of the partitioner's minimum token as + // a sentinel value, and that results in the need to go twice through the byte source that is being + // decoded). + return buf.hasRemaining() ? 
partitioner.decorateKey(buf).asComparableBytes(version) : null; + } + return PartitionPosition.ForKey.get(buf, partitioner).asComparableBytes(version); + } + @Override public void validate(ByteBuffer bytes) throws MarshalException { diff --git a/src/java/org/apache/cassandra/db/marshal/ReversedType.java b/src/java/org/apache/cassandra/db/marshal/ReversedType.java index 8a4b58dca297..4b753f528184 100644 --- a/src/java/org/apache/cassandra/db/marshal/ReversedType.java +++ b/src/java/org/apache/cassandra/db/marshal/ReversedType.java @@ -28,6 +28,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; public class ReversedType extends AbstractType { @@ -63,6 +65,26 @@ public boolean isEmptyValueMeaningless() return baseType.isEmptyValueMeaningless(); } + @Override + public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version) + { + ByteSource src = baseType.asComparableBytes(b, version); + if (src == null) // Note: this will only compare correctly if used within a sequence + return null; + // Invert all bytes. + // The comparison requirements for the original type ensure that this encoding will compare correctly with + // respect to the reversed comparator function (and, specifically, prefixes of escaped byte-ordered types will + // compare as larger). Additionally, the weak prefix-freedom requirement ensures this encoding will also be + // weakly prefix-free. 
+ return () -> + { + int v = src.next(); + if (v == ByteSource.END_OF_STREAM) + return v; + return v ^ 0xFF; + }; + } + public int compareCustom(VL left, ValueAccessor accessorL, VR right, ValueAccessor accessorR) { return baseType.compare(right, accessorR, left, accessorL); @@ -162,4 +184,38 @@ public String toString() { return getClass().getName() + "(" + baseType + ")"; } + + private static final class ReversedPeekableByteSource extends ByteSource.Peekable + { + private final ByteSource.Peekable original; + + static ByteSource.Peekable of(ByteSource.Peekable original) + { + return original != null ? new ReversedPeekableByteSource(original) : null; + } + + private ReversedPeekableByteSource(ByteSource.Peekable original) + { + super(null); + this.original = original; + } + + @Override + public int next() + { + int v = original.next(); + if (v != END_OF_STREAM) + return v ^ 0xFF; + return END_OF_STREAM; + } + + @Override + public int peek() + { + int v = original.peek(); + if (v != END_OF_STREAM) + return v ^ 0xFF; + return END_OF_STREAM; + } + } } diff --git a/src/java/org/apache/cassandra/db/marshal/SetType.java b/src/java/org/apache/cassandra/db/marshal/SetType.java index e5bdadab25f8..cab4d55a2357 100644 --- a/src/java/org/apache/cassandra/db/marshal/SetType.java +++ b/src/java/org/apache/cassandra/db/marshal/SetType.java @@ -30,6 +30,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.SetSerializer; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; public class SetType extends CollectionType> { @@ -157,6 +159,12 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return ListType.compareListOrSet(elements, left, accessorL, right, accessorR); } + @Override + public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version) + { + return 
ListType.asComparableBytesListOrSet(getElementsType(), b, version); + } + public SetSerializer getSerializer() { return serializer; diff --git a/src/java/org/apache/cassandra/db/marshal/ShortType.java b/src/java/org/apache/cassandra/db/marshal/ShortType.java index 03dcf5d31446..83a3e054a23e 100644 --- a/src/java/org/apache/cassandra/db/marshal/ShortType.java +++ b/src/java/org/apache/cassandra/db/marshal/ShortType.java @@ -28,6 +28,8 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; public class ShortType extends NumberType { @@ -46,6 +48,14 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return ValueAccessor.compare(left, accessorL, right, accessorR); } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version) + { + return version == ByteComparable.Version.LEGACY + ? ByteSource.signedFixedLengthNumber(buf) + : ByteSource.optionalSignedFixedLengthNumber(buf); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
diff --git a/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java b/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java index f883ccdc1c54..0f0546af7baa 100644 --- a/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java +++ b/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java @@ -23,12 +23,14 @@ import org.apache.cassandra.cql3.Constants; import org.apache.cassandra.cql3.Duration; import org.apache.cassandra.cql3.Term; -import org.apache.cassandra.cql3.statements.RequestValidations; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.SimpleDateSerializer; import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteComparable.Version; +import org.apache.cassandra.utils.ByteSource; import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest; @@ -38,6 +40,15 @@ public class SimpleDateType extends TemporalType SimpleDateType() {super(ComparisonType.BYTE_ORDER);} // singleton + @Override + public ByteSource asComparableBytes(ByteBuffer buf, Version version) + { + // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient. + return version == Version.LEGACY + ? 
ByteSource.fixedLength(buf) + : ByteSource.optionalFixedLength(buf); + } + public ByteBuffer fromString(String source) throws MarshalException { return ByteBufferUtil.bytes(SimpleDateSerializer.dateStringToDays(source)); diff --git a/src/java/org/apache/cassandra/db/marshal/TimeType.java b/src/java/org/apache/cassandra/db/marshal/TimeType.java index be20ba7a526e..58a2bdb69fa5 100644 --- a/src/java/org/apache/cassandra/db/marshal/TimeType.java +++ b/src/java/org/apache/cassandra/db/marshal/TimeType.java @@ -19,7 +19,6 @@ import java.nio.ByteBuffer; import java.time.LocalTime; -import java.time.ZoneId; import java.time.ZoneOffset; import org.apache.cassandra.cql3.Constants; @@ -29,6 +28,9 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteComparable.Version; +import org.apache.cassandra.utils.ByteSource; /** * Nanosecond resolution time values @@ -43,6 +45,15 @@ public ByteBuffer fromString(String source) throws MarshalException return decompose(TimeSerializer.timeStringToLong(source)); } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, Version version) + { + // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient. + return version == Version.LEGACY + ? 
ByteSource.fixedLength(buf) + : ByteSource.optionalFixedLength(buf); + } + @Override public boolean isValueCompatibleWithInternal(AbstractType otherType) { diff --git a/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java b/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java index 6cf137596ccf..64bee6c430d6 100644 --- a/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java +++ b/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java @@ -21,10 +21,11 @@ import java.util.UUID; import org.apache.cassandra.cql3.CQL3Type; -import org.apache.cassandra.cql3.ColumnSpecification; import org.apache.cassandra.cql3.Constants; import org.apache.cassandra.cql3.Term; import org.apache.cassandra.serializers.TypeSerializer; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.UUIDGen; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.TimeUUIDSerializer; @@ -74,6 +75,22 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return Long.compare(lsb1, lsb2); } + @Override + public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version) + { + if (!b.hasRemaining()) + return null; + + int s = b.position(); + long msb = b.getLong(s); + assert ((msb >>> 12) & 0xf) == 1; + ByteBuffer swizzled = ByteBuffer.allocate(16); + swizzled.putLong(0, TimeUUIDType.reorderTimestampBytes(msb)); + swizzled.putLong(8, b.getLong(s + 8) ^ 0x8080808080808080L); + + return ByteSource.fixedLength(swizzled); + } + // takes as input 8 signed bytes in native machine order // returns the first byte unchanged, and the following 7 bytes converted to an unsigned representation // which is the same as a 2's complement long in native format diff --git a/src/java/org/apache/cassandra/db/marshal/TimestampType.java b/src/java/org/apache/cassandra/db/marshal/TimestampType.java index 0dac6b0394d2..310eafc50354 100644 --- 
a/src/java/org/apache/cassandra/db/marshal/TimestampType.java +++ b/src/java/org/apache/cassandra/db/marshal/TimestampType.java @@ -23,7 +23,6 @@ import org.apache.cassandra.cql3.Constants; import org.apache.cassandra.cql3.Duration; import org.apache.cassandra.cql3.Term; -import org.apache.cassandra.cql3.statements.RequestValidations; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,6 +32,8 @@ import org.apache.cassandra.serializers.TimestampSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest; @@ -61,6 +62,12 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return LongType.compareLongs(left, accessorL, right, accessorR); } + @Override + public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version) + { + return ByteSource.optionalSignedFixedLengthNumber(buf); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
diff --git a/src/java/org/apache/cassandra/db/marshal/TupleType.java b/src/java/org/apache/cassandra/db/marshal/TupleType.java index 83fbb25d548a..59f9786d33f6 100644 --- a/src/java/org/apache/cassandra/db/marshal/TupleType.java +++ b/src/java/org/apache/cassandra/db/marshal/TupleType.java @@ -35,6 +35,8 @@ import org.apache.cassandra.serializers.*; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import static com.google.common.collect.Iterables.any; import static com.google.common.collect.Iterables.transform; @@ -194,12 +196,25 @@ private boolean allRemainingComponentsAreNull(T v, ValueAccessor accessor { int size = accessor.getInt(v, offset); offset += TypeSizes.INT_SIZE; - if (size >= 0) + if (size > 0) return false; } return true; } + @Override + public ByteSource asComparableBytes(ByteBuffer byteBuffer, ByteComparable.Version version) + { + ByteBuffer[] bufs = split(byteBuffer); // this may be shorter than types.size -- other srcs remain null in that case + ByteSource[] srcs = new ByteSource[types.size()]; + for (int i = 0; i < bufs.length; ++i) + srcs[i] = types.get(i).asComparableBytes(bufs[i], version); + // We always have a fixed number of sources, with the trailing ones possibly being nulls. + // This can only result in a prefix if the last type in the tuple allows prefixes. Since that type is required + // to be weakly prefix-free, so is the tuple. + return ByteSource.withTerminator(ByteSource.END_OF_STREAM, srcs); + } + /** * Split a tuple value into its component values. 
*/ diff --git a/src/java/org/apache/cassandra/db/marshal/UUIDType.java b/src/java/org/apache/cassandra/db/marshal/UUIDType.java index 55ce59dae798..1ff728402ae9 100644 --- a/src/java/org/apache/cassandra/db/marshal/UUIDType.java +++ b/src/java/org/apache/cassandra/db/marshal/UUIDType.java @@ -30,6 +30,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.UUIDSerializer; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.UUIDGen; /** @@ -99,6 +101,28 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return UnsignedLongs.compare(accessorL.getLong(left, 8), accessorR.getLong(right, 8)); } + @Override + public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version v) + { + if (!b.hasRemaining()) + return null; + + int s = b.position(); + long msb = b.getLong(s); + long version = ((msb >>> 12) & 0xf); + ByteBuffer swizzled = ByteBuffer.allocate(16); + + if (version == 1) + swizzled.putLong(0, TimeUUIDType.reorderTimestampBytes(msb)); + else + swizzled.putLong(0, (version << 60) | ((msb >>> 4) & 0x0FFFFFFFFFFFF000L) | (msb & 0xFFFL)); + + swizzled.putLong(8, b.getLong(s + 8)); + + // fixed-length thus prefix-free + return ByteSource.fixedLength(swizzled); + } + @Override public boolean isValueCompatibleWithInternal(AbstractType otherType) { diff --git a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java index d0ab6b233653..26d9437f9f5a 100644 --- a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java +++ b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java @@ -296,6 +296,26 @@ public static void copyBytes(ByteBuffer src, int srcPos, ByteBuffer dst, int dst FastByteOperations.copy(src, srcPos, dst, dstPos, length); } + /** + * Transfer bytes from one ByteBuffer to another. 
+ * This function acts as System.arrayCopy() but for ByteBuffers. + * + * @param src the source ByteBuffer + * @param srcPos starting position in the source ByteBuffer + * @param dst the destination ByteBuffer + * @param dstPos starting position in the destination ByteBuffer + * @param length the number of bytes to copy + */ + public static void arrayCopy(ByteBuffer src, int srcPos, ByteBuffer dst, int dstPos, int length) + { + FastByteOperations.copy(src, srcPos, dst, dstPos, length); + } + + public static void arrayCopy(ByteBuffer src, int srcPos, byte[] dst, int dstPos, int length) + { + FastByteOperations.copy(src, srcPos, dst, dstPos, length); + } + public static int put(ByteBuffer src, ByteBuffer trg) { int length = Math.min(src.remaining(), trg.remaining()); From 7640c0494c9a8568c9468b87dfea2e30327b2a65 Mon Sep 17 00:00:00 2001 From: jacek-lewandowski Date: Fri, 6 Nov 2020 16:42:46 +0100 Subject: [PATCH 027/151] CORE-93: Add implementation of asComparableBytes to tokens (cherry picked from commit 73c8be5ca94a5a02aa88f68b0ec03d753b9b89b0) (cherry picked from commit fa557fd092ec459bd2eee28c8d56dac49bd478ce) --- .../cassandra/dht/ByteOrderedPartitioner.java | 8 ++++++++ .../apache/cassandra/dht/LocalPartitioner.java | 8 ++++++++ .../apache/cassandra/dht/Murmur3Partitioner.java | 8 ++++++++ .../dht/OrderPreservingPartitioner.java | 8 ++++++++ .../apache/cassandra/dht/RandomPartitioner.java | 8 ++++++++ src/java/org/apache/cassandra/dht/Token.java | 16 ++++++++++++++++ .../apache/cassandra/dht/KeyCollisionTest.java | 9 +++++++++ 7 files changed, 65 insertions(+) diff --git a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java index a6314dcccc8e..13e2d9c2f44c 100644 --- a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java +++ b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java @@ -26,6 +26,8 @@ import org.apache.cassandra.exceptions.ConfigurationException; import 
org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Hex; import org.apache.cassandra.utils.ObjectSizes; @@ -102,6 +104,12 @@ public boolean equals(Object obj) return Arrays.equals(token, other.token); } + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return ByteSource.of(token, version); + } + @Override public IPartitioner getPartitioner() { diff --git a/src/java/org/apache/cassandra/dht/LocalPartitioner.java b/src/java/org/apache/cassandra/dht/LocalPartitioner.java index 168601ca3ef8..fe9f12de432d 100644 --- a/src/java/org/apache/cassandra/dht/LocalPartitioner.java +++ b/src/java/org/apache/cassandra/dht/LocalPartitioner.java @@ -27,6 +27,8 @@ import org.apache.cassandra.db.CachedHashDecoratedKey; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.memory.HeapAllocator; @@ -174,6 +176,12 @@ public boolean equals(Object obj) return token.equals(other.token); } + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return comparator.asComparableBytes(token, version); + } + @Override public IPartitioner getPartitioner() { diff --git a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java index 2856f131f1ab..94ebb46cbdd7 100644 --- a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java +++ b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java @@ -33,6 +33,8 @@ import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.io.util.DataOutputPlus; import 
org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.MurmurHash; import org.apache.cassandra.utils.ObjectSizes; @@ -176,6 +178,12 @@ public int compareTo(Token o) return Long.compare(token, ((LongToken) o).token); } + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return ByteSource.of(token); + } + @Override public IPartitioner getPartitioner() { diff --git a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java index 16c5db17a448..d248e0c5ee87 100644 --- a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java +++ b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java @@ -33,6 +33,8 @@ import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.Pair; @@ -194,6 +196,12 @@ public long getHeapSize() { return EMPTY_SIZE + ObjectSizes.sizeOf(token); } + + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return ByteSource.of((String) token, version); + } } public StringToken getToken(ByteBuffer key) diff --git a/src/java/org/apache/cassandra/dht/RandomPartitioner.java b/src/java/org/apache/cassandra/dht/RandomPartitioner.java index 241b7850fdf7..eb7eed8f15ad 100644 --- a/src/java/org/apache/cassandra/dht/RandomPartitioner.java +++ b/src/java/org/apache/cassandra/dht/RandomPartitioner.java @@ -34,6 +34,8 @@ import org.apache.cassandra.db.marshal.PartitionerDefinedOrder; import org.apache.cassandra.io.util.DataOutputPlus; import 
org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.GuidGenerator; import org.apache.cassandra.utils.ObjectSizes; @@ -244,6 +246,12 @@ public BigIntegerToken(String token) this(new BigInteger(token)); } + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return IntegerType.instance.asComparableBytes(ByteBuffer.wrap(token.toByteArray()), version); + } + @Override public IPartitioner getPartitioner() { diff --git a/src/java/org/apache/cassandra/dht/Token.java b/src/java/org/apache/cassandra/dht/Token.java index d8e82f82c510..bf89e1979ec8 100644 --- a/src/java/org/apache/cassandra/dht/Token.java +++ b/src/java/org/apache/cassandra/dht/Token.java @@ -26,6 +26,8 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; public abstract class Token implements RingPosition, Serializable { @@ -99,6 +101,20 @@ public long serializedSize(Token object, int version) abstract public long getHeapSize(); abstract public Object getTokenValue(); + /** + * Produce a weakly prefix-free byte-comparable representation of the token, i.e. such a sequence of bytes that for any + * pair x, y of valid tokens of this type and any bytes b1, b2 between 0x10 and 0xEF, + * (+ stands for concatenation) + * compare(x, y) == compareLexicographicallyUnsigned(asComparableBytes(x)+b1, asComparableBytes(y)+b2) + * (i.e. the values compare like the original type, and an added 0x10-0xEF byte at the end does not change that) and: + * asComparableBytes(x)+b1 is not a prefix of asComparableBytes(y) (weakly prefix free) + * (i.e. 
a valid representation of a value may be a prefix of another valid representation of a value only if the + * following byte in the latter is smaller than 0x10 or larger than 0xEF). These properties are trivially true if + * the encoding compares correctly and is prefix free, but the weaker requirement also permits a little more freedom that enables somewhat + * more efficient encoding of arbitrary-length byte-comparable blobs. + */ + abstract public ByteSource asComparableBytes(ByteComparable.Version version); + /** * Returns a measure for the token space covered between this token and next. * Used by the token allocation algorithm (see CASSANDRA-7032). diff --git a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java index 5b5365da099b..2881ab96c9ce 100644 --- a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java +++ b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java @@ -27,6 +27,7 @@ import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.Util; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.marshal.IntegerType; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; @@ -36,6 +37,8 @@ import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.FBUtilities; /** @@ -124,5 +127,11 @@ public long getHeapSize() { return 0; } + + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return IntegerType.instance.asComparableBytes(IntegerType.instance.decompose(token), version); + } } } From 7b67b6c0079408d55a205f89a654be4d847af9bc Mon Sep 17 00:00:00 2001 From: jacek-lewandowski Date: Fri, 6 Nov 2020 16:45:31 +0100 Subject: [PATCH 
028/151] CORE-93: Add implementation of asComparableBytes to partition position (cherry picked from commit d29ec5b1198738e1a966b560acdc170dfef609bb) (cherry picked from commit 59a4b3b46a7c9eb4709c4a2c38198ef386a31173) --- .../org/apache/cassandra/db/DecoratedKey.java | 19 ++++++++++++++++++- .../cassandra/db/PartitionPosition.java | 18 +++++++++++++++++- src/java/org/apache/cassandra/dht/Token.java | 7 +++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/cassandra/db/DecoratedKey.java b/src/java/org/apache/cassandra/db/DecoratedKey.java index 92d641460e10..70ca5d1a6b65 100644 --- a/src/java/org/apache/cassandra/db/DecoratedKey.java +++ b/src/java/org/apache/cassandra/db/DecoratedKey.java @@ -24,8 +24,9 @@ import org.apache.cassandra.dht.Token; import org.apache.cassandra.dht.Token.KeyBound; import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.cassandra.utils.MurmurHash; +import org.apache.cassandra.utils.ByteSource; import org.apache.cassandra.utils.IFilter.FilterKey; +import org.apache.cassandra.utils.MurmurHash; /** * Represents a decorated key, handy for certain operations @@ -97,6 +98,22 @@ public static int compareTo(IPartitioner partitioner, ByteBuffer key, PartitionP return cmp == 0 ? ByteBufferUtil.compareUnsigned(key, otherKey.getKey()) : cmp; } + @Override + public ByteSource asComparableBytes(Version version) + { + // Note: In the legacy version one encoding could be a prefix of another as the escaping is only weakly + // prefix-free (see ByteSourceTest.testDecoratedKeyPrefixes()). + // The OSS41 version avoids this by adding a terminator. + return ByteSource.withTerminator(version == Version.LEGACY ? 
ByteSource.END_OF_STREAM : ByteSource.TERMINATOR, + token.asComparableBytes(version), + keyComparableBytes(version)); + } + + protected ByteSource keyComparableBytes(Version version) + { + return ByteSource.of(getKey(), version); + } + public IPartitioner getPartitioner() { return getToken().getPartitioner(); diff --git a/src/java/org/apache/cassandra/db/PartitionPosition.java b/src/java/org/apache/cassandra/db/PartitionPosition.java index 3b45c6c0e2eb..578b109a835c 100644 --- a/src/java/org/apache/cassandra/db/PartitionPosition.java +++ b/src/java/org/apache/cassandra/db/PartitionPosition.java @@ -24,8 +24,10 @@ import org.apache.cassandra.dht.*; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; -public interface PartitionPosition extends RingPosition +public interface PartitionPosition extends RingPosition, ByteComparable { public static enum Kind { @@ -54,6 +56,20 @@ public static PartitionPosition get(ByteBuffer key, IPartitioner p) public Kind kind(); public boolean isMinimum(); + /** + * Produce a prefix-free byte-comparable representation of the key, i.e. such a sequence of bytes that any pair x, y + * of valid positions (with the same key column types and partitioner), + * x.compareTo(y) == compareLexicographicallyUnsigned(x.asComparableBytes(), y.asComparableBytes()) + * and + * x.asComparableBytes() is not a prefix of y.asComparableBytes() + * + * We use a two-component tuple for decorated keys, and a one-component tuple for key bounds, where the terminator + * byte is chosen to yield the correct comparison result. No decorated key can be a prefix of another (per the tuple + * encoding), and no key bound can be a prefix of one because it uses a terminator byte that is different from the + * tuple separator. 
+ */ + public abstract ByteSource asComparableBytes(Version version); + public static class RowPositionSerializer implements IPartitionerDependentSerializer { /* diff --git a/src/java/org/apache/cassandra/dht/Token.java b/src/java/org/apache/cassandra/dht/Token.java index bf89e1979ec8..5dd3904a2e35 100644 --- a/src/java/org/apache/cassandra/dht/Token.java +++ b/src/java/org/apache/cassandra/dht/Token.java @@ -206,6 +206,13 @@ public int compareTo(PartitionPosition pos) return ((pos instanceof KeyBound) && !((KeyBound)pos).isMinimumBound) ? 0 : 1; } + @Override + public ByteSource asComparableBytes(Version version) + { + int terminator = isMinimumBound ? ByteSource.LT_NEXT_COMPONENT : ByteSource.GT_NEXT_COMPONENT; + return ByteSource.withTerminator(terminator, token.asComparableBytes(version)); + } + public IPartitioner getPartitioner() { return getToken().getPartitioner(); From 1540142bd2414fb830fb3d2dafabebb488326ac7 Mon Sep 17 00:00:00 2001 From: jacek-lewandowski Date: Fri, 6 Nov 2020 16:54:20 +0100 Subject: [PATCH 029/151] CORE-93: Add implementation of asComparableBytes to clustering (cherry picked from commit 4187defa75496f64e0e743b0091431cd261fa87c) (cherry picked from commit 19b701e06145976fbd254095ff5962bafcaf9014) --- .../cassandra/db/ClusteringComparator.java | 85 +++++++++++++++++++ .../apache/cassandra/db/ClusteringPrefix.java | 57 +++++++++++-- 2 files changed, 133 insertions(+), 9 deletions(-) diff --git a/src/java/org/apache/cassandra/db/ClusteringComparator.java b/src/java/org/apache/cassandra/db/ClusteringComparator.java index fdc450813ff2..a23aa36017ae 100644 --- a/src/java/org/apache/cassandra/db/ClusteringComparator.java +++ b/src/java/org/apache/cassandra/db/ClusteringComparator.java @@ -31,6 +31,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.io.sstable.IndexInfo; +import org.apache.cassandra.utils.ByteComparable; +import org.apache.cassandra.utils.ByteSource; /** * A comparator of clustering 
prefixes (or more generally of {@link Clusterable}}. @@ -232,6 +234,89 @@ public void validate(ClusteringPrefix clustering) } } + /** + * Produce a prefix-free byte-comparable representation of the given value, i.e. such a sequence of bytes that any + * pair x, y of valid values of this type + * compare(x, y) == compareLexicographicallyUnsigned(asByteComparable(x), asByteComparable(y)) + * and + * asByteComparable(x) is not a prefix of asByteComparable(y) + */ + public ByteComparable asByteComparable(ClusteringPrefix clustering) + { + return new ByteComparableClustering(clustering); + } + + /** + * A prefix-free byte-comparable representation for a clustering or prefix. + * + * Adds a NEXT_COMPONENT byte before each component (allowing inclusive/exclusive bounds over incomplete prefixes + * of that length) and finishes with a suitable byte for the clustering kind. Also deals with null entries. + * + * Since all types' encodings are weakly prefix-free, this is guaranteed to be prefix-free as long as the + * bound/ClusteringPrefix terminators are different from the separator byte. It is okay for the terminator for + * Clustering to be the same as the separator, as all Clusterings must be completely specified. + * + * See also {@link AbstractType#asComparableBytes}. 
+ * + * Some examples: + * "A", 0005, Clustering -> 40 4100 40 0005 40 + * "B", 0006, InclusiveEnd -> 40 4200 40 0006 60 + * "A", ExclusiveStart -> 40 4100 60 + * "", null, Clustering -> 40 00 3F 40 + * "", 0000, Clustering -> 40 00 40 0000 40 + * BOTTOM -> 20 + */ + private class ByteComparableClustering implements ByteComparable + { + private final ClusteringPrefix src; + + ByteComparableClustering(ClusteringPrefix src) + { + this.src = src; + } + + @Override + public ByteSource asComparableBytes(Version version) + { + return new ByteSource() + { + private ByteSource current = null; + private int srcnum = -1; + + @Override + public int next() + { + if (current != null) + { + int b = current.next(); + if (b > END_OF_STREAM) + return b; + current = null; + } + + int sz = src.size(); + if (srcnum == sz) + return END_OF_STREAM; + + ++srcnum; + if (srcnum == sz) + return src.kind().asByteComparableValue(version); + + current = subtype(srcnum).asComparableBytes(src.accessor().toBuffer(src.get(srcnum)), version); + if (current == null) + return subtype(srcnum).isReversed() ? NEXT_COMPONENT_NULL_REVERSED : NEXT_COMPONENT_NULL; + + return NEXT_COMPONENT; + } + }; + } + + public String toString() + { + return src.clusteringString(subtypes()); + } + } + /** * A comparator for rows. 
* diff --git a/src/java/org/apache/cassandra/db/ClusteringPrefix.java b/src/java/org/apache/cassandra/db/ClusteringPrefix.java index a1291c889f1d..3cf814c6cfc7 100644 --- a/src/java/org/apache/cassandra/db/ClusteringPrefix.java +++ b/src/java/org/apache/cassandra/db/ClusteringPrefix.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.*; +import java.util.function.ToIntFunction; import org.apache.cassandra.cache.IMeasurableMemory; import org.apache.cassandra.config.*; @@ -34,6 +35,8 @@ import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.ByteArrayUtil; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.ByteComparable.Version; +import org.apache.cassandra.utils.ByteSource; /** * A clustering prefix is the unit of what a {@link ClusteringComparator} can compare. @@ -62,14 +65,19 @@ public enum Kind { // WARNING: the ordering of that enum matters because we use ordinal() in the serialization - EXCL_END_BOUND (0, -1), - INCL_START_BOUND (0, -1), - EXCL_END_INCL_START_BOUNDARY(0, -1), - STATIC_CLUSTERING (1, -1), - CLUSTERING (2, 0), - INCL_END_EXCL_START_BOUNDARY(3, 1), - INCL_END_BOUND (3, 1), - EXCL_START_BOUND (3, 1); + EXCL_END_BOUND (0, -1, v -> ByteSource.LT_NEXT_COMPONENT), + INCL_START_BOUND (0, -1, v -> ByteSource.LT_NEXT_COMPONENT), + EXCL_END_INCL_START_BOUNDARY(0, -1, v -> ByteSource.LT_NEXT_COMPONENT), + STATIC_CLUSTERING (1, -1, v -> v == Version.LEGACY + ? ByteSource.LT_NEXT_COMPONENT + 1 + : ByteSource.TERMINATOR - 1), + CLUSTERING (2, 0, v -> v == Version.LEGACY + ? 
ByteSource.NEXT_COMPONENT + : ByteSource.TERMINATOR), + INCL_END_EXCL_START_BOUNDARY(3, 1, v -> ByteSource.GT_NEXT_COMPONENT), + INCL_END_BOUND (3, 1, v -> ByteSource.GT_NEXT_COMPONENT), + EXCL_START_BOUND (3, 1, v -> ByteSource.GT_NEXT_COMPONENT); + private final int comparison; @@ -79,10 +87,13 @@ public enum Kind */ public final int comparedToClustering; - Kind(int comparison, int comparedToClustering) + public final ToIntFunction asByteComparable; + + Kind(int comparison, int comparedToClustering, ToIntFunction asByteComparable) { this.comparison = comparison; this.comparedToClustering = comparedToClustering; + this.asByteComparable = asByteComparable; } /** @@ -197,6 +208,16 @@ public Kind openBoundOfBoundary(boolean reversed) ? (this == INCL_END_EXCL_START_BOUNDARY ? INCL_END_BOUND : EXCL_END_BOUND) : (this == INCL_END_EXCL_START_BOUNDARY ? EXCL_START_BOUND : INCL_START_BOUND); } + + /* + * Returns a terminator value for this clustering type that is suitable for byte comparison. + * Inclusive starts / exclusive ends need a lower value than ByteSource.NEXT_COMPONENT and the clustering byte, + * exclusive starts / inclusive ends -- a higher. + */ + public int asByteComparableValue(Version version) + { + return asByteComparable.applyAsInt(version); + } } default boolean isBottom() @@ -308,6 +329,24 @@ default ByteBuffer serializeAsPartitionKey() values[i] = accessor().toBuffer(get(i)); return CompositeType.build(ByteBufferAccessor.instance, values); } + + /** + * Produce a human-readable representation of the clustering given the list of types. + * Easier to access than metadata for debugging. + */ + public default String clusteringString(List> types) + { + StringBuilder sb = new StringBuilder(); + sb.append(kind()).append('('); + for (int i = 0; i < size(); i++) + { + if (i > 0) + sb.append(", "); + sb.append(types.get(i).getString(get(i), accessor())); + } + return sb.append(')').toString(); + } + /** * The values of this prefix as an array. *

    From 49c3f29ad4e6b402c77903e62136cb6345c51daa Mon Sep 17 00:00:00 2001 From: jacek-lewandowski Date: Fri, 6 Nov 2020 16:55:26 +0100 Subject: [PATCH 030/151] CORE-93: Add some tests (cherry picked from commit 9f6983dd580f0ab346aa9df13d69b38c9ecccec8) (cherry picked from commit 819733f4304e16a69ffe85bc2f99c48b9244b628) --- test/unit/org/apache/cassandra/Util.java | 195 +++- .../cassandra/utils/ByteSourceTest.java | 1026 +++++++++++++++++ .../73-819733f430 CORE-93: Add some tests | 19 + 3 files changed, 1239 insertions(+), 1 deletion(-) create mode 100644 test/unit/org/apache/cassandra/utils/ByteSourceTest.java create mode 100644 update-history/STAR-801/73-819733f430 CORE-93: Add some tests diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java index cd4e4f442f77..c26b339dc5b7 100644 --- a/test/unit/org/apache/cassandra/Util.java +++ b/test/unit/org/apache/cassandra/Util.java @@ -21,6 +21,7 @@ import java.io.*; import java.lang.reflect.Field; +import java.math.BigInteger; import java.net.UnknownHostException; import java.nio.ByteBuffer; import java.nio.file.*; @@ -86,7 +87,6 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; public class Util @@ -784,6 +784,199 @@ public static void assertOnDiskState(ColumnFamilyStore cfs, int expectedSSTableC assertEquals(expectedSSTableCount, fileCount); } + public static ByteBuffer generateMurmurCollision(ByteBuffer original, byte... 
bytesToAdd) + { + // Round size up to 16, and add another 16 bytes + ByteBuffer collision = ByteBuffer.allocate((original.remaining() + bytesToAdd.length + 31) & -16); + collision.put(original); // we can use this as a copy of original with 0s appended at the end + + original.flip(); + + long c1 = 0x87c37b91114253d5L; + long c2 = 0x4cf5ad432745937fL; + + long h1 = 0; + long h2 = 0; + + // Get hash of original + int index = 0; + final int length = original.limit(); + while (index <= length - 16) + { + long k1 = Long.reverseBytes(collision.getLong(index + 0)); + long k2 = Long.reverseBytes(collision.getLong(index + 8)); + + // 16 bytes + k1 *= c1; + k1 = rotl64(k1, 31); + k1 *= c2; + h1 ^= k1; + h1 = rotl64(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + k2 *= c2; + k2 = rotl64(k2, 33); + k2 *= c1; + h2 ^= k2; + h2 = rotl64(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + + index += 16; + } + + long oh1 = h1; + long oh2 = h2; + + // Process final unfilled chunk, but only adjust the original hash value + if (index < length) + { + long k1 = Long.reverseBytes(collision.getLong(index + 0)); + long k2 = Long.reverseBytes(collision.getLong(index + 8)); + + // 16 bytes + k1 *= c1; + k1 = rotl64(k1, 31); + k1 *= c2; + oh1 ^= k1; + + k2 *= c2; + k2 = rotl64(k2, 33); + k2 *= c1; + oh2 ^= k2; + } + + // These are the hashes the original would provide, before final mixing + oh1 ^= original.capacity(); + oh2 ^= original.capacity(); + + // Fill in the remaining bytes before the last 16 and get their hash + collision.put(bytesToAdd); + while ((collision.position() & 0x0f) != 0) + collision.put((byte) 0); + + while (index < collision.position()) + { + long k1 = Long.reverseBytes(collision.getLong(index + 0)); + long k2 = Long.reverseBytes(collision.getLong(index + 8)); + + // 16 bytes + k1 *= c1; + k1 = rotl64(k1, 31); + k1 *= c2; + h1 ^= k1; + h1 = rotl64(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + k2 *= c2; + k2 = rotl64(k2, 33); + k2 *= c1; + h2 ^= k2; + h2 = 
rotl64(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + + index += 16; + } + + // Working backwards, we must get this hash pair + long th1 = h1; + long th2 = h2; + + // adjust ohx with length + h1 = oh1 ^ collision.capacity(); + h2 = oh2 ^ collision.capacity(); + + // Get modulo-long inverses of the multipliers used in the computation + long i5i = inverse(5L); + long c1i = inverse(c1); + long c2i = inverse(c2); + + // revert one step + h2 -= 0x38495ab5; + h2 *= i5i; + h2 -= h1; + h2 = rotl64(h2, 33); + + h1 -= 0x52dce729; + h1 *= i5i; + h1 -= th2; // use h2 before it's adjusted with k2 + h1 = rotl64(h1, 37); + + // extract the required modifiers and apply the inverse of their transformation + long k1 = h1 ^ th1; + k1 = c2i * k1; + k1 = rotl64(k1, 33); + k1 = c1i * k1; + + long k2 = h2 ^ th2; + k2 = c1i * k2; + k2 = rotl64(k2, 31); + k2 = c2i * k2; + + collision.putLong(Long.reverseBytes(k1)); + collision.putLong(Long.reverseBytes(k2)); + collision.flip(); + + return collision; + } + + // Assumes a and b are positive + private static BigInteger[] xgcd(BigInteger a, BigInteger b) { + BigInteger x = a, y = b; + BigInteger[] qrem; + BigInteger[] result = new BigInteger[3]; + BigInteger x0 = BigInteger.ONE, x1 = BigInteger.ZERO; + BigInteger y0 = BigInteger.ZERO, y1 = BigInteger.ONE; + while (true) + { + qrem = x.divideAndRemainder(y); + x = qrem[1]; + x0 = x0.subtract(y0.multiply(qrem[0])); + x1 = x1.subtract(y1.multiply(qrem[0])); + if (x.equals(BigInteger.ZERO)) + { + result[0] = y; + result[1] = y0; + result[2] = y1; + return result; + } + + qrem = y.divideAndRemainder(x); + y = qrem[1]; + y0 = y0.subtract(x0.multiply(qrem[0])); + y1 = y1.subtract(x1.multiply(qrem[0])); + if (y.equals(BigInteger.ZERO)) + { + result[0] = x; + result[1] = x0; + result[2] = x1; + return result; + } + } + } + + /** + * Find a multiplicative inverse for the given multiplier for long, i.e. + * such that x * inverse(x) = 1 where * is long multiplication.
+ * In other words, such an integer that x * inverse(x) == 1 (mod 2^64). + */ + public static long inverse(long multiplier) + { + final BigInteger modulus = BigInteger.ONE.shiftLeft(64); + // Add the modulus to the multiplier to avoid problems with negatives (a + m == a (mod m)) + BigInteger[] gcds = xgcd(BigInteger.valueOf(multiplier).add(modulus), modulus); + // xgcd gives g, a and b, such that ax + bm = g + // ie, ax = g (mod m). Return a + assert gcds[0].equals(BigInteger.ONE) : "Even number " + multiplier + " has no long inverse"; + return gcds[1].longValueExact(); + } + + public static long rotl64(long v, int n) + { + return ((v << n) | (v >>> (64 - n))); + } + /** * Disable bloom filter on all sstables of given table */ diff --git a/test/unit/org/apache/cassandra/utils/ByteSourceTest.java b/test/unit/org/apache/cassandra/utils/ByteSourceTest.java new file mode 100644 index 000000000000..fd1188f27858 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/ByteSourceTest.java @@ -0,0 +1,1026 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.utils; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.net.InetAddress; +import java.net.UnknownHostException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.time.Instant; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.UUID; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Supplier; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.Util; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.ClusteringPrefix; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.AsciiType; +import org.apache.cassandra.db.marshal.BooleanType; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.db.marshal.ByteType; +import org.apache.cassandra.db.marshal.BytesType; +import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.CompositeType; +import org.apache.cassandra.db.marshal.DateType; +import org.apache.cassandra.db.marshal.DecimalType; +import org.apache.cassandra.db.marshal.DoubleType; +import org.apache.cassandra.db.marshal.DynamicCompositeType; +import org.apache.cassandra.db.marshal.DynamicCompositeTypeTest; +import 
org.apache.cassandra.db.marshal.EmptyType; +import org.apache.cassandra.db.marshal.FloatType; +import org.apache.cassandra.db.marshal.InetAddressType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.IntegerType; +import org.apache.cassandra.db.marshal.LexicalUUIDType; +import org.apache.cassandra.db.marshal.ListType; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.marshal.MapType; +import org.apache.cassandra.db.marshal.PartitionerDefinedOrder; +import org.apache.cassandra.db.marshal.ReversedType; +import org.apache.cassandra.db.marshal.SetType; +import org.apache.cassandra.db.marshal.ShortType; +import org.apache.cassandra.db.marshal.SimpleDateType; +import org.apache.cassandra.db.marshal.TimeType; +import org.apache.cassandra.db.marshal.TimeUUIDType; +import org.apache.cassandra.db.marshal.TimestampType; +import org.apache.cassandra.db.marshal.TupleType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.db.marshal.UUIDType; +import org.apache.cassandra.dht.ByteOrderedPartitioner; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.RandomPartitioner; +import org.apache.cassandra.utils.ByteComparable.Version; + +import static org.junit.Assert.assertEquals; + +public class ByteSourceTest +{ + private final static Logger logger = LoggerFactory.getLogger(ByteSourceTest.class); + + @Rule + public final ExpectedException expectedException = ExpectedException.none(); + + String[] testStrings = new String[] { "", "\0", "\0\0", "\001", "A\0\0B", "A\0B\0", "0", "0\0", "00", "1", "\377" }; + Integer[] testInts = new Integer[] { null, Integer.MIN_VALUE, Integer.MIN_VALUE + 1, -256, -255, -128, -127, -1, 0, 1, 127, 128, 255, 256, Integer.MAX_VALUE - 1, Integer.MAX_VALUE }; + Byte[] testBytes = new Byte[] { -128, -127, -1, 0, 1, 127 }; + 
Short[] testShorts = new Short[] { Short.MIN_VALUE, Short.MIN_VALUE + 1, -256, -255, -128, -127, -1, 0, 1, 127, 128, 255, 256, Short.MAX_VALUE - 1, Short.MAX_VALUE }; + Long[] testLongs = new Long[] { null, Long.MIN_VALUE, Long.MIN_VALUE + 1, Integer.MIN_VALUE - 1L, -256L, -255L, -128L, -127L, -1L, 0L, 1L, 127L, 128L, 255L, 256L, Integer.MAX_VALUE + 1L, Long.MAX_VALUE - 1, Long.MAX_VALUE }; + Double[] testDoubles = new Double[] { null, Double.NEGATIVE_INFINITY, -Double.MAX_VALUE, -1e+200, -1e3, -1e0, -1e-3, -1e-200, -Double.MIN_VALUE, -0.0, 0.0, Double.MIN_VALUE, 1e-200, 1e-3, 1e0, 1e3, 1e+200, Double.MAX_VALUE, Double.POSITIVE_INFINITY, Double.NaN }; + Float[] testFloats = new Float[] { null, Float.NEGATIVE_INFINITY, -Float.MAX_VALUE, -1e+30f, -1e3f, -1e0f, -1e-3f, -1e-30f, -Float.MIN_VALUE, -0.0f, 0.0f, Float.MIN_VALUE, 1e-30f, 1e-3f, 1e0f, 1e3f, 1e+30f, Float.MAX_VALUE, Float.POSITIVE_INFINITY, Float.NaN }; + Boolean[] testBools = new Boolean[] { null, false, true }; + UUID[] testUUIDs = new UUID[] { null, UUIDGen.getTimeUUID(), UUID.randomUUID(), UUID.randomUUID(), UUID.randomUUID(), + UUIDGen.getTimeUUID(123, 234), UUIDGen.getTimeUUID(123, 234), UUIDGen.getTimeUUID(123), + UUID.fromString("6ba7b811-9dad-11d1-80b4-00c04fd430c8"), + UUID.fromString("6ba7b810-9dad-11d1-80b4-00c04fd430c8"), + UUID.fromString("e902893a-9d22-3c7e-a7b8-d6e313b71d9f"), + UUID.fromString("74738ff5-5367-5958-9aee-98fffdcd1876"), + UUID.fromString("52df1bb0-6a2f-11e6-b6e4-a6dea7a01b67"), + UUID.fromString("52df1bb0-6a2f-11e6-362d-aff2143498ea"), + UUID.fromString("52df1bb0-6a2f-11e6-b62d-aff2143498ea")}; + // Instant.MIN/MAX fail Date.from. 
+ Date[] testDates = new Date[] { null, + Date.from(Instant.ofEpochSecond(Integer.MIN_VALUE)), + Date.from(Instant.ofEpochSecond(Short.MIN_VALUE)), + Date.from(Instant.ofEpochMilli(-2000)), + Date.from(Instant.EPOCH), + Date.from(Instant.ofEpochMilli(2000)), + Date.from(Instant.ofEpochSecond(Integer.MAX_VALUE)), + Date.from(Instant.now()) }; + BigInteger[] testBigInts; + + { + Set bigs = new TreeSet<>(); + for (Long l : testLongs) + if (l != null) + bigs.add(BigInteger.valueOf(l)); + for (int i = 0; i < 11; ++i) + { + bigs.add(BigInteger.valueOf(i)); + bigs.add(BigInteger.valueOf(-i)); + + bigs.add(BigInteger.valueOf((1L << 4 * i) - 1)); + bigs.add(BigInteger.valueOf((1L << 4 * i))); + bigs.add(BigInteger.valueOf(-(1L << 4 * i) - 1)); + bigs.add(BigInteger.valueOf(-(1L << 4 * i))); + String p = exp10(i); + bigs.add(new BigInteger(p)); + bigs.add(new BigInteger("-" + p)); + p = exp10(1 << i); + bigs.add(new BigInteger(p)); + bigs.add(new BigInteger("-" + p)); + + BigInteger base = BigInteger.ONE.shiftLeft(512 * i); + bigs.add(base); + bigs.add(base.add(BigInteger.ONE)); + bigs.add(base.subtract(BigInteger.ONE)); + base = base.negate(); + bigs.add(base); + bigs.add(base.add(BigInteger.ONE)); + bigs.add(base.subtract(BigInteger.ONE)); + } + testBigInts = bigs.toArray(new BigInteger[0]); + } + BigDecimal[] testBigDecimals; + { + String vals = "0, 1, 1.1, 21, 98.9, 99, 99.9, 100, 100.1, 101, 331, 0.4, 0.07, 0.0700, 0.005, " + + "6e4, 7e200, 6e-300, 8.1e2000, 8.1e-2000, 9e2000, " + + "123456789012.34567890e-1000, 123456.78901234, 1234.56789012e2, " + + "1.0000, 0.01e2, 100e-2, 00, 0.000, 0E-18, 0E+18"; + List decs = new ArrayList<>(); + for (String s : vals.split(", ")) + { + decs.add(new BigDecimal(s)); + decs.add(new BigDecimal("-" + s)); + } + testBigDecimals = decs.toArray(new BigDecimal[0]); + } + + static String exp10(int pow) + { + StringBuilder builder = new StringBuilder(); + builder.append('1'); + for (int i=0; i ByteSource.of(x), Integer::compare, testInts); + 
} + + @Test + public void randomTestInts() + { + Random rand = new Random(); + for (int i=0; i<10000; ++i) + { + int i1 = rand.nextInt(); + int i2 = rand.nextInt(); + assertComparesSame(Int32Type.instance, i1, i2); + } + + } + + @Test + public void testLongs() + { + testType(LongType.instance, testLongs); + testDirect(x -> ByteSource.of(x), Long::compare, testLongs); + } + + @Test + public void testShorts() + { + testType(ShortType.instance, testShorts); + } + + @Test + public void testBytes() + { + testType(ByteType.instance, testBytes); + } + + @Test + public void testDoubles() + { + testType(DoubleType.instance, testDoubles); + } + + @Test + public void testFloats() + { + testType(FloatType.instance, testFloats); + } + + @Test + public void testBigInts() + { + testType(IntegerType.instance, testBigInts); + } + + @Test + public void testBigDecimals() + { + testType(DecimalType.instance, testBigDecimals); + } + + @Test + public void testBigDecimalInCombination() + { + BigDecimal b1 = new BigDecimal("123456.78901201"); + BigDecimal b2 = new BigDecimal("123456.789012"); + Boolean b = false; + + assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b); + assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2); + + b1 = b1.negate(); + b2 = b2.negate(); + + assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b); + assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2); + + b1 = new BigDecimal("-123456.78901289"); + b2 = new BigDecimal("-123456.789012"); + + assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b); + assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2); + + b1 = new BigDecimal("1"); + b2 = new BigDecimal("1.1"); + + assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b); + 
assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2); + + b1 = b1.negate(); + b2 = b2.negate(); + + assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b); + assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2); + } + + @Test + public void testUUIDs() + { + testType(UUIDType.instance, testUUIDs); + } + + @Test + public void testTimeUUIDs() + { + testType(TimeUUIDType.instance, Arrays.stream(testUUIDs).filter(x -> x == null || x.version() == 1).toArray()); + } + + @Test + public void testLexicalUUIDs() + { + testType(LexicalUUIDType.instance, testUUIDs); + } + + @Test + public void testSimpleDate() + { + testType(SimpleDateType.instance, Arrays.stream(testInts).filter(x -> x != null).toArray()); + } + + @Test + public void testTimeType() + { + testType(TimeType.instance, Arrays.stream(testLongs).filter(x -> x != null && x >= 0 && x <= 24L * 60 * 60 * 1000 * 1000 * 1000).toArray()); + } + + @SuppressWarnings("deprecation") + @Test + public void testDateType() + { + testType(DateType.instance, testDates); + } + + @Test + public void testTimestampType() + { + testType(TimestampType.instance, testDates); + } + + @Test + public void testBytesType() + { + List values = new ArrayList<>(); + for (int i = 0; i < testValues.length; ++i) + for (Object o : testValues[i]) + values.add(testTypes[i].decompose(o)); + + testType(BytesType.instance, values.toArray()); + } + + @Test + public void testInetAddressType() throws UnknownHostException + { + InetAddress[] testInets = new InetAddress[] { null, + InetAddress.getLocalHost(), + InetAddress.getLoopbackAddress(), + InetAddress.getByName("192.168.0.1"), + InetAddress.getByName("fe80::428d:5cff:fe53:1dc9"), + InetAddress.getByName("2001:610:3:200a:192:87:36:2"), + InetAddress.getByName("10.0.0.1"), + InetAddress.getByName("0a00:0001::"), + InetAddress.getByName("::10.0.0.1") }; + testType(InetAddressType.instance, 
testInets); + } + + @Test + public void testEmptyType() + { + testType(EmptyType.instance, new Void[] { null }); + } + + @Test + public void testPatitionerDefinedOrder() + { + List values = new ArrayList<>(); + for (int i = 0; i < testValues.length; ++i) + for (Object o : testValues[i]) + values.add(testTypes[i].decompose(o)); + + testBuffers(new PartitionerDefinedOrder(Murmur3Partitioner.instance), values); + testBuffers(new PartitionerDefinedOrder(RandomPartitioner.instance), values); + testBuffers(new PartitionerDefinedOrder(ByteOrderedPartitioner.instance), values); + } + + @Test + public void testPatitionerOrder() + { + List values = new ArrayList<>(); + for (int i = 0; i < testValues.length; ++i) + for (Object o : testValues[i]) + values.add(testTypes[i].decompose(o)); + + testDecoratedKeys(Murmur3Partitioner.instance, values); + testDecoratedKeys(RandomPartitioner.instance, values); + testDecoratedKeys(ByteOrderedPartitioner.instance, values); + } + + @Test + public void testLocalPatitionerOrder() + { + for (int i = 0; i < testValues.length; ++i) + { + final AbstractType testType = testTypes[i]; + testDecoratedKeys(new LocalPartitioner(testType), Lists.transform(Arrays.asList(testValues[i]), + v -> testType.decompose(v))); + } + } + + ClusteringPrefix.Kind[] kinds = new ClusteringPrefix.Kind[] { + ClusteringPrefix.Kind.INCL_START_BOUND, + ClusteringPrefix.Kind.CLUSTERING, + ClusteringPrefix.Kind.EXCL_START_BOUND, + }; + + interface PairTester + { + void test(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4); + } + + void testCombinationSampling(Random rand, PairTester tester) + { + for (int i=0;i c = ByteBufferAccessor.instance.factory().bound(k1, b); + ClusteringPrefix e = ByteBufferAccessor.instance.factory().bound(k2, d); + final ByteComparable bsc = comp.asByteComparable(c); + final ByteComparable bse = comp.asByteComparable(e); + int expected = Integer.signum(comp.compare(c, e)); + assertEquals(String.format("Failed comparing 
%s and %s, %s vs %s version %s", + safeStr(c.clusteringString(comp.subtypes())), + safeStr(e.clusteringString(comp.subtypes())), bsc, bse, v), + expected, Integer.signum(ByteComparable.compare(bsc, bse, v))); + maybeCheck41Properties(expected, bsc, bse, v); + maybeAssertNotPrefix(bsc, bse, v); + + ClusteringComparator compR = new ClusteringComparator(ReversedType.getInstance(t1), ReversedType.getInstance(t2)); + final ByteComparable bsrc = compR.asByteComparable(c); + final ByteComparable bsre = compR.asByteComparable(e); + int expectedR = Integer.signum(compR.compare(c, e)); + assertEquals(String.format("Failed comparing reversed %s and %s, %s vs %s version %s", + safeStr(c.clusteringString(comp.subtypes())), + safeStr(e.clusteringString(comp.subtypes())), bsrc, bsre, v), + expectedR, Integer.signum(ByteComparable.compare(bsrc, bsre, v))); + maybeCheck41Properties(expectedR, bsrc, bsre, v); + maybeAssertNotPrefix(bsrc, bsre, v); + } + } + + @Test + public void testTupleType() + { + Random rand = ThreadLocalRandom.current(); + testCombinationSampling(rand, this::assertTupleComparesSame); + } + + @Test + public void testTupleTypeNonFull() + { + TupleType tt = new TupleType(ImmutableList.of(AsciiType.instance, Int32Type.instance)); + List tests = ImmutableList.of + ( + TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(AsciiType.instance, ""), + decomposeAndRandomPad(Int32Type.instance, 0)}), + TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(AsciiType.instance, ""), + decomposeAndRandomPad(Int32Type.instance, null)}), + TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(AsciiType.instance, "")}), + TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[0]) + ); + testBuffers(tt, tests); + } + + void assertTupleComparesSame(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4) + { + TupleType tt = new 
TupleType(ImmutableList.of(t1, t2)); + ByteBuffer b1 = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {t1.decompose(o1), t2.decompose(o2)}); + ByteBuffer b2 = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {t1.decompose(o3), t2.decompose(o4)}); + assertComparesSame(tt, b1, b2); + } + + @Test + public void testCompositeType() + { + Random rand = new Random(0); + testCombinationSampling(rand, this::assertCompositeComparesSame); + } + + @Test + public void testCompositeTypeNonFull() + { + CompositeType tt = CompositeType.getInstance(AsciiType.instance, Int32Type.instance); + List tests = ImmutableList.of + ( + CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(AsciiType.instance, ""), decomposeAndRandomPad(Int32Type.instance, 0)), + CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(AsciiType.instance, ""), decomposeAndRandomPad(Int32Type.instance, null)), + CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(AsciiType.instance, "")), + CompositeType.build(ByteBufferAccessor.instance), + CompositeType.build(ByteBufferAccessor.instance, true, decomposeAndRandomPad(AsciiType.instance, "")), + CompositeType.build(ByteBufferAccessor.instance,true) + ); + for (ByteBuffer b : tests) + tt.validate(b); + testBuffers(tt, tests); + } + + void assertCompositeComparesSame(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4) + { + CompositeType tt = CompositeType.getInstance(t1, t2); + ByteBuffer b1 = CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(t1, o1), decomposeAndRandomPad(t2, o2)); + ByteBuffer b2 = CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(t1, o3), decomposeAndRandomPad(t2, o4)); + assertComparesSame(tt, b1, b2); + } + + @Test + public void testDynamicComposite() + { + DynamicCompositeType tt = DynamicCompositeType.getInstance(DynamicCompositeTypeTest.aliases); + UUID[] uuids = 
DynamicCompositeTypeTest.uuids; + List tests = ImmutableList.of + ( + DynamicCompositeTypeTest.createDynamicCompositeKey("test1", null, -1, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test1", uuids[0], 24, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test1", uuids[0], 42, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test2", uuids[0], -1, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test2", uuids[1], 42, false, true) + ); + for (ByteBuffer b : tests) + tt.validate(b); + testBuffers(tt, tests); + } + + @Test + public void testListTypeString() + { + testCollection(ListType.getInstance(AsciiType.instance, true), testStrings, () -> new ArrayList<>(), new Random()); + } + + @Test + public void testListTypeLong() + { + testCollection(ListType.getInstance(LongType.instance, true), testLongs, () -> new ArrayList<>(), new Random()); + } + + @Test + public void testSetTypeString() + { + testCollection(SetType.getInstance(AsciiType.instance, true), testStrings, () -> new HashSet<>(), new Random()); + } + + @Test + public void testSetTypeLong() + { + testCollection(SetType.getInstance(LongType.instance, true), testLongs, () -> new HashSet<>(), new Random()); + } + + > void testCollection(CollectionType tt, T[] values, Supplier gen, Random rand) + { + int cnt = 0; + List tests = new ArrayList<>(); + tests.add(gen.get()); + for (int c = 1; c <= 3; ++c) + for (int j = 0; j < 5; ++j) + { + CT l = gen.get(); + for (int i = 0; i < c; ++i) + l.add(values[cnt++ % values.length]); + + tests.add(l); + } + testType(tt, tests.toArray()); + } + + @Test + public void testMapTypeStringLong() + { + testMap(MapType.getInstance(AsciiType.instance, LongType.instance, true), testStrings, testLongs, () -> new HashMap<>(), new Random()); + } + + @Test + public void testMapTypeStringLongTree() + { + testMap(MapType.getInstance(AsciiType.instance, LongType.instance, true), testStrings, testLongs, () -> new 
TreeMap<>(), new Random()); + } + + @Test + public void testDecoratedKeyPrefixesVOSS41() + { + // This should pass with the OSS 4.1 encoding + testDecoratedKeyPrefixes(Version.OSS41); + } + + @Test + public void testDecoratedKeyPrefixesVLegacy() + { + // ... and fail with the legacy encoding + try + { + testDecoratedKeyPrefixes(Version.LEGACY); + } + catch (AssertionError e) + { + // Correct path, test failing. + return; + } + Assert.fail("Test expected to fail."); + } + + @Test + public void testFixedLengthWithOffset() + { + byte[] bytes = new byte[]{ 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + + ByteSource source = ByteSource.fixedLength(bytes, 0, 1); + assertEquals(1, source.next()); + assertEquals(ByteSource.END_OF_STREAM, source.next()); + + source = ByteSource.fixedLength(bytes, 4, 5); + assertEquals(5, source.next()); + assertEquals(6, source.next()); + assertEquals(7, source.next()); + assertEquals(8, source.next()); + assertEquals(9, source.next()); + assertEquals(ByteSource.END_OF_STREAM, source.next()); + + ByteSource.fixedLength(bytes, 9, 0); + assertEquals(ByteSource.END_OF_STREAM, source.next()); + } + + @Test + public void testFixedLengthNegativeLength() + { + byte[] bytes = new byte[]{ 1, 2, 3 }; + + expectedException.expect(IllegalArgumentException.class); + ByteSource.fixedLength(bytes, 0, -1); + } + + @Test + public void testFixedLengthNegativeOffset() + { + byte[] bytes = new byte[]{ 1, 2, 3 }; + + expectedException.expect(IllegalArgumentException.class); + ByteSource.fixedLength(bytes, -1, 1); + } + + @Test + public void testFixedLengthOutOfBounds() + { + byte[] bytes = new byte[]{ 1, 2, 3 }; + + expectedException.expect(IllegalArgumentException.class); + ByteSource.fixedLength(bytes, 0, 4); + } + + @Test + public void testFixedOffsetOutOfBounds() + { + byte[] bytes = new byte[]{ 1, 2, 3 }; + + expectedException.expect(IllegalArgumentException.class); + ByteSource.fixedLength(bytes, 4, 1); + } + + public void testDecoratedKeyPrefixes(Version version) + { + 
testDecoratedKeyPrefixes("012345678BCDE\0", "", version); + testDecoratedKeyPrefixes("012345678ABCDE\0", "ABC", version); + testDecoratedKeyPrefixes("0123456789ABCDE\0", "\0AB", version); + testDecoratedKeyPrefixes("0123456789ABCDEF\0", "\0", version); + + testDecoratedKeyPrefixes("0123456789ABCDEF0", "ABC", version); + testDecoratedKeyPrefixes("0123456789ABCDEF", "", version); + testDecoratedKeyPrefixes("0123456789ABCDE", "", version); + testDecoratedKeyPrefixes("0123456789ABCD", "\0AB", version); + testDecoratedKeyPrefixes("0123456789ABC", "\0", version); + + } + + public void testDecoratedKeyPrefixes(String key, String append, Version version) + { + logger.info("Testing {} + {}", safeStr(key), safeStr(append)); + IPartitioner partitioner = Murmur3Partitioner.instance; + ByteBuffer original = ByteBufferUtil.bytes(key); + ByteBuffer collision = Util.generateMurmurCollision(original, append.getBytes(StandardCharsets.UTF_8)); + + long[] hash = new long[2]; + MurmurHash.hash3_x64_128(original, 0, original.limit(), 0, hash); + logger.info(String.format("Original hash %016x,%016x", hash[0], hash[1])); + MurmurHash.hash3_x64_128(collision, 0, collision.limit(), 0, hash); + logger.info(String.format("Collision hash %016x,%016x", hash[0], hash[1])); + + DecoratedKey kk1 = partitioner.decorateKey(original); + DecoratedKey kk2 = partitioner.decorateKey(collision); + logger.info("{}\n{}\n{}\n{}", kk1, kk2, kk1.byteComparableAsString(version), kk2.byteComparableAsString(version)); + + final ByteSource s1 = kk1.asComparableBytes(version); + final ByteSource s2 = kk2.asComparableBytes(version); + logger.info("{}\n{}", s1, s2); + + // Check that the representations compare correctly + Assert.assertEquals(Long.signum(kk1.compareTo(kk2)), ByteComparable.compare(kk1, kk2, version)); + // s1 must not be a prefix of s2 + assertNotPrefix(s1, s2); + } + + private void assertNotPrefix(ByteSource s1, ByteSource s2) + { + int c1, c2; + do + { + c1 = s1.next(); + c2 = s2.next(); + } + 
while (c1 == c2 && c1 != ByteSource.END_OF_STREAM); + + // Equal is ok + if (c1 == c2) + return; + + Assert.assertNotEquals("ByteComparable is a prefix of other", ByteSource.END_OF_STREAM, c1); + Assert.assertNotEquals("ByteComparable is a prefix of other", ByteSource.END_OF_STREAM, c2); + } + + private int compare(ByteSource s1, ByteSource s2) + { + int c1, c2; + do + { + c1 = s1.next(); + c2 = s2.next(); + } + while (c1 == c2 && c1 != ByteSource.END_OF_STREAM); + + return Integer.compare(c1, c2); + } + + private void maybeAssertNotPrefix(ByteComparable s1, ByteComparable s2, Version version) + { + if (version == Version.OSS41) + assertNotPrefix(s1.asComparableBytes(version), s2.asComparableBytes(version)); + } + + private void maybeCheck41Properties(int expectedComparison, ByteComparable s1, ByteComparable s2, Version version) + { + if (version != Version.OSS41) + return; + + if (s1 == null || s2 == null || 0 == expectedComparison) + return; + int b1 = ThreadLocalRandom.current().nextInt(ByteSource.MIN_SEPARATOR, ByteSource.MAX_SEPARATOR + 1); + int b2 = ThreadLocalRandom.current().nextInt(ByteSource.MIN_SEPARATOR, ByteSource.MAX_SEPARATOR + 1); + assertEquals(String.format("Comparison failed for %s(%s + %02x) and %s(%s + %02x)", s1, s1.byteComparableAsString(version), b1, s2, s2.byteComparableAsString(version), b2), + expectedComparison, Integer.signum(compare(ByteSource.withTerminator(b1, s1.asComparableBytes(version)), ByteSource.withTerminator(b2, s2.asComparableBytes(version))))); + assertNotPrefix(ByteSource.withTerminator(b1, s1.asComparableBytes(version)), ByteSource.withTerminator(b2, s2.asComparableBytes(version))); + } + + > void testMap(MapType tt, K[] keys, V[] values, Supplier gen, Random rand) + { + List tests = new ArrayList<>(); + tests.add(gen.get()); + for (int c = 1; c <= 3; ++c) + for (int j = 0; j < 5; ++j) + { + M l = gen.get(); + for (int i = 0; i < c; ++i) + l.put(keys[rand.nextInt(keys.length)], values[rand.nextInt(values.length)]); + + 
tests.add(l); + } + testType(tt, tests.toArray()); + } + + /* + * Convert type to a comparable. + */ + private ByteComparable typeToComparable(AbstractType type, ByteBuffer value) + { + return new ByteComparable() + { + @Override + public ByteSource asComparableBytes(Version v) + { + return type.asComparableBytes(value, v); + } + + @Override + public String toString() + { + return type.getString(value); + } + }; + } + + public void testType(AbstractType type, Object[] values) + { + for (Object i : values) { + ByteBuffer b = decomposeAndRandomPad(type, i); + logger.info("Value {} ({}) bytes {} ByteSource {}", + safeStr(i), + safeStr(type.getSerializer().toCQLLiteral(b)), + safeStr(ByteBufferUtil.bytesToHex(b)), + typeToComparable(type, b).byteComparableAsString(Version.OSS41)); + } + for (Object i : values) + for (Object j : values) + assertComparesSame(type, i, j); + if (!type.isReversed()) + testType(ReversedType.getInstance(type), values); + } + + public void testBuffers(AbstractType type, List values) + { + try + { + for (Object i : values) { + ByteBuffer b = decomposeAndRandomPad(type, i); + logger.info("Value {} bytes {} ByteSource {}", + safeStr(type.getSerializer().toCQLLiteral(b)), + safeStr(ByteBufferUtil.bytesToHex(b)), + typeToComparable(type, b).byteComparableAsString(Version.OSS41)); + } + } + catch (UnsupportedOperationException e) + { + // Continue without listing values. 
+ } + + for (ByteBuffer i : values) + for (ByteBuffer j : values) + assertComparesSameBuffers(type, i, j); + } + + void assertComparesSameBuffers(AbstractType type, ByteBuffer b1, ByteBuffer b2) + { + int expected = Integer.signum(type.compare(b1, b2)); + final ByteComparable bs1 = typeToComparable(type, b1); + final ByteComparable bs2 = typeToComparable(type, b2); + + for (Version version : Version.values()) + { + int actual = Integer.signum(ByteComparable.compare(bs1, bs2, version)); + assertEquals(String.format("Failed comparing %s(%s) and %s(%s)", ByteBufferUtil.bytesToHex(b1), bs1.byteComparableAsString(version), ByteBufferUtil.bytesToHex(b2), bs2.byteComparableAsString(version)), + expected, + actual); + maybeCheck41Properties(expected, bs1, bs2, version); + } + } + + public void testDecoratedKeys(IPartitioner type, List values) + { + for (ByteBuffer i : values) + for (ByteBuffer j : values) + assertComparesSameDecoratedKeys(type, i, j); + } + + void assertComparesSameDecoratedKeys(IPartitioner type, ByteBuffer b1, ByteBuffer b2) + { + DecoratedKey k1 = type.decorateKey(b1); + DecoratedKey k2 = type.decorateKey(b2); + int expected = Integer.signum(k1.compareTo(k2)); + + for (Version version : Version.values()) + { + int actual = Integer.signum(ByteComparable.compare(k1, k2, version)); + assertEquals(String.format("Failed comparing %s[%s](%s) and %s[%s](%s)\npartitioner %s version %s", + ByteBufferUtil.bytesToHex(b1), + k1, + k1.byteComparableAsString(version), + ByteBufferUtil.bytesToHex(b2), + k2, + k2.byteComparableAsString(version), + type, + version), + expected, + actual); + maybeAssertNotPrefix(k1, k2, version); + } + } + + private Object safeStr(Object i) + { + if (i == null) + return null; + String s = i.toString(); + if (s.length() > 100) + s = s.substring(0, 100) + "..."; + return s.replaceAll("\0", "<0>"); + } + + public void testDirect(Function convertor, BiFunction comparator, T[] values) + { + for (T i : values) { + if (i == null) + continue; + 
+ logger.info("Value {} ByteSource {}\n", + safeStr(i), + convertor.apply(i)); + } + for (T i : values) + if (i != null) + for (T j : values) + if (j != null) + assertComparesSame(convertor, comparator, i, j); + } + + void assertComparesSame(Function convertor, BiFunction comparator, T v1, T v2) + { + ByteComparable b1 = v -> convertor.apply(v1); + ByteComparable b2 = v -> convertor.apply(v2); + int expected = Integer.signum(comparator.apply(v1, v2)); + int actual = Integer.signum(ByteComparable.compare(b1, b2, null)); // version ignored above + assertEquals(String.format("Failed comparing %s and %s", v1, v2), expected, actual); + } + + void assertComparesSame(AbstractType type, Object v1, Object v2) + { + ByteBuffer b1 = decomposeAndRandomPad(type, v1); + ByteBuffer b2 = decomposeAndRandomPad(type, v2); + int expected = Integer.signum(type.compare(b1, b2)); + final ByteComparable bc1 = typeToComparable(type, b1); + final ByteComparable bc2 = typeToComparable(type, b2); + + for (Version version : Version.values()) + { + int actual = Integer.signum(ByteComparable.compare(bc1, bc2, version)); + if (expected != actual) + { + if (type.isReversed()) + { + // This can happen for reverse of nulls and prefixes. 
Check that it's ok within multi-component + ClusteringComparator cc = new ClusteringComparator(type); + ByteComparable c1 = cc.asByteComparable(Clustering.make(b1)); + ByteComparable c2 = cc.asByteComparable(Clustering.make(b2)); + int actualcc = Integer.signum(ByteComparable.compare(c1, c2, version)); + if (actualcc == expected) + return; + assertEquals(String.format("Failed comparing reversed %s(%s, %s) and %s(%s, %s) direct (%d) and as clustering", safeStr(v1), ByteBufferUtil.bytesToHex(b1), c1, safeStr(v2), ByteBufferUtil.bytesToHex(b2), c2, actual), expected, actualcc); + } + else + assertEquals(String.format("Failed comparing %s(%s) and %s(%s)", safeStr(v1), ByteBufferUtil.bytesToHex(b1), safeStr(v2), ByteBufferUtil.bytesToHex(b2)), expected, actual); + } + maybeCheck41Properties(expected, bc1, bc2, version); + } + } + + ByteBuffer decomposeAndRandomPad(AbstractType type, Object v) + { + ByteBuffer b = type.decompose(v); + Random rand = new Random(0); + int padBefore = rand.nextInt(16); + int padAfter = rand.nextInt(16); + int paddedCapacity = b.remaining() + padBefore + padAfter; + ByteBuffer padded = allocateBuffer(paddedCapacity); + rand.ints(padBefore).forEach(x -> padded.put((byte) x)); + padded.put(b); + rand.ints(padAfter).forEach(x -> padded.put((byte) x)); + padded.clear().limit(padded.capacity() - padAfter).position(padBefore); + return padded; + } + + protected ByteBuffer allocateBuffer(int paddedCapacity) + { + return ByteBuffer.allocate(paddedCapacity); + } +} diff --git a/update-history/STAR-801/73-819733f430 CORE-93: Add some tests b/update-history/STAR-801/73-819733f430 CORE-93: Add some tests new file mode 100644 index 000000000000..8fbd81023924 --- /dev/null +++ b/update-history/STAR-801/73-819733f430 CORE-93: Add some tests @@ -0,0 +1,19 @@ +--- a/test/unit/org/apache/cassandra/Util.java ++++ b/test/unit/org/apache/cassandra/Util.java +@@ -19,16 +19,9 @@ + * + */ + +-<<<<<<< + import java.io.*; + import java.lang.reflect.Field; +-======= 
+-import java.io.Closeable; +-import java.io.EOFException; +-import java.io.File; +-import java.io.IOError; + import java.math.BigInteger; +->>>>>>> + import java.net.UnknownHostException; + import java.nio.ByteBuffer; + import java.nio.file.*; From 7bb0ad043e80acfa4494a08c7404e3c44d2c1bf4 Mon Sep 17 00:00:00 2001 From: jacek-lewandowski Date: Tue, 10 Nov 2020 13:54:32 +0100 Subject: [PATCH 031/151] STAR-15: Allow for other implementations of SSTableFormat [288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface [ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat This includes introduction of PartitionIndexIterator. Pull down serializer creation to the locations where it is needed. In other locations, we use PartitionIndexIterator abstract [0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big [bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class [044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format [9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to a.o.c.io.sstable.format.big [ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring 1. Generic RowIndexEntry was extracted 2. AbstractSSTableIterator is now typed by row index entry 3. In SSTableReader some methods were squashed and some were pulled down 4. 
Test were adjusted to just work (for now) [a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big [fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big [91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier [0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during instantiation of some SSTables passed to the constructor, the index files were left unclosed In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the enclosing block - it does not seem to change the semantics In SSTableExport opening the SSTable was moved to the upper level as sstable instance was needed to create PartitionIndexIterator [82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator :...skipping... [288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface [ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat This includes introduction of PartitionIndexIterator. Pull down serializer creation to the locations where it is needed. In other locations, we use PartitionIndexIterator abstract [0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big [bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class [044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format [9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to a.o.c.io.sstable.format.big [ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring 1. Generic RowIndexEntry was extracted 2. AbstractSSTableIterator is now typed by row index entry 3. 
In SSTableReader some methods were squashed and some were pulled down 4. Test were adjusted to just work (for now) [a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big [fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big [91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier [0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during instantiation of some SSTables passed to the constructor, the index files were left unclosed In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the enclosing block - it does not seem to change the semantics In SSTableExport opening the SSTable was moved to the upper level as sstable instance was needed to create PartitionIndexIterator [82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator Also make it possible to obtain PartitionIndexIterator directly from :...skipping... [288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface [ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat This includes introduction of PartitionIndexIterator. Pull down serializer creation to the locations where it is needed. 
In other locations, we use PartitionIndexIterator abstract [0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big [bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class [044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format [9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to a.o.c.io.sstable.format.big [ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring 1. Generic RowIndexEntry was extracted 2. AbstractSSTableIterator is now typed by row index entry 3. In SSTableReader some methods were squashed and some were pulled down 4. Test were adjusted to just work (for now) [a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big [fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big [91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier [0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during instantiation of some SSTables passed to the constructor, the index files were left unclosed In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the enclosing block - it does not seem to change the semantics In SSTableExport opening the SSTable was moved to the upper level as sstable instance was needed to create PartitionIndexIterator [82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator Also make it possible to obtain PartitionIndexIterator directly from the SSTableReader.Factory :...skipping... 
[288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface [ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat This includes introduction of PartitionIndexIterator. Pull down serializer creation to the locations where it is needed. In other locations, we use PartitionIndexIterator abstract [0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big [bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class [044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format [9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to a.o.c.io.sstable.format.big [ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring 1. Generic RowIndexEntry was extracted 2. AbstractSSTableIterator is now typed by row index entry 3. In SSTableReader some methods were squashed and some were pulled down 4. 
Test were adjusted to just work (for now) [a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big [fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big [91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier [0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during instantiation of some SSTables passed to the constructor, the index files were left unclosed In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the enclosing block - it does not seem to change the semantics In SSTableExport opening the SSTable was moved to the upper level as sstable instance was needed to create PartitionIndexIterator [82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator Also make it possible to obtain PartitionIndexIterator directly from the SSTableReader.Factory :...skipping... [288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface [ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat This includes introduction of PartitionIndexIterator. Pull down serializer creation to the locations where it is needed. In other locations, we use PartitionIndexIterator abstract [0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big [bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class [044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format [9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to a.o.c.io.sstable.format.big [ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring 1. 
Generic RowIndexEntry was extracted 2. AbstractSSTableIterator is now typed by row index entry 3. In SSTableReader some methods were squashed and some were pulled down 4. Test were adjusted to just work (for now) [a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big [fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big [91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier [0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during instantiation of some SSTables passed to the constructor, the index files were left unclosed In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the enclosing block - it does not seem to change the semantics In SSTableExport opening the SSTable was moved to the upper level as sstable instance was needed to create PartitionIndexIterator [82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator Also make it possible to obtain PartitionIndexIterator directly from the SSTableReader.Factory Added methods to reset the iterator position and set the position to exact value :...skipping... [288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface [ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat This includes introduction of PartitionIndexIterator. Pull down serializer creation to the locations where it is needed. 
In other locations, we use PartitionIndexIterator abstraction [0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big [bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class [044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format [9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to o.a.c.io.sstable.format.big [ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring 1. Generic RowIndexEntry was extracted 2. AbstractSSTableIterator is now typed by row index entry 3. In SSTableReader some methods were squashed and some were pulled down 4. Tests were adjusted to just work (for now) [a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big [fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big [91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableRowIndexEntry usage in Verifier [0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator. Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during instantiation of some SSTables passed to the constructor, the index files were left unclosed. In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the enclosing block - it does not seem to change the semantics. In SSTableExport opening the SSTable was moved to the upper level as sstable instance was needed to create PartitionIndexIterator [82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator. Also make it possible to obtain PartitionIndexIterator directly from the SSTableReader.Factory. Added methods to reset the iterator position and set the position to an exact value [402ce9f31313d3c963a845d04bd5e7ee93598119] STAR-15: Refactor explicit usages of 
BigTableRowIndexEntry.Serializer. Use generic PartitionIndexIterator or KeyIterator instead :...skipping... [288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface [ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat. This includes introduction of PartitionIndexIterator. Pull down serializer creation to the locations where it is needed. In other locations, we use PartitionIndexIterator abstraction [0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big [bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class [044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format [9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to o.a.c.io.sstable.format.big [ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring 1. Generic RowIndexEntry was extracted 2. AbstractSSTableIterator is now typed by row index entry 3. In SSTableReader some methods were squashed and some were pulled down 4. 
Test were adjusted to just work (for now) [a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big [fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big [91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier [0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during instantiation of some SSTables passed to the constructor, the index files were left unclosed In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the enclosing block - it does not seem to change the semantics In SSTableExport opening the SSTable was moved to the upper level as sstable instance was needed to create PartitionIndexIterator [82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator Also make it possible to obtain PartitionIndexIterator directly from the SSTableReader.Factory Added methods to reset the iterator position and set the position to exact value [402ce9f31313d3c963a845d04bd5e7ee93598119] STAR-15: Refactor explicit usages of BigTableRowIndexEntry.Serializer Use generic PartitionIndexIterator or KeyIterator instead [ad4535715f117f349c99d98403b3f7c23454d9fd] Apply review comments (cherry picked from commit 4520f47729752b78b5a83e71e29cb6ff45ceb599) (cherry picked from commit 3aba2f97e3a64b125a840a1b244e1bd0fd81cd64) --- .../cassandra/db/ClusteringComparator.java | 2 +- .../cassandra/db/ColumnFamilyStore.java | 2 +- .../apache/cassandra/db/SSTableImporter.java | 15 +- .../db/SinglePartitionReadCommand.java | 1 + .../db/compaction/CompactionController.java | 8 +- .../cassandra/db/compaction/Scrubber.java | 122 +++----- .../cassandra/db/compaction/Verifier.java | 46 ++- .../writers/MajorLeveledCompactionWriter.java | 4 +- .../writers/MaxSSTableSizeWriter.java | 4 +- 
.../SplittingSizeTieredCompactionWriter.java | 4 +- .../rows/UnfilteredRowIteratorSerializer.java | 1 + .../UnfilteredRowIteratorWithLowerBound.java | 7 +- .../index/sasi/SASIIndexBuilder.java | 37 ++- .../cassandra/io/sstable/KeyIterator.java | 156 +++++----- .../io/sstable/ReducingKeyIterator.java | 14 +- .../apache/cassandra/io/sstable/SSTable.java | 29 +- .../io/sstable/SSTableIdentityIterator.java | 1 + .../cassandra/io/sstable/SSTableRewriter.java | 15 +- .../io/sstable/SimpleSSTableMultiWriter.java | 4 +- .../format}/AbstractSSTableIterator.java | 270 ++---------------- .../format/PartitionIndexIterator.java | 77 +++++ .../io/sstable/format/RowIndexEntry.java | 48 ++++ .../io/sstable/format/SSTableFormat.java | 10 +- .../io/sstable/format/SSTableReader.java | 126 +++----- .../sstable/format/SSTableReaderBuilder.java | 65 ++--- .../sstable/format/SSTableReadsListener.java | 2 +- .../io/sstable/format/SSTableWriter.java | 6 +- .../format/big/AbstractBigTableIterator.java | 87 ++++++ .../io/sstable/format/big/BigFormat.java | 35 ++- .../big/BigTablePartitionIndexIterator.java | 173 +++++++++++ .../io/sstable/format/big/BigTableReader.java | 60 +++- .../format/big/BigTableRowIndexEntry.java} | 84 +++--- .../sstable/format/big/BigTableScanner.java | 26 +- .../io/sstable/format/big/BigTableWriter.java | 46 ++- .../sstable/format/big}/ColumnIndex.java | 13 +- .../sstable/{ => format/big}/IndexInfo.java | 6 +- .../io/sstable/format/big/IndexState.java | 228 +++++++++++++++ .../sstable/format/big}/SSTableIterator.java | 12 +- .../format/big}/SSTableReversedIterator.java | 12 +- .../cassandra/service/CacheService.java | 25 +- .../apache/cassandra/tools/SSTableExport.java | 6 +- .../apache/cassandra/utils/StatusLogger.java | 4 +- .../distributed/test/FailingRepairTest.java | 7 + ...yInspectorCorruptSSTableExceptionTest.java | 8 +- .../format/ForwardingSSTableReader.java | 33 +-- .../cassandra/cache/AutoSavingCacheTest.java | 3 +- 
.../cql3/QueryWithIndexedSSTableTest.java | 4 +- .../TombstonesWithIndexedSSTableTest.java | 5 +- .../org/apache/cassandra/db/KeyCacheTest.java | 17 +- .../org/apache/cassandra/db/KeyspaceTest.java | 3 +- .../streaming/CassandraOutgoingFileTest.java | 11 +- .../big/BigTableRowIndexEntryTest.java} | 52 ++-- .../SSTableReverseIteratorTest.java | 6 +- ...for other implementations of SSTableFormat | 212 ++++++++++++++ 54 files changed, 1408 insertions(+), 846 deletions(-) rename src/java/org/apache/cassandra/{db/columniterator => io/sstable/format}/AbstractSSTableIterator.java (54%) create mode 100644 src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java create mode 100644 src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java create mode 100644 src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java create mode 100644 src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java rename src/java/org/apache/cassandra/{db/RowIndexEntry.java => io/sstable/format/big/BigTableRowIndexEntry.java} (91%) rename src/java/org/apache/cassandra/{db => io/sstable/format/big}/ColumnIndex.java (95%) rename src/java/org/apache/cassandra/io/sstable/{ => format/big}/IndexInfo.java (97%) create mode 100644 src/java/org/apache/cassandra/io/sstable/format/big/IndexState.java rename src/java/org/apache/cassandra/{db/columniterator => io/sstable/format/big}/SSTableIterator.java (96%) rename src/java/org/apache/cassandra/{db/columniterator => io/sstable/format/big}/SSTableReversedIterator.java (97%) rename test/unit/org/apache/cassandra/{db/RowIndexEntryTest.java => io/sstable/format/big/BigTableRowIndexEntryTest.java} (93%) rename test/unit/org/apache/cassandra/{db => io/sstable/format}/columniterator/SSTableReverseIteratorTest.java (93%) create mode 100644 update-history/STAR-801/72-3aba2f97e3 STAR-15: Allow for other implementations of SSTableFormat diff --git 
a/src/java/org/apache/cassandra/db/ClusteringComparator.java b/src/java/org/apache/cassandra/db/ClusteringComparator.java index a23aa36017ae..e5b63aa6a27a 100644 --- a/src/java/org/apache/cassandra/db/ClusteringComparator.java +++ b/src/java/org/apache/cassandra/db/ClusteringComparator.java @@ -30,7 +30,7 @@ import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.serializers.MarshalException; -import org.apache.cassandra.io.sstable.IndexInfo; +import org.apache.cassandra.io.sstable.format.big.IndexInfo; import org.apache.cassandra.utils.ByteComparable; import org.apache.cassandra.utils.ByteSource; diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java index 6b49855ec2fe..f80a04366238 100644 --- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java +++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java @@ -1770,7 +1770,7 @@ public List getSSTablesForKey(String key, boolean hexFormat) for (SSTableReader sstr : select(View.select(SSTableSet.LIVE, dk)).sstables) { // check if the key actually exists in this sstable, without updating cache and stats - if (sstr.getPosition(dk, SSTableReader.Operator.EQ, false) != null) + if (sstr.checkEntryExists(dk, SSTableReader.Operator.EQ, false)) files.add(sstr.getFilename()); } return files; diff --git a/src/java/org/apache/cassandra/db/SSTableImporter.java b/src/java/org/apache/cassandra/db/SSTableImporter.java index 989ff12297a7..5bcbd4c528cd 100644 --- a/src/java/org/apache/cassandra/db/SSTableImporter.java +++ b/src/java/org/apache/cassandra/db/SSTableImporter.java @@ -181,8 +181,15 @@ synchronized List importNewSSTables(Options options) cfs.getTracker().addSSTables(newSSTables); for (SSTableReader reader : newSSTables) { - if (options.invalidateCaches && cfs.isRowCacheEnabled()) - invalidateCachesForSSTable(reader.descriptor); + try + { + if (options.invalidateCaches && cfs.isRowCacheEnabled()) + 
invalidateCachesForSSTable(reader); + } + catch (IOException ex) + { + throw new RuntimeException(ex); + } } } @@ -311,9 +318,9 @@ private void removeCopiedSSTables(Set movedSSTables) * Iterates over all keys in the sstable index and invalidates the row cache */ @VisibleForTesting - void invalidateCachesForSSTable(Descriptor desc) + void invalidateCachesForSSTable(SSTableReader reader) throws IOException { - try (KeyIterator iter = new KeyIterator(desc, cfs.metadata())) + try (KeyIterator iter = KeyIterator.forSSTable(reader)) { while (iter.hasNext()) { diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java index 7dba4d88380e..df52130e8a25 100644 --- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java +++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java @@ -36,6 +36,7 @@ import org.apache.cassandra.db.transform.RTBoundValidator; import org.apache.cassandra.db.transform.Transformation; import org.apache.cassandra.exceptions.RequestExecutionException; +import org.apache.cassandra.io.sstable.format.RowIndexEntry; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.sstable.format.SSTableReadsListener; import org.apache.cassandra.io.util.DataInputPlus; diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionController.java b/src/java/org/apache/cassandra/db/compaction/CompactionController.java index e1b0f3258359..6078dabac2e6 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionController.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionController.java @@ -257,7 +257,7 @@ public LongPredicate getPurgeEvaluator(DecoratedKey key) { // if we don't have bloom filter(bf_fp_chance=1.0 or filter file is missing), // we check index file instead. 
- if (sstable.getBloomFilter() instanceof AlwaysPresentFilter && sstable.getPosition(key, SSTableReader.Operator.EQ, false) != null + if (sstable.getBloomFilter() instanceof AlwaysPresentFilter && sstable.checkEntryExists(key, SSTableReader.Operator.EQ, false) || sstable.getBloomFilter().isPresent(key)) { minTimestampSeen = Math.min(minTimestampSeen, sstable.getMinTimestamp()); @@ -321,11 +321,7 @@ private UnfilteredRowIterator getShadowIterator(SSTableReader reader, DecoratedK reader.getMaxTimestamp() <= minTimestamp || tombstoneOnly && !reader.mayHaveTombstones()) return null; - RowIndexEntry position = reader.getPosition(key, SSTableReader.Operator.EQ); - if (position == null) - return null; - FileDataInput dfile = openDataFiles.computeIfAbsent(reader, this::openDataFile); - return reader.simpleIterator(dfile, key, position, tombstoneOnly); + return reader.simpleIterator(() -> openDataFiles.computeIfAbsent(reader, this::openDataFile), key, tombstoneOnly); } /** diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java index 5884f989e008..b0d601937a47 100644 --- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java +++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java @@ -25,9 +25,9 @@ import java.util.concurrent.locks.ReentrantReadWriteLock; import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Throwables; import com.google.common.collect.ImmutableSet; +import org.apache.cassandra.io.sstable.format.PartitionIndexIterator; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.db.*; import org.apache.cassandra.db.lifecycle.LifecycleTransaction; @@ -60,19 +60,13 @@ public class Scrubber implements Closeable private final ReadWriteLock fileAccessLock; private final RandomAccessReader dataFile; - private final RandomAccessReader indexFile; + private final PartitionIndexIterator indexIterator; private final ScrubInfo 
scrubInfo; - private final RowIndexEntry.IndexSerializer rowIndexEntrySerializer; private int goodRows; private int badRows; private int emptyRows; - private ByteBuffer currentIndexKey; - private ByteBuffer nextIndexKey; - long currentRowPositionFromIndex; - long nextRowPositionFromIndex; - private NegativeLocalDeletionInfoMetrics negativeLocalDeletionInfoMetrics = new NegativeLocalDeletionInfoMetrics(); private final OutputHandler outputHandler; @@ -111,9 +105,6 @@ public Scrubber(ColumnFamilyStore cfs, this.outputHandler = outputHandler; this.skipCorrupted = skipCorrupted; this.reinsertOverflowedTTLRows = reinsertOverflowedTTLRows; - this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(cfs.metadata(), - sstable.descriptor.version, - sstable.header); List toScrub = Collections.singletonList(sstable); @@ -141,19 +132,29 @@ public Scrubber(ColumnFamilyStore cfs, ? sstable.openDataReader() : sstable.openDataReader(CompactionManager.instance.getRateLimiter()); - this.indexFile = hasIndexFile - ? RandomAccessReader.open(new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX))) - : null; + this.indexIterator = hasIndexFile + ? openIndexIterator() + : null; this.scrubInfo = new ScrubInfo(dataFile, sstable, fileAccessLock.readLock()); - this.currentRowPositionFromIndex = 0; - this.nextRowPositionFromIndex = 0; - if (reinsertOverflowedTTLRows) outputHandler.output("Starting scrub with reinsert overflowed TTL option"); } + private PartitionIndexIterator openIndexIterator() + { + try + { + return sstable.allKeysIterator(); + } + catch (IOException e) + { + outputHandler.warn("Index is unreadable."); + } + return null; + } + private UnfilteredRowIterator withValidation(UnfilteredRowIterator iter, String filename) { return checkData ? 
UnfilteredRowIterators.withValidation(iter, filename) : iter; @@ -167,24 +168,7 @@ public void scrub() try (SSTableRewriter writer = SSTableRewriter.construct(cfs, transaction, false, sstable.maxDataAge); Refs refs = Refs.ref(Collections.singleton(sstable))) { - try - { - nextIndexKey = indexAvailable() ? ByteBufferUtil.readWithShortLength(indexFile) : null; - if (indexAvailable()) - { - // throw away variable so we don't have a side effect in the assert - long firstRowPositionFromIndex = rowIndexEntrySerializer.deserializePositionAndSkip(indexFile); - assert firstRowPositionFromIndex == 0 : firstRowPositionFromIndex; - } - } - catch (Throwable ex) - { - throwIfFatal(ex); - nextIndexKey = null; - nextRowPositionFromIndex = dataFile.length(); - if (indexFile != null) - indexFile.seek(indexFile.length()); - } + assert !indexAvailable() || indexIterator.dataPosition() == 0 : indexIterator.dataPosition(); StatsMetadata metadata = sstable.getSSTableMetadata(); writer.switchWriter(CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, metadata.repairedAt, metadata.pendingRepair, metadata.isTransient, sstable, transaction)); @@ -210,22 +194,18 @@ public void scrub() // check for null key below } + long dataStart = dataFile.getFilePointer(); + long dataStartFromIndex = -1; long dataSizeFromIndex = -1; - - updateIndexKey(); - - if (indexAvailable()) + ByteBuffer currentIndexKey = indexIterator != null ? 
indexIterator.key() : null; + if (currentIndexKey != null) { - if (currentIndexKey != null) - { - dataStartFromIndex = currentRowPositionFromIndex + 2 + currentIndexKey.remaining(); - dataSizeFromIndex = nextRowPositionFromIndex - dataStartFromIndex; - } + dataStartFromIndex = indexIterator.dataPosition() + TypeSizes.SHORT_SIZE + currentIndexKey.remaining(); + if (advanceIndexNoThrow()) + dataSizeFromIndex = indexIterator.dataPosition() - dataStartFromIndex; } - long dataStart = dataFile.getFilePointer(); - // avoid an NPE if key is null String keyName = key == null ? "(unreadable key)" : ByteBufferUtil.bytesToHex(key.getKey()); outputHandler.debug(String.format("row %s is %s", keyName, FBUtilities.prettyPrintMemory(dataSizeFromIndex))); @@ -244,10 +224,10 @@ public void scrub() "_too big_", ByteBufferUtil.bytesToHex(currentIndexKey)))); } - if (indexFile != null && dataSizeFromIndex > dataFile.length()) + if (indexIterator != null && dataSizeFromIndex > dataFile.length()) throw new IOError(new IOException("Impossible row size (greater than file length): " + dataSizeFromIndex)); - if (indexFile != null && dataStart != dataStartFromIndex) + if (indexIterator != null && dataStart != dataStartFromIndex) outputHandler.warn(String.format("Data file row position %d differs from index file row position %d", dataStart, dataStartFromIndex)); if (tryAppend(prevKey, key, writer)) @@ -382,51 +362,41 @@ private UnfilteredRowIterator getIterator(DecoratedKey key) negativeLocalDeletionInfoMetrics) : rowMergingIterator; } - private void updateIndexKey() + private boolean advanceIndexNoThrow() { - currentIndexKey = nextIndexKey; - currentRowPositionFromIndex = nextRowPositionFromIndex; try { - nextIndexKey = !indexAvailable() ? null : ByteBufferUtil.readWithShortLength(indexFile); - - nextRowPositionFromIndex = !indexAvailable() - ? 
dataFile.length() - : rowIndexEntrySerializer.deserializePositionAndSkip(indexFile); + return indexAvailable() && indexIterator.advance(); } catch (Throwable th) { JVMStabilityInspector.inspectThrowable(th); outputHandler.warn("Error reading index file", th); - nextIndexKey = null; - nextRowPositionFromIndex = dataFile.length(); - if (indexFile != null) - indexFile.seek(indexFile.length()); + indexIterator.close(); + return false; } } private boolean indexAvailable() { - return indexFile != null && !indexFile.isEOF(); + return indexIterator != null && !indexIterator.isExhausted(); } private void seekToNextRow() { - while(nextRowPositionFromIndex < dataFile.length()) - { - try - { - dataFile.seek(nextRowPositionFromIndex); - return; - } - catch (Throwable th) - { - throwIfFatal(th); - outputHandler.warn(String.format("Failed to seek to next row position %d", nextRowPositionFromIndex), th); - badRows++; - } + long nextRowPositionFromIndex = indexIterator.isExhausted() + ? dataFile.length() + : indexIterator.dataPosition(); - updateIndexKey(); + try + { + dataFile.seek(nextRowPositionFromIndex); + } + catch (Throwable th) + { + throwIfFatal(th); + outputHandler.warn(String.format("Failed to seek to next row position %d", nextRowPositionFromIndex), th); + badRows++; } } @@ -469,7 +439,7 @@ public void close() try { FileUtils.closeQuietly(dataFile); - FileUtils.closeQuietly(indexFile); + FileUtils.closeQuietly(indexIterator); } finally { diff --git a/src/java/org/apache/cassandra/db/compaction/Verifier.java b/src/java/org/apache/cassandra/db/compaction/Verifier.java index 68d5163e4d85..fb4a17f810f3 100644 --- a/src/java/org/apache/cassandra/db/compaction/Verifier.java +++ b/src/java/org/apache/cassandra/db/compaction/Verifier.java @@ -32,6 +32,7 @@ import org.apache.cassandra.io.sstable.IndexSummary; import org.apache.cassandra.io.sstable.KeyIterator; import org.apache.cassandra.io.sstable.SSTableIdentityIterator; +import 
org.apache.cassandra.io.sstable.format.PartitionIndexIterator; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.sstable.metadata.MetadataComponent; import org.apache.cassandra.io.sstable.metadata.MetadataType; @@ -76,9 +77,7 @@ public class Verifier implements Closeable private final ReadWriteLock fileAccessLock; private final RandomAccessReader dataFile; - private final RandomAccessReader indexFile; private final VerifyInfo verifyInfo; - private final RowIndexEntry.IndexSerializer rowIndexEntrySerializer; private final Options options; private final boolean isOffline; /** @@ -103,7 +102,6 @@ public Verifier(ColumnFamilyStore cfs, SSTableReader sstable, OutputHandler outp this.cfs = cfs; this.sstable = sstable; this.outputHandler = outputHandler; - this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(cfs.metadata(), sstable.descriptor.version, sstable.header); this.controller = new VerifyController(cfs); @@ -111,7 +109,6 @@ public Verifier(ColumnFamilyStore cfs, SSTableReader sstable, OutputHandler outp this.dataFile = isOffline ? 
sstable.openDataReader() : sstable.openDataReader(CompactionManager.instance.getRateLimiter()); - this.indexFile = RandomAccessReader.open(new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX))); this.verifyInfo = new VerifyInfo(dataFile, sstable, fileAccessLock.readLock()); this.options = options; this.isOffline = isOffline; @@ -185,7 +182,7 @@ public void verify() if (options.checkOwnsTokens && !isOffline && !(cfs.getPartitioner() instanceof LocalPartitioner)) { outputHandler.debug("Checking that all tokens are owned by the current node"); - try (KeyIterator iter = new KeyIterator(sstable.descriptor, sstable.metadata())) + try (KeyIterator iter = KeyIterator.forSSTable(sstable)) { List> ownedRanges = Range.normalize(tokenLookup.apply(cfs.metadata.keyspace)); if (ownedRanges.isEmpty()) @@ -239,14 +236,10 @@ public void verify() outputHandler.output("Extended Verify requested, proceeding to inspect values"); - try + try(PartitionIndexIterator indexIterator = sstable.allKeysIterator()) { - ByteBuffer nextIndexKey = ByteBufferUtil.readWithShortLength(indexFile); - { - long firstRowPositionFromIndex = rowIndexEntrySerializer.deserializePositionAndSkip(indexFile); - if (firstRowPositionFromIndex != 0) - markAndThrow(new RuntimeException("firstRowPositionFromIndex != 0: "+firstRowPositionFromIndex)); - } + if (indexIterator.dataPosition() != 0) + markAndThrow(new RuntimeException("First row position from index != 0: " + indexIterator.dataPosition())); List> ownedRanges = isOffline ? Collections.emptyList() : Range.normalize(tokenLookup.apply(cfs.metadata().keyspace)); RangeOwnHelper rangeOwnHelper = new RangeOwnHelper(ownedRanges); @@ -285,14 +278,18 @@ public void verify() } } - ByteBuffer currentIndexKey = nextIndexKey; + ByteBuffer currentIndexKey = indexIterator.key(); long nextRowPositionFromIndex = 0; try { - nextIndexKey = indexFile.isEOF() ? null : ByteBufferUtil.readWithShortLength(indexFile); - nextRowPositionFromIndex = indexFile.isEOF() - ? 
dataFile.length() - : rowIndexEntrySerializer.deserializePositionAndSkip(indexFile); + if (indexIterator.advance()) + { + nextRowPositionFromIndex = indexIterator.dataPosition(); + } + else + { + nextRowPositionFromIndex = dataFile.length(); + } } catch (Throwable th) { @@ -309,8 +306,6 @@ public void verify() String keyName = key == null ? "(unreadable key)" : ByteBufferUtil.bytesToHex(key.getKey()); outputHandler.debug(String.format("row %s is %s", keyName, FBUtilities.prettyPrintMemory(dataSize))); - assert currentIndexKey != null || indexFile.isEOF(); - try { if (key == null || dataSize > dataFile.length()) @@ -413,15 +408,9 @@ public boolean check(DecoratedKey key) private void deserializeIndex(SSTableReader sstable) throws IOException { - try (RandomAccessReader primaryIndex = RandomAccessReader.open(new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)))) - { - long indexSize = primaryIndex.length(); - - while ((primaryIndex.getFilePointer()) != indexSize) - { - ByteBuffer key = ByteBufferUtil.readWithShortLength(primaryIndex); - RowIndexEntry.Serializer.skip(primaryIndex, sstable.descriptor.version); - } + try (PartitionIndexIterator it = sstable.allKeysIterator()) { + //noinspection StatementWithEmptyBody + while (it.advance()); // no-op, just check if index is readable } } @@ -460,7 +449,6 @@ public void close() try { FileUtils.closeQuietly(dataFile); - FileUtils.closeQuietly(indexFile); } finally { diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java index 1c5360020b49..93043913f39b 100644 --- a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java +++ b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java @@ -21,7 +21,7 @@ import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Directories; -import 
org.apache.cassandra.db.RowIndexEntry; +import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry; import org.apache.cassandra.db.SerializationHeader; import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.db.compaction.LeveledManifest; @@ -70,7 +70,7 @@ public MajorLeveledCompactionWriter(ColumnFamilyStore cfs, @SuppressWarnings("resource") public boolean realAppend(UnfilteredRowIterator partition) { - RowIndexEntry rie = sstableWriter.append(partition); + BigTableRowIndexEntry rie = sstableWriter.append(partition); partitionsWritten++; long totalWrittenInCurrentWriter = sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten(); if (totalWrittenInCurrentWriter > maxSSTableSize) diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java index 915f96bfb431..af21e51ed4f3 100644 --- a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java +++ b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java @@ -21,7 +21,7 @@ import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Directories; -import org.apache.cassandra.db.RowIndexEntry; +import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry; import org.apache.cassandra.db.SerializationHeader; import org.apache.cassandra.db.compaction.OperationType; import org.apache.cassandra.db.rows.UnfilteredRowIterator; @@ -81,7 +81,7 @@ private static long getTotalWriteSize(Iterable nonExpiredSSTables protected boolean realAppend(UnfilteredRowIterator partition) { - RowIndexEntry rie = sstableWriter.append(partition); + BigTableRowIndexEntry rie = sstableWriter.append(partition); if (sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten() > maxSSTableSize) { switchCompactionLocation(sstableDirectory); diff --git 
a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java index d29061ca8630..f2d6fe91674a 100644 --- a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java +++ b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java @@ -25,7 +25,7 @@ import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Directories; -import org.apache.cassandra.db.RowIndexEntry; +import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry; import org.apache.cassandra.db.SerializationHeader; import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.db.lifecycle.LifecycleTransaction; @@ -86,7 +86,7 @@ public SplittingSizeTieredCompactionWriter(ColumnFamilyStore cfs, Directories di @Override public boolean realAppend(UnfilteredRowIterator partition) { - RowIndexEntry rie = sstableWriter.append(partition); + BigTableRowIndexEntry rie = sstableWriter.append(partition); if (sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten() > currentBytesToWrite && currentRatioIndex < ratios.length - 1) // if we underestimate how many keys we have, the last sstable might get more than we expect { currentRatioIndex++; diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java index 938a3eed114e..9b93c89f8454 100644 --- a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java +++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java @@ -26,6 +26,7 @@ import org.apache.cassandra.db.*; import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.io.sstable.format.big.ColumnIndex; import org.apache.cassandra.io.util.DataInputPlus; import 
org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.schema.TableMetadata; diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java index b6f425458dec..d0ba98f7075f 100644 --- a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java +++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java @@ -26,12 +26,13 @@ import java.util.List; import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.db.*; import org.apache.cassandra.db.filter.ClusteringIndexFilter; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.transform.RTBoundValidator; -import org.apache.cassandra.io.sstable.IndexInfo; +import org.apache.cassandra.io.sstable.format.big.IndexInfo; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.sstable.format.SSTableReadsListener; import org.apache.cassandra.io.sstable.metadata.StatsMetadata; @@ -189,11 +190,11 @@ private ClusteringBound getPartitionIndexLowerBound() if (!canUseMetadataLowerBound()) maybeInit(); - RowIndexEntry rowIndexEntry = sstable.getCachedPosition(partitionKey(), false); + BigTableRowIndexEntry rowIndexEntry = sstable.getCachedPosition(partitionKey(), false); if (rowIndexEntry == null || !rowIndexEntry.indexOnHeap()) return null; - try (RowIndexEntry.IndexInfoRetriever onHeapRetriever = rowIndexEntry.openWithIndex(null)) + try (BigTableRowIndexEntry.IndexInfoRetriever onHeapRetriever = rowIndexEntry.openWithIndex(null)) { IndexInfo column = onHeapRetriever.columnsIndex(filter.isReversed() ? rowIndexEntry.columnsIndexCount() - 1 : 0); ClusteringPrefix lowerBoundPrefix = filter.isReversed() ? 
column.lastName : column.firstName; diff --git a/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java b/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java index bb42dc2d178a..c2b0aa19c9d3 100644 --- a/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java +++ b/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java @@ -24,10 +24,10 @@ import java.io.IOException; import java.util.*; +import org.apache.cassandra.io.sstable.format.RowIndexEntry; import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.db.RowIndexEntry; import org.apache.cassandra.db.compaction.CompactionInfo; import org.apache.cassandra.db.compaction.CompactionInterruptedException; import org.apache.cassandra.db.compaction.OperationType; @@ -78,7 +78,7 @@ public void build() PerSSTableIndexWriter indexWriter = SASIIndex.newWriter(keyValidator, sstable.descriptor, indexes, OperationType.COMPACTION); long previousKeyPosition = 0; - try (KeyIterator keys = new KeyIterator(sstable.descriptor, cfs.metadata())) + try (KeyIterator keys = KeyIterator.forSSTable(sstable)) { while (keys.hasNext()) { @@ -90,25 +90,18 @@ public void build() indexWriter.startPartition(key, keyPosition); - try - { - RowIndexEntry indexEntry = sstable.getPosition(key, SSTableReader.Operator.EQ); - dataFile.seek(indexEntry.position); - ByteBufferUtil.readWithShortLength(dataFile); // key - - try (SSTableIdentityIterator partition = SSTableIdentityIterator.create(sstable, dataFile, key)) - { - // if the row has statics attached, it has to be indexed separately - if (cfs.metadata().hasStaticColumns()) - indexWriter.nextUnfilteredCluster(partition.staticRow()); - - while (partition.hasNext()) - indexWriter.nextUnfilteredCluster(partition.next()); - } - } - catch (IOException ex) + RowIndexEntry indexEntry = sstable.getPosition(key, SSTableReader.Operator.EQ); + 
dataFile.seek(indexEntry.position); + ByteBufferUtil.readWithShortLength(dataFile); // key + + try (SSTableIdentityIterator partition = SSTableIdentityIterator.create(sstable, dataFile, key)) { - throw new FSReadError(ex, sstable.getFilename()); + // if the row has statics attached, it has to be indexed separately + if (cfs.metadata().hasStaticColumns()) + indexWriter.nextUnfilteredCluster(partition.staticRow()); + + while (partition.hasNext()) + indexWriter.nextUnfilteredCluster(partition.next()); } bytesProcessed += keyPosition - previousKeyPosition; @@ -117,6 +110,10 @@ public void build() completeSSTable(indexWriter, sstable, indexes.values()); } + catch (IOException ex) + { + throw new FSReadError(ex, sstable.getFilename()); + } } } } diff --git a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java index 1a5792c4fad2..c31af70d4c0d 100644 --- a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java +++ b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java @@ -17,105 +17,70 @@ */ package org.apache.cassandra.io.sstable; -import java.io.File; import java.io.IOException; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.db.RowIndexEntry; import org.apache.cassandra.dht.IPartitioner; -import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.io.util.RandomAccessReader; +import org.apache.cassandra.io.sstable.format.PartitionIndexIterator; +import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.AbstractIterator; -import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.CloseableIterator; public class KeyIterator extends AbstractIterator implements CloseableIterator { - private final static class In - { - private final File 
path; - private volatile RandomAccessReader in; - - public In(File path) - { - this.path = path; - } - - private void maybeInit() - { - if (in != null) - return; - - synchronized (this) - { - if (in == null) - { - in = RandomAccessReader.open(path); - } - } - } - - public DataInputPlus get() - { - maybeInit(); - return in; - } - - public boolean isEOF() - { - maybeInit(); - return in.isEOF(); - } - - public void close() - { - if (in != null) - in.close(); - } + private final IPartitioner partitioner; + private final PartitionIndexIterator it; + private final ReadWriteLock fileAccessLock; + private final long indexLength; - public long getFilePointer() - { - maybeInit(); - return in.getFilePointer(); - } + private long keyPosition = -1; - public long length() - { - maybeInit(); - return in.length(); - } + public KeyIterator(PartitionIndexIterator it, IPartitioner partitioner, ReadWriteLock fileAccessLock) + { + this.it = it; + this.partitioner = partitioner; + this.fileAccessLock = fileAccessLock; + this.indexLength = it.indexLength(); } - private final Descriptor desc; - private final In in; - private final IPartitioner partitioner; - private final ReadWriteLock fileAccessLock; + public KeyIterator(PartitionIndexIterator it, IPartitioner partitioner) + { + this(it, partitioner, null); + } - private long keyPosition; + public static KeyIterator forSSTable(SSTableReader ssTableReader) throws IOException + { + return new KeyIterator(ssTableReader.allKeysIterator(), ssTableReader.getPartitioner(), new ReentrantReadWriteLock()); + } - public KeyIterator(Descriptor desc, TableMetadata metadata) + public static KeyIterator create(SSTableReader.Factory factory, Descriptor descriptor, TableMetadata metadata) { - this.desc = desc; - in = new In(new File(desc.filenameFor(Component.PRIMARY_INDEX))); - partitioner = metadata.partitioner; - fileAccessLock = new ReentrantReadWriteLock(); + return new KeyIterator(factory.indexIterator(descriptor, metadata), metadata.partitioner, 
new ReentrantReadWriteLock()); } protected DecoratedKey computeNext() { - fileAccessLock.readLock().lock(); + if (fileAccessLock != null) + fileAccessLock.readLock().lock(); try { - if (in.isEOF()) - return endOfData(); - - keyPosition = in.getFilePointer(); - DecoratedKey key = partitioner.decorateKey(ByteBufferUtil.readWithShortLength(in.get())); - RowIndexEntry.Serializer.skip(in.get(), desc.version); // skip remainder of the entry - return key; + if (keyPosition < 0) + { + keyPosition = 0; + return it.isExhausted() + ? endOfData() + : partitioner.decorateKey(it.key()); + } + else + { + keyPosition = it.indexPosition(); + return it.advance() + ? partitioner.decorateKey(it.key()) + : endOfData(); + } } catch (IOException e) { @@ -123,45 +88,68 @@ protected DecoratedKey computeNext() } finally { - fileAccessLock.readLock().unlock(); + if (fileAccessLock != null) + fileAccessLock.readLock().unlock(); } } public void close() { - fileAccessLock.writeLock().lock(); + if (fileAccessLock != null) + fileAccessLock.writeLock().lock(); try { - in.close(); + it.close(); } finally { - fileAccessLock.writeLock().unlock(); + if (fileAccessLock != null) + fileAccessLock.writeLock().unlock(); } } public long getBytesRead() { - fileAccessLock.readLock().lock(); + if (fileAccessLock != null) + fileAccessLock.readLock().lock(); try { - return in.getFilePointer(); + return it.indexPosition(); } finally { - fileAccessLock.readLock().unlock(); + if (fileAccessLock != null) + fileAccessLock.readLock().unlock(); } } public long getTotalBytes() { - // length is final in the referenced object. 
- // no need to acquire the lock - return in.length(); + return indexLength; } public long getKeyPosition() { return keyPosition; } + + public void reset() + { + if (fileAccessLock != null) + fileAccessLock.readLock().lock(); + try + { + it.reset(); + keyPosition = -1; + } + catch (IOException ex) + { + throw new RuntimeException(ex); + } + finally + { + if (fileAccessLock != null) + fileAccessLock.readLock().unlock(); + } + } } diff --git a/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java b/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java index 826b91d65257..9a231c932355 100644 --- a/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java +++ b/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java @@ -17,6 +17,7 @@ */ package org.apache.cassandra.io.sstable; +import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; @@ -26,6 +27,7 @@ import org.apache.cassandra.utils.CloseableIterator; import org.apache.cassandra.utils.IMergeIterator; import org.apache.cassandra.utils.MergeIterator; +import org.apache.cassandra.utils.Throwables; /** * Caller must acquire and release references to the sstables used here. 
@@ -38,8 +40,16 @@ public class ReducingKeyIterator implements CloseableIterator public ReducingKeyIterator(Collection sstables) { iters = new ArrayList<>(sstables.size()); - for (SSTableReader sstable : sstables) - iters.add(new KeyIterator(sstable.descriptor, sstable.metadata())); + try + { + for (SSTableReader sstable : sstables) + iters.add(KeyIterator.forSSTable(sstable)); + } + catch (IOException | RuntimeException ex) + { + iters.forEach(KeyIterator::close); + throw Throwables.cleaned(ex); + } } private void maybeInit() diff --git a/src/java/org/apache/cassandra/io/sstable/SSTable.java b/src/java/org/apache/cassandra/io/sstable/SSTable.java index 0471be3238cf..ba4b323becaf 100644 --- a/src/java/org/apache/cassandra/io/sstable/SSTable.java +++ b/src/java/org/apache/cassandra/io/sstable/SSTable.java @@ -33,7 +33,7 @@ import org.apache.cassandra.db.BufferDecoratedKey; import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.db.RowIndexEntry; +import org.apache.cassandra.io.sstable.format.PartitionIndexIterator; import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Token; @@ -79,6 +79,8 @@ public abstract class SSTable protected final DiskOptimizationStrategy optimizationStrategy; protected final TableMetadataRef metadata; + private static final int SAMPLES_CAP = 10000; + private static final int BYTES_CAP = 10000000; protected SSTable(Descriptor descriptor, Set components, TableMetadataRef metadata, DiskOptimizationStrategy optimizationStrategy) { @@ -265,21 +267,24 @@ public static Set discoverComponentsFor(Descriptor desc) } /** @return An estimate of the number of keys contained in the given index file. 
*/ - public static long estimateRowsFromIndex(RandomAccessReader ifile, Descriptor descriptor) throws IOException + public static long estimateRowsFromIndex(PartitionIndexIterator iterator) throws IOException { // collect sizes for the first 10000 keys, or first 10 megabytes of data - final int SAMPLES_CAP = 10000, BYTES_CAP = (int)Math.min(10000000, ifile.length()); - int keys = 0; - while (ifile.getFilePointer() < BYTES_CAP && keys < SAMPLES_CAP) + try + { + int keys = 0; + while (!iterator.isExhausted() && iterator.indexPosition() < BYTES_CAP && keys < SAMPLES_CAP) + { + iterator.advance(); + keys++; + } + assert keys > 0 && iterator.indexPosition() > 0 && iterator.indexLength() > 0 : "Unexpected empty index file"; + return iterator.indexLength() / (iterator.indexPosition() / keys); + } + finally { - ByteBufferUtil.skipShortLength(ifile); - RowIndexEntry.Serializer.skip(ifile, descriptor.version); - keys++; + iterator.reset(); } - assert keys > 0 && ifile.getFilePointer() > 0 && ifile.length() > 0 : "Unexpected empty index file: " + ifile; - long estimatedRows = ifile.length() / (ifile.getFilePointer() / keys); - ifile.seek(0); - return estimatedRows; } public long bytesOnDisk() diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java b/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java index 76e12c891ada..cf19083c9f55 100644 --- a/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java +++ b/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java @@ -19,6 +19,7 @@ import java.io.*; +import org.apache.cassandra.io.sstable.format.RowIndexEntry; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.db.*; import org.apache.cassandra.db.rows.*; diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java index 92548b26aea4..1b6336dfbef2 100644 --- 
a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java +++ b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java @@ -27,7 +27,8 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.db.RowIndexEntry; +import org.apache.cassandra.io.sstable.format.RowIndexEntry; +import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry; import org.apache.cassandra.db.lifecycle.ILifecycleTransaction; import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.io.sstable.format.SSTableReader; @@ -69,7 +70,7 @@ public class SSTableRewriter extends Transactional.AbstractTransactional impleme private final boolean eagerWriterMetaRelease; // true if the writer metadata should be released when switch is called private SSTableWriter writer; - private Map cachedKeys = new HashMap<>(); + private Map cachedKeys = new HashMap<>(); // for testing (TODO: remove when have byteman setup) private boolean throwEarly, throwLate; @@ -117,12 +118,12 @@ public SSTableWriter currentWriter() return writer; } - public RowIndexEntry append(UnfilteredRowIterator partition) + public BigTableRowIndexEntry append(UnfilteredRowIterator partition) { // we do this before appending to ensure we can resetAndTruncate() safely if the append fails DecoratedKey key = partition.partitionKey(); maybeReopenEarly(key); - RowIndexEntry index = writer.append(partition); + BigTableRowIndexEntry index = writer.append(partition); if (DatabaseDescriptor.shouldMigrateKeycacheOnCompaction()) { if (!transaction.isOffline() && index != null) @@ -141,7 +142,7 @@ public RowIndexEntry append(UnfilteredRowIterator partition) } // attempts to append the row, if fails resets the writer position - public RowIndexEntry tryAppend(UnfilteredRowIterator partition) + public BigTableRowIndexEntry tryAppend(UnfilteredRowIterator partition) { writer.mark(); try @@ 
-163,7 +164,7 @@ private void maybeReopenEarly(DecoratedKey key) { for (SSTableReader reader : transaction.originals()) { - RowIndexEntry index = reader.getPosition(key, SSTableReader.Operator.GE); + RowIndexEntry index = reader.getPosition(key, SSTableReader.Operator.GE); NativeLibrary.trySkipCache(reader.getFilename(), 0, index == null ? 0 : index.position); } } @@ -223,7 +224,7 @@ private void moveStarts(SSTableReader newReader, DecoratedKey lowerbound) if (!cachedKeys.isEmpty()) { invalidateKeys = new ArrayList<>(cachedKeys.size()); - for (Map.Entry cacheKey : cachedKeys.entrySet()) + for (Map.Entry cacheKey : cachedKeys.entrySet()) { invalidateKeys.add(cacheKey.getKey()); newReader.cacheKey(cacheKey.getKey(), cacheKey.getValue()); diff --git a/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java index a84f07e94971..d38d03292b9a 100644 --- a/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java +++ b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java @@ -21,7 +21,7 @@ import java.util.Collections; import java.util.UUID; -import org.apache.cassandra.db.RowIndexEntry; +import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry; import org.apache.cassandra.db.SerializationHeader; import org.apache.cassandra.db.lifecycle.LifecycleNewTracker; import org.apache.cassandra.db.rows.UnfilteredRowIterator; @@ -45,7 +45,7 @@ protected SimpleSSTableMultiWriter(SSTableWriter writer, LifecycleNewTracker lif public boolean append(UnfilteredRowIterator partition) { - RowIndexEntry indexEntry = writer.append(partition); + BigTableRowIndexEntry indexEntry = writer.append(partition); return indexEntry != null; } diff --git a/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java b/src/java/org/apache/cassandra/io/sstable/format/AbstractSSTableIterator.java similarity index 54% rename from 
src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java rename to src/java/org/apache/cassandra/io/sstable/format/AbstractSSTableIterator.java index fee45c232fe2..2a1e67572426 100644 --- a/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java +++ b/src/java/org/apache/cassandra/io/sstable/format/AbstractSSTableIterator.java @@ -15,10 +15,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.cassandra.db.columniterator; +package org.apache.cassandra.io.sstable.format; +import java.io.Closeable; import java.io.IOException; -import java.util.Comparator; import java.util.Iterator; import java.util.NoSuchElementException; @@ -26,15 +26,12 @@ import org.apache.cassandra.db.*; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.rows.*; -import org.apache.cassandra.io.sstable.IndexInfo; -import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.io.sstable.CorruptSSTableException; import org.apache.cassandra.io.util.FileDataInput; -import org.apache.cassandra.io.util.DataPosition; import org.apache.cassandra.io.util.FileHandle; import org.apache.cassandra.utils.ByteBufferUtil; -public abstract class AbstractSSTableIterator implements UnfilteredRowIterator +public abstract class AbstractSSTableIterator> implements UnfilteredRowIterator { protected final SSTableReader sstable; // We could use sstable.metadata(), but that can change during execution so it's good hygiene to grab an immutable instance @@ -59,7 +56,7 @@ public abstract class AbstractSSTableIterator implements UnfilteredRowIterator protected AbstractSSTableIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key, - RowIndexEntry indexEntry, + E indexEntry, Slices slices, ColumnFilter columnFilter, FileHandle ifile) @@ -176,9 +173,9 @@ private static Row readStaticRow(SSTableReader sstable, } } - protected abstract Reader 
createReaderInternal(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile); + protected abstract Reader createReaderInternal(E indexEntry, FileDataInput file, boolean shouldCloseFile); - private Reader createReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile) + private Reader createReader(E indexEntry, FileDataInput file, boolean shouldCloseFile) { return slices.isEmpty() ? new NoRowsReader(file, shouldCloseFile) : createReaderInternal(indexEntry, file, shouldCloseFile); @@ -255,7 +252,7 @@ private void slice(Slice slice) e.addSuppressed(suppressed); } sstable.markSuspect(); - throw new CorruptSSTableException(e, reader.file.getPath()); + throw new CorruptSSTableException(e, reader.toString()); } } @@ -286,53 +283,19 @@ public void close() catch (IOException e) { sstable.markSuspect(); - throw new CorruptSSTableException(e, reader.file.getPath()); + throw new CorruptSSTableException(e, reader.toString()); } } - protected abstract class Reader implements Iterator + protected abstract class Reader implements Iterator, Closeable { - private final boolean shouldCloseFile; public FileDataInput file; - - protected UnfilteredDeserializer deserializer; - - // Records the currently open range tombstone (if any) - protected DeletionTime openMarker = null; + protected final boolean shouldCloseFile; protected Reader(FileDataInput file, boolean shouldCloseFile) { this.file = file; this.shouldCloseFile = shouldCloseFile; - - if (file != null) - createDeserializer(); - } - - private void createDeserializer() - { - assert file != null && deserializer == null; - deserializer = UnfilteredDeserializer.create(metadata, file, sstable.header, helper); - } - - protected void seekToPosition(long position) throws IOException - { - // This may be the first time we're actually looking into the file - if (file == null) - { - file = sstable.getFileDataInput(position); - createDeserializer(); - } - else - { - file.seek(position); - } - } - - protected 
void updateOpenMarker(RangeTombstoneMarker marker) - { - // Note that we always read index blocks in forward order so this method is always called in forward order - openMarker = marker.isOpen(false) ? marker.openDeletionTime(false) : null; } public boolean hasNext() @@ -352,7 +315,7 @@ public boolean hasNext() e.addSuppressed(suppressed); } sstable.markSuspect(); - throw new CorruptSSTableException(e, reader.file.getPath()); + throw new CorruptSSTableException(e, toString()); } } @@ -373,7 +336,7 @@ public Unfiltered next() e.addSuppressed(suppressed); } sstable.markSuspect(); - throw new CorruptSSTableException(e, reader.file.getPath()); + throw new CorruptSSTableException(e, toString()); } } @@ -381,26 +344,34 @@ public Unfiltered next() public abstract void setForSlice(Slice slice) throws IOException; protected abstract boolean hasNextInternal() throws IOException; + protected abstract Unfiltered nextInternal() throws IOException; + @Override public void close() throws IOException { if (shouldCloseFile && file != null) file.close(); } + + @Override + public String toString() + { + return file != null ? file.getPath() : "null"; + } } // Reader for when we have Slices.NONE but need to read static row or partition level deletion - private class NoRowsReader extends AbstractSSTableIterator.Reader + private class NoRowsReader extends Reader { - private NoRowsReader(FileDataInput file, boolean shouldCloseFile) + public NoRowsReader(FileDataInput file, boolean shouldCloseFile) { super(file, shouldCloseFile); } public void setForSlice(Slice slice) throws IOException { - return; + // no-op } protected boolean hasNextInternal() throws IOException @@ -413,199 +384,4 @@ protected Unfiltered nextInternal() throws IOException throw new NoSuchElementException(); } } - - // Used by indexed readers to store where they are of the index. 
- public static class IndexState implements AutoCloseable - { - private final Reader reader; - private final ClusteringComparator comparator; - - private final RowIndexEntry indexEntry; - private final RowIndexEntry.IndexInfoRetriever indexInfoRetriever; - private final boolean reversed; - - private int currentIndexIdx; - - // Marks the beginning of the block corresponding to currentIndexIdx. - private DataPosition mark; - - public IndexState(Reader reader, ClusteringComparator comparator, RowIndexEntry indexEntry, boolean reversed, FileHandle indexFile) - { - this.reader = reader; - this.comparator = comparator; - this.indexEntry = indexEntry; - this.indexInfoRetriever = indexEntry.openWithIndex(indexFile); - this.reversed = reversed; - this.currentIndexIdx = reversed ? indexEntry.columnsIndexCount() : -1; - } - - public boolean isDone() - { - return reversed ? currentIndexIdx < 0 : currentIndexIdx >= indexEntry.columnsIndexCount(); - } - - // Sets the reader to the beginning of blockIdx. - public void setToBlock(int blockIdx) throws IOException - { - if (blockIdx >= 0 && blockIdx < indexEntry.columnsIndexCount()) - { - reader.seekToPosition(columnOffset(blockIdx)); - mark = reader.file.mark(); - reader.deserializer.clearState(); - } - - currentIndexIdx = blockIdx; - reader.openMarker = blockIdx > 0 ? index(blockIdx - 1).endOpenMarker : null; - } - - private long columnOffset(int i) throws IOException - { - return indexEntry.position + index(i).offset; - } - - public int blocksCount() - { - return indexEntry.columnsIndexCount(); - } - - // Update the block idx based on the current reader position if we're past the current block. - // This only makes sense for forward iteration (for reverse ones, when we reach the end of a block we - // should seek to the previous one, not update the index state and continue). 
- public void updateBlock() throws IOException - { - assert !reversed; - - // If we get here with currentBlockIdx < 0, it means setToBlock() has never been called, so it means - // we're about to read from the beginning of the partition, but haven't "prepared" the IndexState yet. - // Do so by setting us on the first block. - if (currentIndexIdx < 0) - { - setToBlock(0); - return; - } - - while (currentIndexIdx + 1 < indexEntry.columnsIndexCount() && isPastCurrentBlock()) - { - reader.openMarker = currentIndex().endOpenMarker; - ++currentIndexIdx; - - // We have to set the mark, and we have to set it at the beginning of the block. So if we're not at the beginning of the block, this forces us to a weird seek dance. - // This can only happen when reading old file however. - long startOfBlock = columnOffset(currentIndexIdx); - long currentFilePointer = reader.file.getFilePointer(); - if (startOfBlock == currentFilePointer) - { - mark = reader.file.mark(); - } - else - { - reader.seekToPosition(startOfBlock); - mark = reader.file.mark(); - reader.seekToPosition(currentFilePointer); - } - } - } - - // Check if we've crossed an index boundary (based on the mark on the beginning of the index block). - public boolean isPastCurrentBlock() throws IOException - { - assert reader.deserializer != null; - return reader.file.bytesPastMark(mark) >= currentIndex().width; - } - - public int currentBlockIdx() - { - return currentIndexIdx; - } - - public IndexInfo currentIndex() throws IOException - { - return index(currentIndexIdx); - } - - public IndexInfo index(int i) throws IOException - { - return indexInfoRetriever.columnsIndex(i); - } - - // Finds the index of the first block containing the provided bound, starting at the provided index. - // Will be -1 if the bound is before any block, and blocksCount() if it is after every block. 
- public int findBlockIndex(ClusteringBound bound, int fromIdx) throws IOException - { - if (bound.isBottom()) - return -1; - if (bound.isTop()) - return blocksCount(); - - return indexFor(bound, fromIdx); - } - - public int indexFor(ClusteringPrefix name, int lastIndex) throws IOException - { - IndexInfo target = new IndexInfo(name, name, 0, 0, null); - /* - Take the example from the unit test, and say your index looks like this: - [0..5][10..15][20..25] - and you look for the slice [13..17]. - - When doing forward slice, we are doing a binary search comparing 13 (the start of the query) - to the lastName part of the index slot. You'll end up with the "first" slot, going from left to right, - that may contain the start. - - When doing a reverse slice, we do the same thing, only using as a start column the end of the query, - i.e. 17 in this example, compared to the firstName part of the index slots. bsearch will give us the - first slot where firstName > start ([20..25] here), so we subtract an extra one to get the slot just before. - */ - int startIdx = 0; - int endIdx = indexEntry.columnsIndexCount() - 1; - - if (reversed) - { - if (lastIndex < endIdx) - { - endIdx = lastIndex; - } - } - else - { - if (lastIndex > 0) - { - startIdx = lastIndex; - } - } - - int index = binarySearch(target, comparator.indexComparator(reversed), startIdx, endIdx); - return (index < 0 ? -index - (reversed ? 
2 : 1) : index); - } - - private int binarySearch(IndexInfo key, Comparator c, int low, int high) throws IOException - { - while (low <= high) - { - int mid = (low + high) >>> 1; - IndexInfo midVal = index(mid); - int cmp = c.compare(midVal, key); - - if (cmp < 0) - low = mid + 1; - else if (cmp > 0) - high = mid - 1; - else - return mid; - } - return -(low + 1); - } - - @Override - public String toString() - { - return String.format("IndexState(indexSize=%d, currentBlock=%d, reversed=%b)", indexEntry.columnsIndexCount(), currentIndexIdx, reversed); - } - - @Override - public void close() throws IOException - { - indexInfoRetriever.close(); - } - } } diff --git a/src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java b/src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java new file mode 100644 index 000000000000..616db43aec2f --- /dev/null +++ b/src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.io.sstable.format; + +import java.io.Closeable; +import java.io.IOException; +import java.nio.ByteBuffer; + +/** + * Iterator over the partitions of an sstable. 
+ *

    + * The index iterator starts with a key/position ready. advance() should be used to move to the next key; iteration + * completes when key() == null. + */ +public interface PartitionIndexIterator extends Closeable +{ + /** + * Current key + */ + public ByteBuffer key(); + + /** + * Position in the data file where the associated content resides + */ + public long dataPosition(); + + /** + * Moves the iterator forward. Returns false if we reach EOF and there is nothing more to read + */ + public boolean advance() throws IOException; + + /** + * Closes the iterator quietly + */ + public void close(); + + /** + * Returns true if we reach EOF + */ + boolean isExhausted(); + + /** + * Returns the current position in index file (which along with {@link #indexLength()} + * can be used to track iteration progress) + */ + long indexPosition(); + + /** + * Sets the current position in index file + */ + void indexPosition(long position) throws IOException; + + /** + * Returns length of the index file + */ + long indexLength(); + + /** + * Resets the iterator to the initial position + */ + void reset() throws IOException; +} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java b/src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java new file mode 100644 index 000000000000..c00a37ae39bf --- /dev/null +++ b/src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.io.sstable.format; + +import org.apache.cassandra.cache.IMeasurableMemory; +import org.apache.cassandra.db.DeletionTime; + +public abstract class RowIndexEntry implements IMeasurableMemory +{ + public final long position; + + public RowIndexEntry(long position) + { + this.position = position; + } + + /** + * @return true if this index entry contains the row-level tombstone and column summary. Otherwise, + * caller should fetch these from the row header. + */ + public boolean isIndexed() + { + return columnsIndexCount() > 1; + } + + public abstract DeletionTime deletionTime(); + + public int columnsIndexCount() + { + return 0; + } +} diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java index 14f660258fbe..2ecef6025ac3 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java @@ -19,9 +19,6 @@ import com.google.common.base.CharMatcher; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.db.RowIndexEntry; -import org.apache.cassandra.db.SerializationHeader; import org.apache.cassandra.io.sstable.format.big.BigFormat; /** @@ -31,6 +28,7 @@ public interface SSTableFormat { static boolean enableSSTableDevelopmentTestMode = Boolean.getBoolean("cassandra.test.sstableformatdevelopment"); + Type getType(); Version getLatestVersion(); Version getVersion(String version); @@ -38,9 +36,7 @@ public interface 
SSTableFormat SSTableWriter.Factory getWriterFactory(); SSTableReader.Factory getReaderFactory(); - RowIndexEntry.IndexSerializer getIndexSerializer(TableMetadata metadata, Version version, SerializationHeader header); - - public static enum Type + public enum Type { //The original sstable format BIG("big", BigFormat.instance); @@ -53,7 +49,7 @@ public static Type current() return BIG; } - private Type(String name, SSTableFormat info) + Type(String name, SSTableFormat info) { //Since format comes right after generation //we disallow formats with numeric names diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java index 258b004871bf..7aa38482169b 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java @@ -24,6 +24,7 @@ import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Supplier; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Iterables; @@ -58,6 +59,7 @@ import org.apache.cassandra.io.FSWriteError; import org.apache.cassandra.io.compress.CompressionMetadata; import org.apache.cassandra.io.sstable.*; +import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry; import org.apache.cassandra.io.sstable.metadata.*; import org.apache.cassandra.io.util.*; import org.apache.cassandra.metrics.RestorableMeter; @@ -65,6 +67,7 @@ import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableId; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.TableMetadataRef; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.CacheService; @@ -205,9 +208,7 @@ public enum OpenReason protected final 
IFilter bf; public final IndexSummary indexSummary; - protected final RowIndexEntry.IndexSerializer rowIndexEntrySerializer; - - protected InstrumentingCache keyCache; + protected InstrumentingCache keyCache; protected final BloomFilterTracker bloomFilterTracker = new BloomFilterTracker(); @@ -659,7 +660,6 @@ protected SSTableReader(final Descriptor desc, this.bf = bf; this.maxDataAge = maxDataAge; this.openReason = openReason; - this.rowIndexEntrySerializer = descriptor.version.getSSTableFormat().getIndexSerializer(metadata.get(), desc.version, header); tidy = new InstanceTidier(descriptor, metadata.id); selfRef = new Ref<>(this, tidy); } @@ -681,6 +681,8 @@ public static long getTotalUncompressedBytes(Iterable sstables) return sum; } + public abstract PartitionIndexIterator allKeysIterator() throws IOException; + public boolean equals(Object that) { return that instanceof SSTableReader && ((SSTableReader) that).descriptor.equals(this.descriptor); @@ -701,7 +703,7 @@ public void setupOnline() // under normal operation we can do this at any time, but SSTR is also used outside C* proper, // e.g. by BulkLoader, which does not initialize the cache. As a kludge, we set up the cache // here when we know we're being wired into the rest of the server infrastructure. - InstrumentingCache maybeKeyCache = CacheService.instance.keyCache; + InstrumentingCache maybeKeyCache = CacheService.instance.keyCache; if (maybeKeyCache.getCapacity() > 0) keyCache = maybeKeyCache; @@ -995,25 +997,13 @@ else if (samplingLevel < indexSummary.getSamplingLevel()) private IndexSummary buildSummaryAtLevel(int newSamplingLevel) throws IOException { // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary. 
- RandomAccessReader primaryIndex = RandomAccessReader.open(new File(descriptor.filenameFor(Component.PRIMARY_INDEX))); - try + try (KeyIterator iterator = KeyIterator.forSSTable(this); + IndexSummaryBuilder summaryBuilder = new IndexSummaryBuilder(estimatedKeys(), metadata().params.minIndexInterval, newSamplingLevel)) { - long indexSize = primaryIndex.length(); - try (IndexSummaryBuilder summaryBuilder = new IndexSummaryBuilder(estimatedKeys(), metadata().params.minIndexInterval, newSamplingLevel)) - { - long indexPosition; - while ((indexPosition = primaryIndex.getFilePointer()) != indexSize) - { - summaryBuilder.maybeAddEntry(decorateKey(ByteBufferUtil.readWithShortLength(primaryIndex)), indexPosition); - RowIndexEntry.Serializer.skip(primaryIndex, descriptor.version); - } + while (iterator.hasNext()) + summaryBuilder.maybeAddEntry(iterator.next(), iterator.getKeyPosition()); - return summaryBuilder.build(getPartitioner()); - } - } - finally - { - FileUtils.closeQuietly(primaryIndex); + return summaryBuilder.build(getPartitioner()); } } @@ -1314,7 +1304,7 @@ public KeyCacheKey getCacheKey(DecoratedKey key) return new KeyCacheKey(metadata(), descriptor, key.getKey()); } - public void cacheKey(DecoratedKey key, RowIndexEntry info) + public void cacheKey(DecoratedKey key, BigTableRowIndexEntry info) { CachingParams caching = metadata().params.caching; @@ -1326,20 +1316,20 @@ public void cacheKey(DecoratedKey key, RowIndexEntry info) keyCache.put(cacheKey, info); } - public RowIndexEntry getCachedPosition(DecoratedKey key, boolean updateStats) + public BigTableRowIndexEntry getCachedPosition(DecoratedKey key, boolean updateStats) { if (isKeyCacheEnabled()) return getCachedPosition(new KeyCacheKey(metadata(), descriptor, key.getKey()), updateStats); return null; } - protected RowIndexEntry getCachedPosition(KeyCacheKey unifiedKey, boolean updateStats) + protected BigTableRowIndexEntry getCachedPosition(KeyCacheKey unifiedKey, boolean updateStats) { if 
(isKeyCacheEnabled()) { if (updateStats) { - RowIndexEntry cachedEntry = keyCache.get(unifiedKey); + BigTableRowIndexEntry cachedEntry = keyCache.get(unifiedKey); keyCacheRequest.incrementAndGet(); if (cachedEntry != null) { @@ -1367,28 +1357,16 @@ public boolean isKeyCacheEnabled() * allow key selection by token bounds but only if op != * EQ * @param op The Operator defining matching keys: the nearest key to the target matching the operator wins. */ - public final RowIndexEntry getPosition(PartitionPosition key, Operator op) + public final RowIndexEntry getPosition(PartitionPosition key, Operator op) { - return getPosition(key, op, SSTableReadsListener.NOOP_LISTENER); + return getPosition(key, op, true, false, SSTableReadsListener.NOOP_LISTENER); } - /** - * Retrieves the position while updating the key cache and the stats. - * @param key The key to apply as the rhs to the given Operator. A 'fake' key is allowed to - * allow key selection by token bounds but only if op != * EQ - * @param op The Operator defining matching keys: the nearest key to the target matching the operator wins. - * @param listener the {@code SSTableReaderListener} that must handle the notifications. 
- */ - public final RowIndexEntry getPosition(PartitionPosition key, Operator op, SSTableReadsListener listener) + public final boolean checkEntryExists(PartitionPosition key, + Operator op, + boolean updateCacheAndStats) { - return getPosition(key, op, true, false, listener); - } - - public final RowIndexEntry getPosition(PartitionPosition key, - Operator op, - boolean updateCacheAndStats) - { - return getPosition(key, op, updateCacheAndStats, false, SSTableReadsListener.NOOP_LISTENER); + return getPosition(key, op, updateCacheAndStats, false, SSTableReadsListener.NOOP_LISTENER) != null; } /** @@ -1399,11 +1377,11 @@ public final RowIndexEntry getPosition(PartitionPosition key, * @param listener a listener used to handle internal events * @return The index entry corresponding to the key, or null if the key is not present */ - protected abstract RowIndexEntry getPosition(PartitionPosition key, - Operator op, - boolean updateCacheAndStats, - boolean permitMatchPastLast, - SSTableReadsListener listener); + protected abstract RowIndexEntry getPosition(PartitionPosition key, + Operator op, + boolean updateCacheAndStats, + boolean permitMatchPastLast, + SSTableReadsListener listener); public abstract UnfilteredRowIterator iterator(DecoratedKey key, Slices slices, @@ -1411,9 +1389,7 @@ public abstract UnfilteredRowIterator iterator(DecoratedKey key, boolean reversed, SSTableReadsListener listener); - public abstract UnfilteredRowIterator iterator(FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry, Slices slices, ColumnFilter selectedColumns, boolean reversed); - - public abstract UnfilteredRowIterator simpleIterator(FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry, boolean tombstoneOnly); + public abstract UnfilteredRowIterator simpleIterator(Supplier dfile, DecoratedKey key, boolean tombstoneOnly); /** * Finds and returns the first key beyond a given token in this SSTable or null if no such key exists. 
@@ -1428,24 +1404,22 @@ public DecoratedKey firstKeyBeyond(PartitionPosition token) if (ifile == null) return null; - String path = null; - try (FileDataInput in = ifile.createReader(sampledPosition)) + try (PartitionIndexIterator iterator = allKeysIterator()) { - path = in.getPath(); - while (!in.isEOF()) + iterator.indexPosition(sampledPosition); + KeyIterator keyIterator = new KeyIterator(iterator, getPartitioner()); + + while (keyIterator.hasNext()) { - ByteBuffer indexKey = ByteBufferUtil.readWithShortLength(in); - DecoratedKey indexDecoratedKey = decorateKey(indexKey); + DecoratedKey indexDecoratedKey = keyIterator.next(); if (indexDecoratedKey.compareTo(token) > 0) return indexDecoratedKey; - - RowIndexEntry.Serializer.skip(in, descriptor.version); } } catch (IOException e) { markSuspect(); - throw new CorruptSSTableException(e, path); + throw new CorruptSSTableException(e, ifile.path()); } return null; @@ -1618,25 +1592,7 @@ public boolean isRepaired() return sstableMetadata.repairedAt != ActiveRepairService.UNREPAIRED_SSTABLE; } - public DecoratedKey keyAt(long indexPosition) throws IOException - { - DecoratedKey key; - try (FileDataInput in = ifile.createReader(indexPosition)) - { - if (in.isEOF()) - return null; - - key = decorateKey(ByteBufferUtil.readWithShortLength(in)); - - // hint read path about key location if caching is enabled - // this saves index summary lookup and index file iteration which whould be pretty costly - // especially in presence of promoted column indexes - if (isKeyCacheEnabled()) - cacheKey(key, rowIndexEntrySerializer.deserialize(in)); - } - - return key; - } + public abstract DecoratedKey keyAt(long indexPosition) throws IOException; public boolean isPendingRepair() { @@ -1725,7 +1681,7 @@ public long getRecentBloomFilterTrueNegativeCount() return bloomFilterTracker.getRecentTrueNegativeCount(); } - public InstrumentingCache getKeyCache() + public InstrumentingCache getKeyCache() { return keyCache; } @@ -1954,7 +1910,7 @@ 
public Ref ref() return selfRef.ref(); } - void setup(boolean trackHotness) + protected void setup(boolean trackHotness) { tidy.setup(this, trackHotness); this.readMeter = tidy.global.readMeter; @@ -2200,9 +2156,11 @@ public static void resetTidying() GlobalTidy.lookup.clear(); } - public static abstract class Factory + public interface Factory { - public abstract SSTableReader open(SSTableReaderBuilder builder); + SSTableReader open(SSTableReaderBuilder builder); + + PartitionIndexIterator indexIterator(Descriptor descriptor, TableMetadata metadata); } public static class PartitionPositionBounds diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java index e5abcf834e48..24edf70d1c24 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java @@ -22,7 +22,6 @@ import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.db.RowIndexEntry; import org.apache.cassandra.db.SerializationHeader; import org.apache.cassandra.io.sstable.*; import org.apache.cassandra.io.sstable.metadata.MetadataType; @@ -42,7 +41,6 @@ import java.io.DataInputStream; import java.io.File; import java.io.IOException; -import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -93,6 +91,20 @@ public SSTableReaderBuilder(Descriptor descriptor, public abstract SSTableReader build(); + public static FileHandle.Builder defaultIndexHandleBuilder(Descriptor descriptor) + { + return new FileHandle.Builder(descriptor.filenameFor(Component.PRIMARY_INDEX)) + .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap) + .withChunkCache(ChunkCache.instance); + } + + public static FileHandle.Builder 
defaultDataHandleBuilder(Descriptor descriptor) + { + return new FileHandle.Builder(descriptor.filenameFor(Component.DATA)) + .mmapped(DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap) + .withChunkCache(ChunkCache.instance); + } + /** * Load index summary, first key and last key from Summary.db file if it exists. * @@ -151,47 +163,39 @@ void buildSummaryAndBloomFilter(boolean recreateBloomFilter, if (!components.contains(Component.PRIMARY_INDEX)) return; + if (!recreateBloomFilter && summaryLoaded) + return; + if (logger.isDebugEnabled()) logger.debug("Attempting to build summary for {}", descriptor); - - // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary. - try (RandomAccessReader primaryIndex = RandomAccessReader.open(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)))) - { - long indexSize = primaryIndex.length(); + try (PartitionIndexIterator indexIterator = readerFactory.indexIterator(descriptor, metadata)) { long histogramCount = statsMetadata.estimatedPartitionSize.count(); long estimatedKeys = histogramCount > 0 && !statsMetadata.estimatedPartitionSize.isOverflowed() ? histogramCount - : SSTable.estimateRowsFromIndex(primaryIndex, descriptor); // statistics is supposed to be optional - + : SSTable.estimateRowsFromIndex(indexIterator); // statistics is supposed to be optional if (recreateBloomFilter) bf = FilterFactory.getFilter(estimatedKeys, metadata.params.bloomFilterFpChance); - try (IndexSummaryBuilder summaryBuilder = summaryLoaded ? null : new IndexSummaryBuilder(estimatedKeys, metadata.params.minIndexInterval, Downsampling.BASE_SAMPLING_LEVEL)) + // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary. + try (KeyIterator keyIterator = new KeyIterator(indexIterator, metadata.partitioner); + IndexSummaryBuilder summaryBuilder = summaryLoaded ? 
null : new IndexSummaryBuilder(estimatedKeys, metadata.params.minIndexInterval, Downsampling.BASE_SAMPLING_LEVEL)) { - long indexPosition; - - while ((indexPosition = primaryIndex.getFilePointer()) != indexSize) + while (keyIterator.hasNext()) { - ByteBuffer key = ByteBufferUtil.readWithShortLength(primaryIndex); - RowIndexEntry.Serializer.skip(primaryIndex, descriptor.version); - DecoratedKey decoratedKey = metadata.partitioner.decorateKey(key); + DecoratedKey decoratedKey = keyIterator.next(); if (!summaryLoaded) { if (first == null) first = decoratedKey; last = decoratedKey; + + summaryBuilder.maybeAddEntry(decoratedKey, keyIterator.getKeyPosition()); } if (recreateBloomFilter) bf.add(decoratedKey); - - // if summary was already read from disk we don't want to re-populate it using primary index - if (!summaryLoaded) - { - summaryBuilder.maybeAddEntry(decoratedKey, indexPosition); - } } if (!summaryLoaded) @@ -300,12 +304,8 @@ public SSTableReader build() initSummary(dataFilePath, components, statsMetadata); boolean compression = components.contains(Component.COMPRESSION_INFO); - try (FileHandle.Builder ibuilder = new FileHandle.Builder(descriptor.filenameFor(Component.PRIMARY_INDEX)) - .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap) - .withChunkCache(ChunkCache.instance); - FileHandle.Builder dbuilder = new FileHandle.Builder(descriptor.filenameFor(Component.DATA)).compressed(compression) - .mmapped(DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap) - .withChunkCache(ChunkCache.instance)) + try (FileHandle.Builder ibuilder = defaultIndexHandleBuilder(descriptor); + FileHandle.Builder dbuilder = defaultDataHandleBuilder(descriptor).compressed(compression)) { long indexFileLength = new File(descriptor.filenameFor(Component.PRIMARY_INDEX)).length(); DiskOptimizationStrategy optimizationStrategy = DatabaseDescriptor.getDiskOptimizationStrategy(); @@ -443,12 +443,9 @@ void load(boolean recreateBloomFilter, 
StatsMetadata statsMetadata, Set components) throws IOException { - try(FileHandle.Builder ibuilder = new FileHandle.Builder(descriptor.filenameFor(Component.PRIMARY_INDEX)) - .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap) - .withChunkCache(ChunkCache.instance); - FileHandle.Builder dbuilder = new FileHandle.Builder(descriptor.filenameFor(Component.DATA)).compressed(components.contains(Component.COMPRESSION_INFO)) - .mmapped(DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap) - .withChunkCache(ChunkCache.instance)) + boolean compression = components.contains(Component.COMPRESSION_INFO); + try (FileHandle.Builder ibuilder = defaultIndexHandleBuilder(descriptor); + FileHandle.Builder dbuilder = defaultDataHandleBuilder(descriptor).compressed(compression)) { loadSummary(); boolean buildSummary = summary == null || recreateBloomFilter; diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java index 6d384bfb7233..0b34fa4b8d32 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java @@ -17,7 +17,7 @@ */ package org.apache.cassandra.io.sstable.format; -import org.apache.cassandra.db.RowIndexEntry; +import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry; /** * Listener for receiving notifications associated with reading SSTables. 
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java index 43c50c56141a..52667bfe78ed 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java @@ -25,7 +25,6 @@ import com.google.common.collect.Sets; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.db.RowIndexEntry; import org.apache.cassandra.db.SerializationHeader; import org.apache.cassandra.db.compaction.OperationType; import org.apache.cassandra.db.lifecycle.LifecycleNewTracker; @@ -35,6 +34,7 @@ import org.apache.cassandra.io.sstable.Component; import org.apache.cassandra.io.sstable.Descriptor; import org.apache.cassandra.io.sstable.SSTable; +import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry; import org.apache.cassandra.io.sstable.metadata.MetadataCollector; import org.apache.cassandra.io.sstable.metadata.MetadataComponent; import org.apache.cassandra.io.sstable.metadata.MetadataType; @@ -59,7 +59,6 @@ public abstract class SSTableWriter extends SSTable implements Transactional protected long maxDataAge = -1; protected final long keyCount; protected final MetadataCollector metadataCollector; - protected final RowIndexEntry.IndexSerializer rowIndexEntrySerializer; protected final SerializationHeader header; protected final TransactionalProxy txnProxy = txnProxy(); protected final Collection observers; @@ -91,7 +90,6 @@ protected SSTableWriter(Descriptor descriptor, this.isTransient = isTransient; this.metadataCollector = metadataCollector; this.header = header; - this.rowIndexEntrySerializer = descriptor.version.getSSTableFormat().getIndexSerializer(metadata.get(), descriptor.version, header); this.observers = observers == null ? 
Collections.emptySet() : observers; } @@ -209,7 +207,7 @@ private static Collection observers(Descriptor descriptor, * * @throws FSWriteError if a write to the dataFile fails */ - public abstract RowIndexEntry append(UnfilteredRowIterator iterator); + public abstract BigTableRowIndexEntry append(UnfilteredRowIterator iterator); public abstract long getFilePointer(); diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java new file mode 100644 index 000000000000..4ff6a727c12b --- /dev/null +++ b/src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.io.sstable.format.big; + +import java.io.IOException; + +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.Slices; +import org.apache.cassandra.db.UnfilteredDeserializer; +import org.apache.cassandra.io.sstable.format.AbstractSSTableIterator; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.rows.RangeTombstoneMarker; +import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.FileDataInput; +import org.apache.cassandra.io.util.FileHandle; + +public abstract class AbstractBigTableIterator extends AbstractSSTableIterator +{ + protected AbstractBigTableIterator(SSTableReader sstable, + FileDataInput file, + DecoratedKey key, + BigTableRowIndexEntry indexEntry, + Slices slices, + ColumnFilter columnFilter, + FileHandle ifile) + { + super(sstable, file, key, indexEntry, slices, columnFilter, ifile); + } + + protected abstract class RowReader extends Reader { + protected UnfilteredDeserializer deserializer; + + // Records the currently open range tombstone (if any) + protected DeletionTime openMarker; + + protected RowReader(FileDataInput file, boolean shouldCloseFile) + { + super(file, shouldCloseFile); + + if (file != null) + createDeserializer(); + } + + private void createDeserializer() + { + assert file != null && deserializer == null; + deserializer = UnfilteredDeserializer.create(metadata, file, sstable.header, helper); + } + + public void seekToPosition(long position) throws IOException + { + // This may be the first time we're actually looking into the file + if (file == null) + { + file = sstable.getFileDataInput(position); + createDeserializer(); + } + else + { + file.seek(position); + } + } + + protected void updateOpenMarker(RangeTombstoneMarker marker) + { + // Note that we always read index blocks in forward order so this method is always called in forward order + openMarker 
= marker.isOpen(false) ? marker.openDeletionTime(false) : null; + } + } +} diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java index ff0d7916672c..9c8b161386a9 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java +++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java @@ -17,13 +17,15 @@ */ package org.apache.cassandra.io.sstable.format.big; +import java.io.IOException; import java.util.Collection; import java.util.UUID; import org.apache.cassandra.io.sstable.SSTable; +import org.apache.cassandra.io.sstable.metadata.MetadataType; +import org.apache.cassandra.io.util.FileHandle; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.schema.TableMetadataRef; -import org.apache.cassandra.db.RowIndexEntry; import org.apache.cassandra.db.SerializationHeader; import org.apache.cassandra.db.lifecycle.LifecycleNewTracker; import org.apache.cassandra.io.sstable.Descriptor; @@ -46,6 +48,12 @@ private BigFormat() } + @Override + public Type getType() + { + return Type.BIG; + } + @Override public Version getLatestVersion() { @@ -70,12 +78,6 @@ public SSTableReader.Factory getReaderFactory() return readerFactory; } - @Override - public RowIndexEntry.IndexSerializer getIndexSerializer(TableMetadata metadata, Version version, SerializationHeader header) - { - return new RowIndexEntry.Serializer(version, header); - } - static class WriterFactory extends SSTableWriter.Factory { @Override @@ -95,13 +97,30 @@ public SSTableWriter open(Descriptor descriptor, } } - static class ReaderFactory extends SSTableReader.Factory + static class ReaderFactory implements SSTableReader.Factory { @Override public SSTableReader open(SSTableReaderBuilder builder) { return new BigTableReader(builder); } + + @Override + public PartitionIndexIterator indexIterator(Descriptor descriptor, TableMetadata metadata) + { + try (FileHandle iFile = 
SSTableReaderBuilder.defaultIndexHandleBuilder(descriptor).complete()) { + SerializationHeader.Component headerComponent = (SerializationHeader.Component) + descriptor.getMetadataSerializer() + .deserialize(descriptor, MetadataType.HEADER); + SerializationHeader header = headerComponent.toHeader(metadata); + BigTableRowIndexEntry.Serializer serializer = new BigTableRowIndexEntry.Serializer(descriptor.version, header); + return BigTablePartitionIndexIterator.create(iFile, serializer); + } + catch (IOException ex) + { + throw new RuntimeException(ex); + } + } } // versions are denoted as [major][minor]. Minor versions must be forward-compatible: diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java new file mode 100644 index 000000000000..846f00809cae --- /dev/null +++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.io.sstable.format.big; + +import java.io.IOException; +import java.nio.ByteBuffer; +import javax.annotation.concurrent.NotThreadSafe; + +import org.apache.cassandra.io.sstable.format.PartitionIndexIterator; +import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry.IndexSerializer; +import org.apache.cassandra.io.util.FileHandle; +import org.apache.cassandra.io.util.FileUtils; +import org.apache.cassandra.io.util.RandomAccessReader; +import org.apache.cassandra.utils.ByteBufferUtil; + +@NotThreadSafe +public class BigTablePartitionIndexIterator implements PartitionIndexIterator +{ + private final FileHandle indexFile; + private final RandomAccessReader reader; + private final IndexSerializer rowIndexEntrySerializer; + private final long initialPosition; + + private ByteBuffer key; + private long dataPosition; + + private BigTablePartitionIndexIterator(FileHandle indexFile, + RandomAccessReader reader, + IndexSerializer rowIndexEntrySerializer) + { + this.indexFile = indexFile; + this.reader = reader; + this.rowIndexEntrySerializer = rowIndexEntrySerializer; + this.initialPosition = reader.getFilePointer(); + } + + public static BigTablePartitionIndexIterator create(RandomAccessReader reader, IndexSerializer serializer) + throws IOException + { + BigTablePartitionIndexIterator iterator = new BigTablePartitionIndexIterator(null, reader, serializer); + try + { + iterator.advance(); + return iterator; + } + catch (IOException | RuntimeException ex) + { + iterator.close(); + throw ex; + } + } + + @SuppressWarnings({ "resource" }) + public static BigTablePartitionIndexIterator create(FileHandle indexFile, IndexSerializer serializer) + throws IOException + { + FileHandle iFile = null; + RandomAccessReader reader = null; + BigTablePartitionIndexIterator iterator = null; + try + { + iFile = indexFile.sharedCopy(); + reader = iFile.createReader(); + iterator = new BigTablePartitionIndexIterator(iFile, reader, serializer); + 
iterator.advance(); + return iterator; + } + catch (IOException | RuntimeException ex) + { + if (iterator != null) + { + iterator.close(); + } + else + { + FileUtils.closeQuietly(reader); + FileUtils.closeQuietly(iFile); + } + throw ex; + } + } + + @Override + public void close() + { + key = null; + dataPosition = -1; + FileUtils.closeQuietly(reader); + FileUtils.closeQuietly(indexFile); + } + + @Override + public boolean advance() throws IOException + { + if (!reader.isEOF()) + { + key = ByteBufferUtil.readWithShortLength(reader); + dataPosition = rowIndexEntrySerializer.deserializePositionAndSkip(reader); + return true; + } + else + { + dataPosition = -1; + key = null; + return false; + } + } + + @Override + public boolean isExhausted() + { + return key == null && dataPosition < 0; + } + + @Override + public ByteBuffer key() + { + return key; + } + + @Override + public long dataPosition() + { + return dataPosition; + } + + @Override + public long indexPosition() + { + return reader.getFilePointer(); + } + + @Override + public void indexPosition(long position) throws IOException + { + if (position > indexLength()) + throw new IndexOutOfBoundsException("The requested position exceeds the index length"); + reader.seek(position); + key = null; + dataPosition = 0; + advance(); + } + + @Override + public long indexLength() + { + return reader.length(); + } + + @Override + public void reset() throws IOException + { + reader.seek(initialPosition); + key = null; + dataPosition = 0; + advance(); + } +} diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java index f60c9dfee473..708488873098 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java +++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java @@ -20,14 +20,14 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.*; +import 
java.util.function.Supplier; +import org.apache.cassandra.io.sstable.format.PartitionIndexIterator; import org.apache.cassandra.io.sstable.format.SSTableReaderBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.cassandra.db.*; -import org.apache.cassandra.db.columniterator.SSTableIterator; -import org.apache.cassandra.db.columniterator.SSTableReversedIterator; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.rows.Rows; import org.apache.cassandra.db.rows.UnfilteredRowIterator; @@ -52,9 +52,18 @@ public class BigTableReader extends SSTableReader { private static final Logger logger = LoggerFactory.getLogger(BigTableReader.class); + protected final BigTableRowIndexEntry.IndexSerializer rowIndexEntrySerializer; + BigTableReader(SSTableReaderBuilder builder) { super(builder); + this.rowIndexEntrySerializer = new BigTableRowIndexEntry.Serializer(descriptor.version, header); + } + + @Override + public PartitionIndexIterator allKeysIterator() throws IOException + { + return BigTablePartitionIndexIterator.create(getIndexFile(), rowIndexEntrySerializer); } public UnfilteredRowIterator iterator(DecoratedKey key, @@ -63,12 +72,12 @@ public UnfilteredRowIterator iterator(DecoratedKey key, boolean reversed, SSTableReadsListener listener) { - RowIndexEntry rie = getPosition(key, SSTableReader.Operator.EQ, listener); + BigTableRowIndexEntry rie = getPosition(key, SSTableReader.Operator.EQ, true, false, listener); return iterator(null, key, rie, slices, selectedColumns, reversed); } @SuppressWarnings("resource") - public UnfilteredRowIterator iterator(FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry, Slices slices, ColumnFilter selectedColumns, boolean reversed) + public UnfilteredRowIterator iterator(FileDataInput file, DecoratedKey key, BigTableRowIndexEntry indexEntry, Slices slices, ColumnFilter selectedColumns, boolean reversed) { if (indexEntry == null) return 
UnfilteredRowIterators.noRowsIterator(metadata(), key, Rows.EMPTY_STATIC_ROW, DeletionTime.LIVE, reversed); @@ -121,9 +130,12 @@ public ISSTableScanner getScanner(Collection> ranges) @SuppressWarnings("resource") // caller to close @Override - public UnfilteredRowIterator simpleIterator(FileDataInput dfile, DecoratedKey key, RowIndexEntry position, boolean tombstoneOnly) + public UnfilteredRowIterator simpleIterator(Supplier dfile, DecoratedKey key, boolean tombstoneOnly) { - return SSTableIdentityIterator.create(this, dfile, position, key, tombstoneOnly); + BigTableRowIndexEntry position = getPosition(key, SSTableReader.Operator.EQ, true, false, SSTableReadsListener.NOOP_LISTENER); + if (position == null) + return null; + return SSTableIdentityIterator.create(this, dfile.get(), position, key, tombstoneOnly); } /** @@ -133,11 +145,11 @@ public UnfilteredRowIterator simpleIterator(FileDataInput dfile, DecoratedKey ke * @param updateCacheAndStats true if updating stats and cache * @return The index entry corresponding to the key, or null if the key is not present */ - protected RowIndexEntry getPosition(PartitionPosition key, - Operator op, - boolean updateCacheAndStats, - boolean permitMatchPastLast, - SSTableReadsListener listener) + protected BigTableRowIndexEntry getPosition(PartitionPosition key, + Operator op, + boolean updateCacheAndStats, + boolean permitMatchPastLast, + SSTableReadsListener listener) { if (op == Operator.EQ) { @@ -155,7 +167,7 @@ protected RowIndexEntry getPosition(PartitionPosition key, if ((op == Operator.EQ || op == Operator.GE) && (key instanceof DecoratedKey)) { DecoratedKey decoratedKey = (DecoratedKey) key; - RowIndexEntry cachedPosition = getCachedPosition(decoratedKey, updateCacheAndStats); + BigTableRowIndexEntry cachedPosition = getCachedPosition(decoratedKey, updateCacheAndStats); if (cachedPosition != null) { listener.onSSTableSelected(this, cachedPosition, SelectionReason.KEY_CACHE_HIT); @@ -244,7 +256,7 @@ protected 
RowIndexEntry getPosition(PartitionPosition key, if (opSatisfied) { // read data position from index entry - RowIndexEntry indexEntry = rowIndexEntrySerializer.deserialize(in); + BigTableRowIndexEntry indexEntry = rowIndexEntrySerializer.deserialize(in); if (exactMatch && updateCacheAndStats) { assert key instanceof DecoratedKey; // key can be == to the index key only if it's a true row key @@ -271,7 +283,7 @@ protected RowIndexEntry getPosition(PartitionPosition key, return indexEntry; } - RowIndexEntry.Serializer.skip(in, descriptor.version); + BigTableRowIndexEntry.Serializer.skip(in, descriptor.version); } } catch (IOException e) @@ -288,4 +300,24 @@ protected RowIndexEntry getPosition(PartitionPosition key, } + @Override + public DecoratedKey keyAt(long indexPosition) throws IOException + { + DecoratedKey key; + try (FileDataInput in = ifile.createReader(indexPosition)) + { + if (in.isEOF()) + return null; + + key = decorateKey(ByteBufferUtil.readWithShortLength(in)); + + // hint read path about key location if caching is enabled + // this saves index summary lookup and index file iteration which would be pretty costly + // especially in presence of promoted column indexes + if (isKeyCacheEnabled()) + cacheKey(key, rowIndexEntrySerializer.deserialize(in)); + } + + return key; + } } diff --git a/src/java/org/apache/cassandra/db/RowIndexEntry.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntry.java similarity index 91% rename from src/java/org/apache/cassandra/db/RowIndexEntry.java rename to src/java/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntry.java index 215768bc8d3f..dc8e91ec93e3 100644 --- a/src/java/org/apache/cassandra/db/RowIndexEntry.java +++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntry.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License.
*/ -package org.apache.cassandra.db; +package org.apache.cassandra.io.sstable.format.big; import java.io.IOException; import java.nio.ByteBuffer; @@ -24,8 +24,11 @@ import com.codahale.metrics.Histogram; import org.apache.cassandra.cache.IMeasurableMemory; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.DeletionTime; +import org.apache.cassandra.db.SerializationHeader; +import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.ISerializer; -import org.apache.cassandra.io.sstable.IndexInfo; +import org.apache.cassandra.io.sstable.format.RowIndexEntry; import org.apache.cassandra.io.sstable.format.Version; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputBuffer; @@ -103,7 +106,7 @@ * This results in these classes: *

    *