From aa77e23f17afcdbfc778ca04082afc49d685e1ae Mon Sep 17 00:00:00 2001 From: Aleksey Yeshchenko Date: Mon, 30 Mar 2026 17:18:18 +0100 Subject: [PATCH] CEP-45: Topology Change Support for Mutation Tracking patch by Aleksey Yeschenko; reviewed by Blake Eggleston for CASSANDRA-20386 --- .../statements/ModificationStatement.java | 11 +- .../apache/cassandra/db/CounterMutation.java | 1 + .../apache/cassandra/db/SystemKeyspace.java | 8 +- .../db/streaming/CassandraStreamManager.java | 43 +- .../db/virtual/MutationTrackingTables.java | 50 +- src/java/org/apache/cassandra/dht/Range.java | 2 +- src/java/org/apache/cassandra/net/Verb.java | 32 +- .../apache/cassandra/repair/SyncTasks.java | 8 +- .../replication/ActivationRequest.java | 8 +- .../replication/BroadcastLogOffsets.java | 42 +- .../replication/CoordinatedTransfer.java | 9 +- .../cassandra/replication/CoordinatorLog.java | 166 +-- .../replication/CoordinatorLogId.java | 17 +- .../cassandra/replication/ForwardedWrite.java | 21 +- .../cassandra/replication/Log2OffsetsMap.java | 37 + .../replication/MutationTrackingService.java | 677 +++++---- .../apache/cassandra/replication/Offsets.java | 47 +- .../cassandra/replication/Participants.java | 41 +- .../ReconciledKeyspaceOffsets.java | 263 ---- .../replication/ReconciledLogSnapshot.java | 206 --- .../replication/SealingCoordinator.java | 1294 +++++++++++++++++ .../apache/cassandra/replication/Shard.java | 238 ++- .../replication/ShardIntervalBTree.java | 337 +++++ .../cassandra/replication/ShardMetadata.java | 107 ++ .../replication/ShardMetadataRequest.java | 145 ++ .../replication/ShardMetadataResponse.java | 62 + .../replication/ShortMutationId.java | 5 - .../replication/TrackedImportTransfer.java | 8 +- .../replication/TrackedImportTransfers.java | 2 +- .../replication/TrackedRepairTransfer.java | 8 +- .../replication/TrackedWriteRequest.java | 49 +- ...Failed.java => TransferFailedRequest.java} | 14 +- .../replication/TransferFailedResponse.java | 59 + 
.../replication/TransferTrackingService.java | 9 +- .../replication/UnknownShardException.java | 10 +- .../cassandra/service/StorageProxy.java | 26 +- .../cassandra/service/paxos/PaxosCommit.java | 16 +- .../service/paxos/PaxosPrepareRefresh.java | 4 + .../paxos/PrepareRefreshForwardHandler.java | 4 + .../cassandra/streaming/LogReceiveTask.java | 52 - .../cassandra/streaming/LogStreamHeader.java | 124 -- .../streaming/LogStreamManifest.java | 157 -- .../cassandra/streaming/LogStreamTask.java | 82 -- .../cassandra/streaming/LogTransferTask.java | 113 -- .../cassandra/streaming/StreamPlan.java | 27 +- .../cassandra/streaming/StreamSession.java | 223 +-- .../streaming/TableStreamManager.java | 4 +- .../async/StreamingMultiplexedChannel.java | 13 +- .../IncomingMutationLogStreamMessage.java | 119 -- .../messages/MutationLogReceivedMessage.java | 61 - .../messages/MutationLogStreamMessage.java | 37 - .../OutgoingMutationLogStreamMessage.java | 141 -- .../messages/PrepareSynAckMessage.java | 27 - .../streaming/messages/PrepareSynMessage.java | 42 - .../streaming/messages/StreamMessage.java | 4 +- .../org/apache/cassandra/tcm/Startup.java | 8 +- .../tcm/sequences/BootstrapAndJoin.java | 15 + .../tcm/sequences/BootstrapAndReplace.java | 8 + .../tcm/sequences/SingleNodeSequences.java | 9 + .../tcm/sequences/UnbootstrapAndLeave.java | 14 +- .../tcm/sequences/UnbootstrapStreams.java | 16 +- .../tcm/transformations/Assassinate.java | 5 + .../test/TrackedBootstrapTest.java | 198 +++ .../test/TrackedDecommissionTest.java | 195 +++ .../test/TrackedReplacementTest.java | 228 +++ .../TrackedHostReplacementTest.java | 376 ----- .../streaming/CassandraStreamManagerTest.java | 5 +- ...StreamConcurrentComponentMutationTest.java | 2 +- .../MutationTrackingShardsTableTest.java | 6 +- .../ActivationRequestSerializationTest.java | 4 +- .../CoordinatorLogOffsetsTest.java | 25 +- .../replication/CoordinatorLogTest.java | 95 +- .../replication/ShardIntervalBTreeTest.java | 598 ++++++++ 
.../cassandra/replication/ShardTest.java | 10 +- ...SSTableStreamingCorrectFilesCountTest.java | 3 +- .../StreamSessionOwnedRangesTest.java | 4 +- 76 files changed, 4261 insertions(+), 2875 deletions(-) delete mode 100644 src/java/org/apache/cassandra/replication/ReconciledKeyspaceOffsets.java delete mode 100644 src/java/org/apache/cassandra/replication/ReconciledLogSnapshot.java create mode 100644 src/java/org/apache/cassandra/replication/SealingCoordinator.java create mode 100644 src/java/org/apache/cassandra/replication/ShardIntervalBTree.java create mode 100644 src/java/org/apache/cassandra/replication/ShardMetadata.java create mode 100644 src/java/org/apache/cassandra/replication/ShardMetadataRequest.java create mode 100644 src/java/org/apache/cassandra/replication/ShardMetadataResponse.java rename src/java/org/apache/cassandra/replication/{TransferFailed.java => TransferFailedRequest.java} (74%) create mode 100644 src/java/org/apache/cassandra/replication/TransferFailedResponse.java delete mode 100644 src/java/org/apache/cassandra/streaming/LogReceiveTask.java delete mode 100644 src/java/org/apache/cassandra/streaming/LogStreamHeader.java delete mode 100644 src/java/org/apache/cassandra/streaming/LogStreamManifest.java delete mode 100644 src/java/org/apache/cassandra/streaming/LogStreamTask.java delete mode 100644 src/java/org/apache/cassandra/streaming/LogTransferTask.java delete mode 100644 src/java/org/apache/cassandra/streaming/messages/IncomingMutationLogStreamMessage.java delete mode 100644 src/java/org/apache/cassandra/streaming/messages/MutationLogReceivedMessage.java delete mode 100644 src/java/org/apache/cassandra/streaming/messages/MutationLogStreamMessage.java delete mode 100644 src/java/org/apache/cassandra/streaming/messages/OutgoingMutationLogStreamMessage.java create mode 100644 test/distributed/org/apache/cassandra/distributed/test/TrackedBootstrapTest.java create mode 100644 
test/distributed/org/apache/cassandra/distributed/test/TrackedDecommissionTest.java create mode 100644 test/distributed/org/apache/cassandra/distributed/test/TrackedReplacementTest.java delete mode 100644 test/distributed/org/apache/cassandra/distributed/test/hostreplacement/TrackedHostReplacementTest.java create mode 100644 test/unit/org/apache/cassandra/replication/ShardIntervalBTreeTest.java diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java index 39e45d5d5dcf..02ec0e6901f9 100644 --- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java @@ -861,8 +861,15 @@ public ResultMessage executeInternalWithoutCondition(QueryState queryState, Quer String keyspaceName = mutation.getKeyspaceName(); Token token = mutation.key().getToken(); MutationId id = MutationTrackingService.instance().nextMutationId(keyspaceName, token); - mutation = mutation.withMutationId(id); - mutation.apply(); + try + { + mutation = mutation.withMutationId(id); + mutation.apply(); + } + finally + { + MutationTrackingService.instance().completeLocalWrite(id); + } } for (IMutation mutation : routed.untrackedMutations) diff --git a/src/java/org/apache/cassandra/db/CounterMutation.java b/src/java/org/apache/cassandra/db/CounterMutation.java index ee67a4b7e8b0..41ebbf0b8d3a 100644 --- a/src/java/org/apache/cassandra/db/CounterMutation.java +++ b/src/java/org/apache/cassandra/db/CounterMutation.java @@ -200,6 +200,7 @@ public Mutation applyCounterMutation(MutationId mutationId) throws WriteTimeoutE } } + @Override public void apply() { applyCounterMutation(); diff --git a/src/java/org/apache/cassandra/db/SystemKeyspace.java b/src/java/org/apache/cassandra/db/SystemKeyspace.java index 65d8c4ac4e0d..c81ce4082a55 100644 --- a/src/java/org/apache/cassandra/db/SystemKeyspace.java +++ 
b/src/java/org/apache/cassandra/db/SystemKeyspace.java @@ -564,8 +564,11 @@ private SystemKeyspace() + "keyspace_name text," + "range_start text," + "range_end text," + + "since_epoch bigint," + "participants frozen>," - + "PRIMARY KEY ((keyspace_name, range_start, range_end)))") + + "sealed_participants frozen>," + + "state text," + + "PRIMARY KEY ((keyspace_name, range_start, range_end, since_epoch)))") .build(); private static final TableMetadata CoordinatorLogs = @@ -575,12 +578,13 @@ private SystemKeyspace() + "keyspace_name text," + "range_start text," + "range_end text," + + "since_epoch bigint," + "host_id int," + "host_log_id int," + "participants frozen>," + "witnessed_offsets map>>," + "persisted_offsets map>>," - + "PRIMARY KEY ((keyspace_name, range_start, range_end), host_id, host_log_id))") + + "PRIMARY KEY ((keyspace_name, range_start, range_end, since_epoch), host_id, host_log_id))") .build(); @Deprecated(since = "4.0") diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java index fee507549242..d3cee3d296b4 100644 --- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java +++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java @@ -21,10 +21,8 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; -import java.util.Map; import java.util.Set; -import com.google.common.base.Preconditions; import com.google.common.base.Predicate; import com.google.common.base.Predicates; import com.google.common.collect.ImmutableList; @@ -45,9 +43,6 @@ import org.apache.cassandra.io.sstable.metadata.StatsMetadata; import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.locator.Replica; -import org.apache.cassandra.replication.ImmutableCoordinatorLogOffsets; -import org.apache.cassandra.replication.Offsets; -import org.apache.cassandra.replication.ReconciledKeyspaceOffsets; 
import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.streaming.IncomingStream; import org.apache.cassandra.streaming.OutgoingStream; @@ -92,34 +87,8 @@ public StreamReceiver createStreamReceiver(StreamSession session, List getSSTablePredicateForKeyspaceRanges(ReconciledKeyspaceOffsets reconciledKeyspaceOffsets) - { - if (reconciledKeyspaceOffsets == null) - return sstable -> true; - - return sstable -> { - if (sstable.isRepaired()) - return false; - - ImmutableCoordinatorLogOffsets sstableOffsets = sstable.getSSTableMetadata().coordinatorLogOffsets; - - // if it's not repaired and there are no offsets, it was probably written before the table was using - // mutation tracking and therefore should be considered unreconciled - if (sstableOffsets.isEmpty()) - return true; - - for (Map.Entry entry : sstableOffsets.entries()) - { - if (!reconciledKeyspaceOffsets.isFullyReconciled(entry.getKey(), entry.getValue())) - return true; - } - - return false; - }; - } - @Override - public Collection createOutgoingStreams(StreamSession session, RangesAtEndpoint replicas, TimeUUID pendingRepair, PreviewKind previewKind, ReconciledKeyspaceOffsets reconciledKeyspaceOffsets) + public Collection createOutgoingStreams(StreamSession session, RangesAtEndpoint replicas, TimeUUID pendingRepair, PreviewKind previewKind) { Refs refs = new Refs<>(); try @@ -131,15 +100,7 @@ public Collection createOutgoingStreams(StreamSession session, R Set sstables = Sets.newHashSet(); SSTableIntervalTree intervalTree = buildSSTableIntervalTree(ImmutableList.copyOf(view.select(SSTableSet.CANONICAL))); Predicate predicate; - // reconciledKeyspaceOffsets are only included when mutation logs are streamed, since we include logs - // for all unreconciled mutations, and SSTables for all reconciled mutations - if (reconciledKeyspaceOffsets != null) - { - Preconditions.checkArgument(previewKind == PreviewKind.NONE); - Preconditions.checkArgument(pendingRepair == 
ActiveRepairService.NO_PENDING_REPAIR); - predicate = getSSTablePredicateForKeyspaceRanges(reconciledKeyspaceOffsets); - } - else if (previewKind.isPreview()) + if (previewKind.isPreview()) { predicate = previewKind.predicate(); } diff --git a/src/java/org/apache/cassandra/db/virtual/MutationTrackingTables.java b/src/java/org/apache/cassandra/db/virtual/MutationTrackingTables.java index 406693d23e81..3c61039b3c4f 100644 --- a/src/java/org/apache/cassandra/db/virtual/MutationTrackingTables.java +++ b/src/java/org/apache/cassandra/db/virtual/MutationTrackingTables.java @@ -15,7 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.cassandra.db.virtual; import java.util.Collection; @@ -29,6 +28,7 @@ import org.apache.cassandra.db.marshal.BooleanType; import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.marshal.SetType; import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.dht.LocalPartitioner; import org.apache.cassandra.journal.ActiveSegment; @@ -66,7 +66,7 @@ public static final class MutationJournalTable extends AbstractVirtualTable private static final String FSYNCED_TO = "fsynced_to"; private static final String NEEDS_REPLAY = "needs_replay"; private static final String FILE_PATH = "file_path"; - + MutationJournalTable(String keyspace) { super(TableMetadata.builder(keyspace, MUTATION_JOURNAL) @@ -83,12 +83,12 @@ public static final class MutationJournalTable extends AbstractVirtualTable .addRegularColumn(FILE_PATH, UTF8Type.instance) .build()); } - + @Override public DataSet data() { SimpleDataSet result = new SimpleDataSet(metadata()); - + for (Segment segment : MutationJournal.instance().getAllSegments()) { result.row(segment.id()) @@ -100,15 +100,19 @@ public DataSet data() .column(NEEDS_REPLAY, segment.metadata().needsReplay()) .column(FILE_PATH, segment.filePath()); } - + return 
result; } } + // TODO (expected): fix the types + // TODO (expected): fix perf WTFs + // TODO (expected): split out shards and coordinator logs tables public static class MutationTrackingShardsTable extends AbstractVirtualTable { private static final String KEYSPACE = "keyspace"; private static final String LOG_ID = "log_id"; + private static final String SINCE_EPOCH = "since_epoch"; private static final String RANGE_START = "range_start"; private static final String RANGE_END = "range_end"; private static final String LOCAL_NODE_ID = "local_node_id"; @@ -116,23 +120,28 @@ public static class MutationTrackingShardsTable extends AbstractVirtualTable private static final String WITNESSED_OFFSETS = "witnessed_offsets"; private static final String RECONCILED_OFFSETS = "reconciled_offsets"; private static final String PERSISTED_OFFSETS = "persisted_offsets"; - - MutationTrackingShardsTable(String keyspace) { + + private static final SetType FROZEN_INT_SET = SetType.getInstance(Int32Type.instance, false); + + MutationTrackingShardsTable(String keyspace) + { super(TableMetadata.builder(keyspace, MUTATION_TRACKING_SHARDS) .comment("mutation tracking shards and their offset information") .kind(TableMetadata.Kind.VIRTUAL).partitioner(new LocalPartitioner(UTF8Type.instance)) .addPartitionKeyColumn(KEYSPACE, UTF8Type.instance) .addClusteringColumn(LOG_ID, UTF8Type.instance) + .addClusteringColumn(SINCE_EPOCH, LongType.instance) .addClusteringColumn(RANGE_START, UTF8Type.instance) .addClusteringColumn(RANGE_END, UTF8Type.instance) .addRegularColumn(LOCAL_NODE_ID, Int32Type.instance) - .addRegularColumn(PARTICIPANTS, UTF8Type.instance) + .addRegularColumn(PARTICIPANTS, FROZEN_INT_SET) + // TODO (expected): change offsets columns to structured types .addRegularColumn(WITNESSED_OFFSETS, UTF8Type.instance) .addRegularColumn(RECONCILED_OFFSETS, UTF8Type.instance) .addRegularColumn(PERSISTED_OFFSETS, UTF8Type.instance) .build()); } - + private void addShardRows(Shard shard, 
SimpleDataSet result) { Shard.DebugInfo shardDebugInfo = shard.getDebugInfo(); @@ -142,44 +151,35 @@ private void addShardRows(Shard shard, SimpleDataSet result) CoordinatorLog.DebugInfo logDebugInfo = entry.getValue(); result.row(shardDebugInfo.keyspace, logId.toString(), + shardDebugInfo.sinceEpoch, shardDebugInfo.range.left.toString(), shardDebugInfo.range.right.toString()) .column(LOCAL_NODE_ID, shardDebugInfo.localNodeId) - .column(PARTICIPANTS, shardDebugInfo.participants.toString()) + .column(PARTICIPANTS, shardDebugInfo.participants.asSet()) .column(WITNESSED_OFFSETS, logDebugInfo.witnessedOffsets) .column(RECONCILED_OFFSETS, logDebugInfo.reconciledOffsets) .column(PERSISTED_OFFSETS, logDebugInfo.persistedOffsets); } } - + @Override public DataSet data() { SimpleDataSet result = new SimpleDataSet(metadata()); - for (Shard shard : MutationTrackingService.instance().getShards()) - { addShardRows(shard, result); - } - return result; } - + @Override public DataSet data(DecoratedKey key) { String keyspaceName = UTF8Type.instance.compose(key.getKey()); SimpleDataSet result = new SimpleDataSet(metadata()); - + for (Shard shard : MutationTrackingService.instance().getShards()) - { - Shard.DebugInfo debugInfo = shard.getDebugInfo(); - if (!debugInfo.keyspace.equals(keyspaceName)) - continue; - - addShardRows(shard, result); - } - + if (shard.keyspace.equals(keyspaceName)) + addShardRows(shard, result); return result; } } diff --git a/src/java/org/apache/cassandra/dht/Range.java b/src/java/org/apache/cassandra/dht/Range.java index a95249d426b3..286edcbef61a 100644 --- a/src/java/org/apache/cassandra/dht/Range.java +++ b/src/java/org/apache/cassandra/dht/Range.java @@ -876,7 +876,7 @@ public static int compareRightToken(Token a, Token b) if (a.isMinimum()) return 1; if (b.isMinimum()) - return 0; + return -1; return a.compareTo(b); } } diff --git a/src/java/org/apache/cassandra/net/Verb.java b/src/java/org/apache/cassandra/net/Verb.java index 
6667d582682a..472cd4f15e55 100644 --- a/src/java/org/apache/cassandra/net/Verb.java +++ b/src/java/org/apache/cassandra/net/Verb.java @@ -83,7 +83,16 @@ import org.apache.cassandra.replication.ForwardedWrite; import org.apache.cassandra.replication.PullMutationsRequest; import org.apache.cassandra.replication.PushMutationRequest; -import org.apache.cassandra.replication.TransferFailed; +import org.apache.cassandra.replication.SealingCoordinator.CompleteSealing; +import org.apache.cassandra.replication.SealingCoordinator.Drain; +import org.apache.cassandra.replication.SealingCoordinator.FetchShards; +import org.apache.cassandra.replication.SealingCoordinator.InitSealing; +import org.apache.cassandra.replication.SealingCoordinator.ReconcileCapture; +import org.apache.cassandra.replication.SealingCoordinator.ReconcilePoll; +import org.apache.cassandra.replication.ShardMetadataRequest; +import org.apache.cassandra.replication.ShardMetadataResponse; +import org.apache.cassandra.replication.TransferFailedRequest; +import org.apache.cassandra.replication.TransferFailedResponse; import org.apache.cassandra.replication.TransferTrackingService; import org.apache.cassandra.schema.SchemaMutationsSerializer; import org.apache.cassandra.schema.SchemaPullVerbHandler; @@ -361,7 +370,7 @@ public enum Verb MT_PUSH_MUTATION_REQ (902, P3, writeTimeout, MUTATION, () -> mtEmbedded(PushMutationRequest.serializer), () -> PushMutationRequest.verbHandler, MUTATION_RSP ), MT_READ_RECONCILE_ACK (903, P2, readTimeout, REQUEST_RESPONSE, () -> mtEmbedded(ReadReconcileAck.serializer), () -> ReadReconcileAck.verbHandler ), MT_FORWARD_WRITE_REQ (904, P3, writeTimeout, MUTATION, () -> mtEmbedded(ForwardedWrite.serializer), () -> ForwardedWrite.verbHandler ), - MT_BROADCAST_LOG_OFFSETS (905, P1, rpcTimeout, MISC, () -> mtEmbedded(BroadcastLogOffsets.serializer), () -> BroadcastLogOffsets.verbHandler ), + MT_BROADCAST_LOG_OFFSETS (905, P1, rpcTimeout, ANTI_ENTROPY, () -> 
mtEmbedded(BroadcastLogOffsets.serializer), () -> BroadcastLogOffsets.verbHandler ), MT_PARTITION_READ_RSP (906, P2, readTimeout, REQUEST_RESPONSE, () -> TrackedDataResponse.embedded, RESPONSE_HANDLER ), MT_PARTITION_READ_REQ (907, P3, readTimeout, READ, () -> TrackedRead.DataRequest.embedded, () -> TrackedRead.verbHandler, MT_PARTITION_READ_RSP ), @@ -372,11 +381,26 @@ public enum Verb MT_TRANSFER_ACTIVATE_RSP (912, P1, repairTimeout, REQUEST_RESPONSE, () -> mtEmbedded(ActivationResponse.serializer), RESPONSE_HANDLER ), MT_TRANSFER_ACTIVATE_REQ (913, P1, repairTimeout, ANTI_ENTROPY, () -> mtEmbedded(ActivationRequest.serializer), () -> ActivationRequest.verbHandler, MT_TRANSFER_ACTIVATE_RSP), - MT_TRANSFER_FAILED_RSP (914, P1, repairTimeout, REQUEST_RESPONSE, () -> mtEmbedded(NoPayload.unversionedSerializer), RESPONSE_HANDLER ), - MT_TRANSFER_FAILED_REQ (915, P1, repairTimeout, ANTI_ENTROPY, () -> mtEmbedded(TransferFailed.serializer), () -> TransferTrackingService.verbHandler, MT_TRANSFER_FAILED_RSP ), + MT_TRANSFER_FAILED_RSP (914, P1, repairTimeout, REQUEST_RESPONSE, () -> mtEmbedded(TransferFailedResponse.serializer), RESPONSE_HANDLER ), + MT_TRANSFER_FAILED_REQ (915, P1, repairTimeout, ANTI_ENTROPY, () -> mtEmbedded(TransferFailedRequest.serializer), () -> TransferTrackingService.verbHandler, MT_TRANSFER_FAILED_RSP ), MT_SYNC_RSP (916, P1, repairWithBackoffTimeout, REQUEST_RESPONSE, () -> mtEmbedded(MutationTrackingSyncResponse.serializer), RESPONSE_HANDLER ), MT_SYNC_REQ (917, P1, repairWithBackoffTimeout, ANTI_ENTROPY, () -> mtEmbedded(MutationTrackingSyncRequest.serializer), () -> RepairMessageVerbHandler.instance(), MT_SYNC_RSP ), + MT_SHARD_METADATA_RSP (918, P2, writeTimeout, REQUEST_RESPONSE, () -> mtEmbedded(ShardMetadataResponse.serializer), RESPONSE_HANDLER ), + MT_SHARD_METADATA_REQ (919, P2, writeTimeout, MISC, () -> mtEmbedded(ShardMetadataRequest.serializer), () -> ShardMetadataRequest.verbHandler, MT_SHARD_METADATA_RSP ), + MT_FETCH_SHARDS_RSP 
(920, P2, rpcTimeout, REQUEST_RESPONSE, () -> mtEmbedded(FetchShards.responseSerializer), RESPONSE_HANDLER ), + MT_FETCH_SHARDS_REQ (921, P2, rpcTimeout, ANTI_ENTROPY, () -> mtEmbedded(FetchShards.requestSerializer), () -> FetchShards.verbHandler, MT_FETCH_SHARDS_RSP ), + MT_INIT_SEALING_RSP (922, P2, rpcTimeout, REQUEST_RESPONSE, () -> mtEmbedded(InitSealing.responseSerializer), RESPONSE_HANDLER ), + MT_INIT_SEALING_REQ (923, P2, rpcTimeout, ANTI_ENTROPY, () -> mtEmbedded(InitSealing.requestSerializer), () -> InitSealing.verbHandler, MT_INIT_SEALING_RSP ), + MT_DRAIN_RSP (924, P2, rpcTimeout, REQUEST_RESPONSE, () -> mtEmbedded(Drain.responseSerializer), RESPONSE_HANDLER ), + MT_DRAIN_REQ (925, P2, rpcTimeout, ANTI_ENTROPY, () -> mtEmbedded(Drain.requestSerializer), () -> Drain.verbHandler, MT_DRAIN_RSP ), + MT_RECONCILE_CAPTURE_RSP (926, P2, rpcTimeout, REQUEST_RESPONSE, () -> mtEmbedded(ReconcileCapture.responseSerializer), RESPONSE_HANDLER ), + MT_RECONCILE_CAPTURE_REQ (927, P2, rpcTimeout, ANTI_ENTROPY, () -> mtEmbedded(ReconcileCapture.requestSerializer), () -> ReconcileCapture.verbHandler, MT_RECONCILE_CAPTURE_RSP), + MT_RECONCILE_POLL_RSP (928, P2, rpcTimeout, REQUEST_RESPONSE, () -> mtEmbedded(ReconcilePoll.responseSerializer), RESPONSE_HANDLER ), + MT_RECONCILE_POLL_REQ (929, P2, rpcTimeout, ANTI_ENTROPY, () -> mtEmbedded(ReconcilePoll.requestSerializer), () -> ReconcilePoll.verbHandler, MT_RECONCILE_POLL_RSP ), + MT_COMPLETE_SEALING_RSP (930, P2, rpcTimeout, REQUEST_RESPONSE, () -> mtEmbedded(CompleteSealing.responseSerializer), RESPONSE_HANDLER ), + MT_COMPLETE_SEALING_REQ (931, P2, rpcTimeout, ANTI_ENTROPY, () -> mtEmbedded(CompleteSealing.requestSerializer), () -> CompleteSealing.verbHandler, MT_COMPLETE_SEALING_RSP ), + // accord ACCORD_SIMPLE_RSP (119, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(EnumSerializer.simpleReply), AccordService::responseHandlerOrNoop ), ACCORD_PRE_ACCEPT_RSP (120, P2, writeTimeout, IMMEDIATE, () -> 
accordEmbedded(PreacceptSerializers.reply), AccordService::responseHandlerOrNoop ), diff --git a/src/java/org/apache/cassandra/repair/SyncTasks.java b/src/java/org/apache/cassandra/repair/SyncTasks.java index 83aa7e25a5ba..e69a10d8cc5d 100644 --- a/src/java/org/apache/cassandra/repair/SyncTasks.java +++ b/src/java/org/apache/cassandra/repair/SyncTasks.java @@ -43,13 +43,15 @@ public static class ShardedSyncTask public final String keyspace; public final Participants participants; public final SyncTask task; + public final long sinceEpoch; public final Range range; - private ShardedSyncTask(String keyspace, Participants participants, SyncTask task, Range range) + private ShardedSyncTask(String keyspace, Participants participants, SyncTask task, long sinceEpoch, Range range) { this.keyspace = keyspace; this.participants = participants; this.task = task; + this.sinceEpoch = sinceEpoch; this.range = range; } } @@ -57,7 +59,7 @@ private ShardedSyncTask(String keyspace, Participants participants, SyncTask tas static SyncTasks untracked(Collection tasks) { SyncTasks syncTasks = new SyncTasks(); - tasks.forEach(t -> syncTasks.shardedTasks.add(new ShardedSyncTask(null, null, t, null))); + tasks.forEach(t -> syncTasks.shardedTasks.add(new ShardedSyncTask(null, null, t, 0L, null))); return syncTasks; } @@ -76,7 +78,7 @@ public void add(Shard shard, SyncTask task) // Narrow the ultimate scope of activation to the ranges in the sync tasks rather than the entire shard. 
Set> ranges = new HashSet<>(task.rangesToSync); Range span = span(ranges); - shardedTasks.add(new ShardedSyncTask(shard.keyspace, shard.participants, task, span)); + shardedTasks.add(new ShardedSyncTask(shard.keyspace, shard.participants, task, shard.sinceEpoch, span)); } public static Range span(Set> ranges) diff --git a/src/java/org/apache/cassandra/replication/ActivationRequest.java b/src/java/org/apache/cassandra/replication/ActivationRequest.java index e6faac3fdcd2..6eef773207c9 100644 --- a/src/java/org/apache/cassandra/replication/ActivationRequest.java +++ b/src/java/org/apache/cassandra/replication/ActivationRequest.java @@ -61,6 +61,7 @@ public class ActivationRequest public final ShortMutationId transferId; public final NodeId coordinatorId; public final String keyspace; + public final long sinceEpoch; public final Range range; @Nullable @@ -99,6 +100,7 @@ public ActivationRequest(StreamOperation operation, ShortMutationId transferId, NodeId coordinatorId, Range range, + long sinceEpoch, String keyspace, TimeUUID planId) { @@ -113,6 +115,7 @@ public ActivationRequest(StreamOperation operation, this.coordinatorId = coordinatorId; this.phase = phase; this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; this.range = range; this.planId = planId; } @@ -151,6 +154,7 @@ public void serialize(ActivationRequest request, DataOutputPlus out, Version ver NodeId.messagingSerializer.serialize(request.coordinatorId, out, version.messagingVersion()); out.writeByte(request.phase.id); out.writeUTF(request.keyspace); + out.writeLong(request.sinceEpoch); Range.serializer.serialize(request.range, out, null); TimeUUID.Serializer.nullable.serialize(request.planId, out); } @@ -167,10 +171,11 @@ public ActivationRequest deserialize(DataInputPlus in, Version version) throws I NodeId coordinatorId = NodeId.messagingSerializer.deserialize(in, version.messagingVersion()); Phase phase = Phase.from(in.readByte()); String keyspace = in.readUTF(); + long sinceEpoch = in.readLong(); 
Range range = Range.serializer.deserialize(in, null); TimeUUID planId = TimeUUID.Serializer.nullable.deserialize(in); - return new ActivationRequest(operation, Pair.create(sender, receiver), phase, id, coordinatorId, range, keyspace, planId); + return new ActivationRequest(operation, Pair.create(sender, receiver), phase, id, coordinatorId, range, sinceEpoch, keyspace, planId); } @Override @@ -187,6 +192,7 @@ public long serializedSize(ActivationRequest request, Version version) size += NodeId.messagingSerializer.serializedSize(request.coordinatorId, version.messagingVersion()); size += TypeSizes.BYTE_SIZE; // Enum ordinal size += TypeSizes.sizeof(request.keyspace); + size += TypeSizes.sizeof(request.sinceEpoch); size += Range.serializer.serializedSize(request.range, null); size += TimeUUID.Serializer.nullable.serializedSize(request.planId); diff --git a/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java b/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java index f02888d5536c..8c0447d647ab 100644 --- a/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java +++ b/src/java/org/apache/cassandra/replication/BroadcastLogOffsets.java @@ -21,8 +21,6 @@ import java.util.List; import org.apache.cassandra.db.TypeSizes; -import org.apache.cassandra.dht.AbstractBounds; -import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.io.util.DataInputPlus; @@ -32,15 +30,20 @@ public class BroadcastLogOffsets { - private final String keyspace; - private final Range range; + private final ShardMetadata shardMetadata; private final List replicatedOffsets; private final boolean durable; - public BroadcastLogOffsets(String keyspace, Range range, List offsets, boolean durable) + public BroadcastLogOffsets( + String keyspace, long sinceEpoch, Range range, Participants participants, + List offsets, boolean durable) { - this.keyspace = keyspace; - this.range 
= range; + this(new ShardMetadata(keyspace, sinceEpoch, range, participants), offsets, durable); + } + + public BroadcastLogOffsets(ShardMetadata shardMetadata, List offsets, boolean durable) + { + this.shardMetadata = shardMetadata; this.replicatedOffsets = offsets; this.durable = durable; } @@ -62,17 +65,19 @@ public String toString() isFirst = false; } sb.append(']'); - return "ShardReplicatedOffsets{" + keyspace + ", " + range + ", " + sb + ", " + durable + '}'; + return "ShardReplicatedOffsets{" + shardMetadata + ", " + sb + ", " + durable + '}'; } public static final IVerbHandler verbHandler = message -> { MutationTrackingService.ensureEnabled(); BroadcastLogOffsets replicatedOffsets = message.payload; - MutationTrackingService.instance().updateReplicatedOffsets(replicatedOffsets.keyspace, - replicatedOffsets.range, - replicatedOffsets.replicatedOffsets, - replicatedOffsets.durable, - message.from()); + MutationTrackingService.instance().updateReplicatedOffsets(replicatedOffsets.shardMetadata.keyspace, + replicatedOffsets.shardMetadata.sinceEpoch, + replicatedOffsets.shardMetadata.range, + replicatedOffsets.shardMetadata.participants, + replicatedOffsets.replicatedOffsets, + replicatedOffsets.durable, + message.from()); }; public static final VersionedSerializer serializer = new VersionedSerializer<>() @@ -80,8 +85,7 @@ public String toString() @Override public void serialize(BroadcastLogOffsets status, DataOutputPlus out, Version version) throws IOException { - out.writeUTF(status.keyspace); - AbstractBounds.tokenSerializer.serialize(status.range, out, version.messagingVersion()); + ShardMetadata.serializer.serialize(status.shardMetadata, out, version); CollectionSerializers.serializeList(status.replicatedOffsets, out, Offsets.serializer); out.writeBoolean(status.durable); } @@ -89,19 +93,17 @@ public void serialize(BroadcastLogOffsets status, DataOutputPlus out, Version ve @Override public BroadcastLogOffsets deserialize(DataInputPlus in, Version version) 
throws IOException { - String keyspace = in.readUTF(); - Range range = (Range) AbstractBounds.tokenSerializer.deserialize(in, IPartitioner.global(), version.messagingVersion()); + ShardMetadata shardMetadata = ShardMetadata.serializer.deserialize(in, version); List replicatedOffsets = CollectionSerializers.deserializeList(in, Offsets.serializer); boolean durable = in.readBoolean(); - return new BroadcastLogOffsets(keyspace, range, replicatedOffsets, durable); + return new BroadcastLogOffsets(shardMetadata, replicatedOffsets, durable); } @Override public long serializedSize(BroadcastLogOffsets replicatedOffsets, Version version) { long size = 0; - size += TypeSizes.sizeof(replicatedOffsets.keyspace); - size += AbstractBounds.tokenSerializer.serializedSize(replicatedOffsets.range, version.messagingVersion()); + size += ShardMetadata.serializer.serializedSize(replicatedOffsets.shardMetadata, version); size += CollectionSerializers.serializedListSize(replicatedOffsets.replicatedOffsets, Offsets.serializer); size += TypeSizes.sizeof(replicatedOffsets.durable); return size; diff --git a/src/java/org/apache/cassandra/replication/CoordinatedTransfer.java b/src/java/org/apache/cassandra/replication/CoordinatedTransfer.java index 84630eb75619..92b1e7eb650d 100644 --- a/src/java/org/apache/cassandra/replication/CoordinatedTransfer.java +++ b/src/java/org/apache/cassandra/replication/CoordinatedTransfer.java @@ -94,6 +94,7 @@ public abstract class CoordinatedTransfer private static final Logger logger = LoggerFactory.getLogger(CoordinatedTransfer.class); protected final String keyspace; + protected final long sinceEpoch; protected final Range range; String logPrefix() @@ -104,19 +105,21 @@ String logPrefix() private final ShortMutationId id; final ConcurrentMap, SingleTransferResult> streamResults; - public CoordinatedTransfer(ShortMutationId id, String keyspace, Range range) + public CoordinatedTransfer(ShortMutationId id, String keyspace, long sinceEpoch, Range range) { 
this.id = id; this.streamResults = new ConcurrentHashMap<>(); this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; this.range = range; } - public CoordinatedTransfer(ShortMutationId id, Participants participants, String keyspace, Range range) + public CoordinatedTransfer(ShortMutationId id, Participants participants, String keyspace, long sinceEpoch, Range range) { this.id = id; this.streamResults = new ConcurrentHashMap<>(participants.size()); this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; this.range = range; } @@ -285,7 +288,7 @@ public void onFailure(InetAddressAndPort from, RequestFailure failure) logger.debug("{} Notifying {} of transfer failure for plan {}", logPrefix(), to, result.planId()); notifyFailure.responses.incrementAndGet(); - Message msg = Message.out(Verb.MT_TRANSFER_FAILED_REQ, new TransferFailed(result.planId())); + Message msg = Message.out(Verb.MT_TRANSFER_FAILED_REQ, new TransferFailedRequest(result.planId())); MessagingService.instance().sendWithCallback(msg, to, notifyFailure); } diff --git a/src/java/org/apache/cassandra/replication/CoordinatorLog.java b/src/java/org/apache/cassandra/replication/CoordinatorLog.java index 317a3e6eea59..aeb0d57089f5 100644 --- a/src/java/org/apache/cassandra/replication/CoordinatorLog.java +++ b/src/java/org/apache/cassandra/replication/CoordinatorLog.java @@ -63,6 +63,7 @@ public abstract class CoordinatorLog private static final Logger logger = LoggerFactory.getLogger(CoordinatorLog.class); protected final int localNodeId; + protected final long sinceEpoch; protected final String keyspace; protected final Range range; protected final CoordinatorLogId logId; @@ -80,6 +81,7 @@ public abstract class CoordinatorLog abstract void receivedWriteResponse(ShortMutationId mutationId, int fromNodeId); CoordinatorLog(String keyspace, + long sinceEpoch, Range range, int localNodeId, CoordinatorLogId logId, @@ -89,6 +91,7 @@ public abstract class CoordinatorLog UnreconciledMutations 
unreconciledMutations) { this.localNodeId = localNodeId; + this.sinceEpoch = sinceEpoch; this.keyspace = keyspace; this.range = range; this.logId = logId; @@ -100,90 +103,23 @@ public abstract class CoordinatorLog this.reconciledPersistedOffsets = persistedOffsets.intersection(); } - CoordinatorLog(String keyspace, Range range, int localNodeId, CoordinatorLogId logId, Participants participants) + CoordinatorLog(String keyspace, long sinceEpoch, Range range, int localNodeId, CoordinatorLogId logId, Participants participants) { - this(keyspace, range, localNodeId, logId, participants, forParticipants(logId, participants), forParticipants(logId, participants), new UnreconciledMutations()); + this(keyspace, sinceEpoch, range, localNodeId, logId, participants, forParticipants(logId, participants), forParticipants(logId, participants), new UnreconciledMutations()); } - static CoordinatorLog create(String keyspace, Range range, int localNodeId, CoordinatorLogId id, Participants participants) + static CoordinatorLog create(String keyspace, long sinceEpoch, Range range, int localNodeId, CoordinatorLogId id, Participants participants) { - return id.hostId == localNodeId ? new CoordinatorLogPrimary(keyspace, range, localNodeId, id, participants) - : new CoordinatorLogReplica(keyspace, range, localNodeId, id, participants); + return id.hostId == localNodeId ? new CoordinatorLogPrimary(keyspace, sinceEpoch, range, localNodeId, id, participants) + : new CoordinatorLogReplica(keyspace, sinceEpoch, range, localNodeId, id, participants); } static CoordinatorLog recreate( - String keyspace, Range range, int localNodeId, CoordinatorLogId id, Participants participants, + String keyspace, long sinceEpoch, Range range, int localNodeId, CoordinatorLogId id, Participants participants, Node2OffsetsMap witnessedOffsets, Node2OffsetsMap persistedOffsets, UnreconciledMutations unreconciledMutations) { - return id.hostId == localNodeId ? 
new CoordinatorLogPrimary(keyspace, range, localNodeId, id, participants, witnessedOffsets, persistedOffsets, unreconciledMutations) - : new CoordinatorLogReplica(keyspace, range, localNodeId, id, participants, witnessedOffsets, persistedOffsets, unreconciledMutations); - } - - abstract CoordinatorLog withUpdatedParticipants(Participants newParticipants, Node2OffsetsMap witnessedOffsets, Node2OffsetsMap persistedOffsets, UnreconciledMutations unreconciledMutations); - - CoordinatorLog withParticipants(Participants newParticipants) - { - if (participants.equals(newParticipants)) - return this; - - lock.readLock().lock(); - try - { - Node2OffsetsMap newWitnessedOffsets = new Node2OffsetsMap(); - Node2OffsetsMap newPersistedOffsets = new Node2OffsetsMap(); - Offsets passivelyReconciled = null; - for (int newIndex = 0; newIndex < newParticipants.size(); newIndex++) - { - int participantId = newParticipants.get(newIndex); - - Offsets.Mutable offsets; - if (participants.contains(participantId)) - { - offsets = witnessedOffsets.get(participantId); - } - else - { - offsets = new Offsets.Mutable(logId); - - // the new node doesn't actually have these reconciled offsets yet, but they will receive them - // as part of the topology change. We preemptively mark them as reconciled here to prevent so - // we don't stream journal entries that the new node will receive in sstables and to prevent - // retroactively un-reconciling previously reconciled offsets for the other replicas. - offsets.addAll(reconciledOffsets); - } - Offsets.Mutable persisted = participants.contains(participantId) - ? persistedOffsets.get(participantId) - : new Offsets.Mutable(logId); - passivelyReconciled = passivelyReconciled != null - ? 
Offsets.Immutable.intersection(passivelyReconciled, offsets) - : offsets; - newWitnessedOffsets.add(participantId, offsets); - newPersistedOffsets.add(participantId, persisted); - } - - UnreconciledMutations newUnreconciledMutations; - passivelyReconciled = Offsets.Immutable.difference(passivelyReconciled, reconciledOffsets); - if (!passivelyReconciled.isEmpty()) - { - logger.debug("Toplogy change implicitly reconciled offsets: {}", passivelyReconciled); - newUnreconciledMutations = unreconciledMutations.copy(); - passivelyReconciled.forEach(id -> newUnreconciledMutations.remove(id.offset)); - } - else - { - newUnreconciledMutations = unreconciledMutations; - } - - if (logger.isTraceEnabled()) - logger.trace("Updating coordinator log {} participants: {} -> {}. Passively reconciled: {}", - logId, participants, newParticipants, passivelyReconciled); - - return withUpdatedParticipants(newParticipants, newWitnessedOffsets, newPersistedOffsets, newUnreconciledMutations); - } - finally - { - lock.readLock().unlock(); - } + return id.hostId == localNodeId ? 
new CoordinatorLogPrimary(keyspace, sinceEpoch, range, localNodeId, id, participants, witnessedOffsets, persistedOffsets, unreconciledMutations) + : new CoordinatorLogReplica(keyspace, sinceEpoch, range, localNodeId, id, participants, witnessedOffsets, persistedOffsets, unreconciledMutations); } void updateReplicatedOffsets(Offsets offsets, boolean persisted, int onNodeId) @@ -240,20 +176,6 @@ private void updatePersistedReplicatedOffsets(Offsets offsets, int onNodeId) logger.debug("done applying PRO, now {}", reconciledPersistedOffsets); } - public void recordFullyReconciledOffsets(Offsets.Immutable reconciled) - { - lock.writeLock().lock(); - try { - for (int i = 0; i < participants.size(); ++i) - { - int participant = participants.get(i); - updateWitnessedReplicatedOffsets(reconciled, participant); - } - } finally { - lock.writeLock().unlock(); - } - } - @Nullable Offsets.Immutable collectReplicatedOffsets(boolean persisted) { @@ -442,7 +364,7 @@ void finishWriting(Mutation mutation) } /* - - On local replicas after they've completed activation (onHostId == me) + * On local replicas after they've completed activation (onHostId == me) */ void finishActivation(Bounds bounds, ActivationRequest activation) { @@ -704,23 +626,15 @@ static class CoordinatorLogPrimary extends CoordinatorLog private final AtomicLong sequenceId = new AtomicLong(-1); CoordinatorLogPrimary( - String keyspace, Range range, int localNodeId, CoordinatorLogId logId, Participants participants, + String keyspace, long sinceEpoch, Range range, int localNodeId, CoordinatorLogId logId, Participants participants, Node2OffsetsMap witnessedOffsets, Node2OffsetsMap persistedOffsets, UnreconciledMutations unreconciledMutations) { - super(keyspace, range, localNodeId, logId, participants, witnessedOffsets, persistedOffsets, unreconciledMutations); - } - - CoordinatorLogPrimary(String keyspace, Range range, int localNodeId, CoordinatorLogId logId, Participants participants) - { - super(keyspace, range, 
localNodeId, logId, participants); + super(keyspace, sinceEpoch, range, localNodeId, logId, participants, witnessedOffsets, persistedOffsets, unreconciledMutations); } - @Override - CoordinatorLog withUpdatedParticipants(Participants newParticipants, Node2OffsetsMap witnessedOffsets, Node2OffsetsMap persistedOffsets, UnreconciledMutations unreconciledMutations) + CoordinatorLogPrimary(String keyspace, long sinceEpoch, Range range, int localNodeId, CoordinatorLogId logId, Participants participants) { - CoordinatorLogPrimary next = new CoordinatorLogPrimary(keyspace, range, localNodeId, logId, newParticipants, witnessedOffsets, persistedOffsets, unreconciledMutations); - next.sequenceId.set(sequenceId.get()); - return next; + super(keyspace, sinceEpoch, range, localNodeId, logId, participants); } @Override @@ -786,21 +700,15 @@ private long nextSequenceId() static class CoordinatorLogReplica extends CoordinatorLog { CoordinatorLogReplica( - String keyspace, Range range, int localNodeId, CoordinatorLogId logId, Participants participants, + String keyspace, long sinceEpoch, Range range, int localNodeId, CoordinatorLogId logId, Participants participants, Node2OffsetsMap witnessedOffsets, Node2OffsetsMap persistedOffsets, UnreconciledMutations unreconciledMutations) { - super(keyspace, range, localNodeId, logId, participants, witnessedOffsets, persistedOffsets, unreconciledMutations); + super(keyspace, sinceEpoch, range, localNodeId, logId, participants, witnessedOffsets, persistedOffsets, unreconciledMutations); } - CoordinatorLogReplica(String keyspace, Range range, int localNodeId, CoordinatorLogId logId, Participants participants) - { - super(keyspace, range, localNodeId, logId, participants); - } - - @Override - CoordinatorLog withUpdatedParticipants(Participants newParticipants, Node2OffsetsMap witnessedOffsets, Node2OffsetsMap persistedOffsets, UnreconciledMutations unreconciledMutations) + CoordinatorLogReplica(String keyspace, long sinceEpoch, Range range, int 
localNodeId, CoordinatorLogId logId, Participants participants) { - return new CoordinatorLogReplica(keyspace, range, localNodeId, logId, newParticipants, witnessedOffsets, persistedOffsets, unreconciledMutations); + super(keyspace, sinceEpoch, range, localNodeId, logId, participants); } @Override @@ -815,10 +723,18 @@ void receivedWriteResponse(ShortMutationId mutationId, int fromNodeId) */ private static final String INSERT_QUERY = - format("INSERT INTO %s.%s (keyspace_name, range_start, range_end, host_id, host_log_id, participants, witnessed_offsets, persisted_offsets) " - + "VALUES (?, ?, ?, ?, ?, ?, ?, ?)", + format("INSERT INTO %s.%s (keyspace_name, since_epoch, range_start, range_end, host_id, host_log_id, participants, witnessed_offsets, persisted_offsets) " + + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", SchemaConstants.SYSTEM_KEYSPACE_NAME, SystemKeyspace.COORDINATOR_LOGS); + private static void persistToSystemTable( + String keyspace, long sinceEpoch, Range range, CoordinatorLogId logId, Participants participants, + Map> witnessedOffsets, Map> persistedOffsets) + { + executeInternal(INSERT_QUERY, keyspace, sinceEpoch, range.left.toString(), range.right.toString(), logId.hostId, + logId.hostLogId, participants.asSet(), witnessedOffsets, persistedOffsets); + } + void persistToSystemTable() { Map> witnessed = new Int2ObjectHashMap<>(); @@ -834,8 +750,7 @@ void persistToSystemTable() { lock.readLock().unlock(); } - executeInternal(INSERT_QUERY, keyspace, range.left.toString(), range.right.toString(), logId.hostId, - logId.hostLogId, participants.asSet(), witnessed, persisted); + persistToSystemTable(keyspace, sinceEpoch, range, logId, participants, witnessed, persisted); } void updateLogsInSystemTable() @@ -859,8 +774,7 @@ void updateLogsInSystemTable() lock.readLock().unlock(); } - executeInternal(INSERT_QUERY, keyspace, range.left.toString(), range.right.toString(), logId.hostId, - logId.hostLogId, participants.asSet(), witnessed, persisted); + 
persistToSystemTable(keyspace, sinceEpoch, range, logId, participants, witnessed, persisted); lock.writeLock().lock(); try @@ -875,13 +789,13 @@ void updateLogsInSystemTable() } private static final String SELECT_QUERY = - format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND range_start = ? AND range_end = ?", + format("SELECT * FROM %s.%s WHERE keyspace_name = ? AND range_start = ? AND range_end = ? AND since_epoch = ?", SchemaConstants.SYSTEM_KEYSPACE_NAME, SystemKeyspace.COORDINATOR_LOGS); - static List loadFromSystemTable(String keyspace, Range range, int localNodeId) + static List loadFromSystemTable(String keyspace, long sinceEpoch, Range range, int localNodeId) { ArrayList logs = new ArrayList<>(); - for (UntypedResultSet.Row row : executeInternal(SELECT_QUERY, keyspace, range.left.toString(), range.right.toString())) + for (UntypedResultSet.Row row : executeInternal(SELECT_QUERY, keyspace, range.left.toString(), range.right.toString(), sinceEpoch)) { int nodeId = row.getInt("host_id"); int hostLogId = row.getInt("host_log_id"); @@ -895,19 +809,23 @@ static List loadFromSystemTable(String keyspace, Range ra Node2OffsetsMap persisted = fromPrimitiveMap(logId, persistedOffsets); UnreconciledMutations unreconciled = UnreconciledMutations.loadFromJournal(witnessed, localNodeId); CoordinatorLog log = - CoordinatorLog.recreate(keyspace, range, localNodeId, logId, new Participants(participants), witnessed, persisted, unreconciled); + CoordinatorLog.recreate(keyspace, sinceEpoch, range, localNodeId, logId, new Participants(participants), witnessed, persisted, unreconciled); logs.add(log); } return logs; } private static final String DELETE_QUERY = - format("DELETE FROM %s.%s WHERE keyspace_name = ? AND range_start = ? AND range_end = ? AND host_id = ? AND host_log_id = ?", + format("DELETE FROM %s.%s WHERE keyspace_name = ? AND range_start = ? AND range_end = ? AND since_epoch = ? AND host_id = ? 
AND host_log_id = ?", SchemaConstants.SYSTEM_KEYSPACE_NAME, SystemKeyspace.COORDINATOR_LOGS); + /** + * Currently exists for tests only. Not sure if it'll ever be used outside of tests. + */ + @VisibleForTesting void deleteFromSystemTable() { - executeInternal(DELETE_QUERY, keyspace, range.left.toString(), range.right.toString(), logId.hostId, logId.hostLogId); + executeInternal(DELETE_QUERY, keyspace, range.left.toString(), range.right.toString(), sinceEpoch, logId.hostId, logId.hostLogId); } @VisibleForTesting diff --git a/src/java/org/apache/cassandra/replication/CoordinatorLogId.java b/src/java/org/apache/cassandra/replication/CoordinatorLogId.java index 1049ac77a0d2..dc07594a2ff0 100644 --- a/src/java/org/apache/cassandra/replication/CoordinatorLogId.java +++ b/src/java/org/apache/cassandra/replication/CoordinatorLogId.java @@ -125,7 +125,7 @@ public int hashCode() public static final Comparator comparator = (l, r) -> Long.compareUnsigned(l.asLong(), r.asLong()); - public static final class Serializer implements UnversionedSerializer + public static final UnversionedSerializer serializer = new UnversionedSerializer<>() { @Override public void serialize(CoordinatorLogId logId, DataOutputPlus out) throws IOException @@ -134,12 +134,6 @@ public void serialize(CoordinatorLogId logId, DataOutputPlus out) throws IOExcep out.writeInt(logId.hostLogId); } - public void serialize(long logId, DataOutputPlus out) throws IOException - { - out.writeInt(hostId(logId)); - out.writeInt(hostLogId(logId)); - } - @Override public CoordinatorLogId deserialize(DataInputPlus in) throws IOException { @@ -153,12 +147,5 @@ public long serializedSize(CoordinatorLogId logId) { return TypeSizes.sizeof(logId.hostId) + TypeSizes.sizeof(logId.hostLogId); } - - public long serializedSize(long logId) - { - return TypeSizes.sizeof(logId); - } - } - - public static final Serializer serializer = new Serializer(); + }; } diff --git a/src/java/org/apache/cassandra/replication/ForwardedWrite.java 
b/src/java/org/apache/cassandra/replication/ForwardedWrite.java index 74c17efe3699..7d2280b3bda8 100644 --- a/src/java/org/apache/cassandra/replication/ForwardedWrite.java +++ b/src/java/org/apache/cassandra/replication/ForwardedWrite.java @@ -246,7 +246,15 @@ private static void applyLocallyAndForwardToReplicas(Mutation mutation, Set implements Iterable @@ -271,4 +276,36 @@ public Log2OffsetsMap.Immutable build() } } } + + public static final AsymmetricUnversionedSerializer, Immutable> serializer = new AsymmetricUnversionedSerializer<>() + { + @Override + public void serialize(Log2OffsetsMap map, DataOutputPlus out) throws IOException + { + Long2ObjectHashMap offsetMap = map.asMap(); + out.writeUnsignedVInt32(offsetMap.size()); + for (Offsets offsets : offsetMap.values()) + offsets.serialize(out); + } + + @Override + public Immutable deserialize(DataInputPlus in) throws IOException + { + int count = in.readUnsignedVInt32(); + Immutable.Builder builder = new Immutable.Builder(); + for (int i = 0; i < count; i++) + builder.add(Offsets.deserialize(in)); + return builder.build(); + } + + @Override + public long serializedSize(Log2OffsetsMap map) + { + Long2ObjectHashMap offsetMap = map.asMap(); + long size = TypeSizes.sizeofUnsignedVInt(offsetMap.size()); + for (Offsets offsets : offsetMap.values()) + size += offsets.serializedSize(); + return size; + } + }; } diff --git a/src/java/org/apache/cassandra/replication/MutationTrackingService.java b/src/java/org/apache/cassandra/replication/MutationTrackingService.java index 5d1103e69a50..1de6b3696596 100644 --- a/src/java/org/apache/cassandra/replication/MutationTrackingService.java +++ b/src/java/org/apache/cassandra/replication/MutationTrackingService.java @@ -19,13 +19,11 @@ import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Optional; import 
java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; @@ -61,12 +59,10 @@ import org.apache.cassandra.dht.AbstractBounds; import org.apache.cassandra.dht.Bounds; import org.apache.cassandra.dht.Range; -import org.apache.cassandra.dht.Splitter; import org.apache.cassandra.dht.Token; import org.apache.cassandra.exceptions.RequestFailure; import org.apache.cassandra.gms.FailureDetector; import org.apache.cassandra.io.sstable.format.SSTableReader; -import org.apache.cassandra.locator.EndpointsForRange; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.metrics.MutationTrackingMetrics; import org.apache.cassandra.net.Message; @@ -75,7 +71,6 @@ import org.apache.cassandra.repair.SyncTask; import org.apache.cassandra.repair.SyncTasks; import org.apache.cassandra.schema.KeyspaceMetadata; -import org.apache.cassandra.schema.ReplicationParams; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.SchemaConstants; import org.apache.cassandra.schema.TableId; @@ -86,10 +81,10 @@ import org.apache.cassandra.tcm.Epoch; import org.apache.cassandra.tcm.listeners.ChangeListener; import org.apache.cassandra.tcm.membership.NodeId; -import org.apache.cassandra.tcm.ownership.ReplicaGroups; import org.apache.cassandra.tcm.ownership.VersionedEndpoints; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.MBeanWrapper; +import org.apache.cassandra.utils.concurrent.AsyncPromise; import static com.google.common.base.Preconditions.checkNotNull; import static java.lang.String.format; @@ -97,8 +92,6 @@ import static org.apache.cassandra.concurrent.ExecutorFactory.SimulatorSemantics.NORMAL; import static org.apache.cassandra.cql3.QueryProcessor.executeInternal; -// TODO (expected): persistence (handle restarts) -// TODO (expected): handle topology changes public class MutationTrackingService implements MutationTrackingServiceMBean { public static final String 
MBEAN_NAME = "org.apache.cassandra.db:type=MutationTrackingService"; @@ -161,21 +154,11 @@ public static void shutdown() throws InterruptedException instance().shutdownBlocking(); } - /** - * Split ranges into this many shards. - *

- * REVIEW: Reset back to 1 because for transfers, replicas need to know each others' shards, since transfers are - * sliced to fit within shards. Can we achieve sharding via split range ownership, instead of it being local-only? - *

- * TODO (expected): ability to rebalance / change this constant - */ - private static final int SHARD_MULTIPLIER = 1; - private static final Logger logger = LoggerFactory.getLogger(MutationTrackingService.class); private final TrackedLocalReads localReads = new TrackedLocalReads(); private ConcurrentHashMap keyspaceShards = new ConcurrentHashMap<>(); - private ConcurrentHashMap log2ShardMap = new ConcurrentHashMap<>(); + private final ConcurrentHashMap log2ShardMap = new ConcurrentHashMap<>(); private final ChangeListener tcmListener; // prevents a race between topology changes (shard recreation) and coordinator log creation. @@ -192,6 +175,7 @@ public static void shutdown() throws InterruptedException // a better tradeoff for node replacement, but it seems likely that handling token movements will be simpler // if we use a copy on write pattern for topology changes. // TODO (expected): consider StampedLock or other approaches to avoid theoretical topology change starvation + // TODO (expected): review all instances of taking this lock, minimise the scope of what's done within private final ReentrantReadWriteLock shardLock = new ReentrantReadWriteLock(); private final ReplicatedOffsetsBroadcaster offsetsBroadcaster = new ReplicatedOffsetsBroadcaster(); @@ -285,28 +269,6 @@ public void pauseOffsetBroadcast(boolean pause) offsetsBroadcaster.pauseOffsetBroadcast(pause); } - /** - * Creates a ShardReconciledOffsets containing reconciled offsets and ranges for multiple keyspaces. 
- */ - public ReconciledLogSnapshot snapshotReconciledLogs() - { - ReconciledLogSnapshot.Builder builder = ReconciledLogSnapshot.builder(); - - shardLock.readLock().lock(); - try - { - keyspaceShards.forEach((keyspace, ksShards) -> { - ksShards.collectShardReconciledOffsetsToBuilder(builder); - }); - } - finally - { - shardLock.readLock().unlock(); - } - - return builder.build(); - } - public void registerMetadataListener() { ClusterMetadataService.instance().log().addListener(tcmListener); @@ -346,35 +308,6 @@ public MutationId nextMutationId(String keyspace, Token token) } } - // Requires that ranges is aligned to a single shard - public MutationId nextMutationId(String keyspace, Collection> ranges) - { - shardLock.readLock().lock(); - try - { - KeyspaceShards shards = getOrCreateShards(keyspace); - Shard shard = null; - for (Range range : ranges) - { - Shard curShard = shards.lookUp(range); - if (curShard == null) - throw new UnknownShardException(range, shards.groups); - if (shard == null) - shard = curShard; - else if (shard != curShard) - throw new IllegalStateException(String.format("Cannot generate a mutation ID for ranges (%s) that span across more than one shard (%s, %s)", ranges, shard, curShard)); - } - Preconditions.checkNotNull(shard); - MutationId id = shard.nextId(); - logger.trace("Created new mutation id {}", id); - return id; - } - finally - { - shardLock.readLock().unlock(); - } - } - public void sentWriteRequest(Mutation mutation, IntHashSet toHostIds) { Preconditions.checkArgument(!mutation.id().isNone()); @@ -437,12 +370,14 @@ public void retryFailedTransfer(CoordinatedTransfer transfer, InetAddressAndPort activeReconciler.schedule(transfer.id(), onHost, ActiveLogReconciler.Priority.REGULAR); } - public void updateReplicatedOffsets(String keyspace, Range range, List offsets, boolean durable, InetAddressAndPort onHost) + public void updateReplicatedOffsets(String keyspace, long sinceEpoch, Range range, Participants participants, + List 
offsets, boolean durable, InetAddressAndPort onHost) { shardLock.readLock().lock(); try { - getOrCreateShards(keyspace).updateReplicatedOffsets(range, offsets, durable, onHost); + Shard shard = getOrCreateShard(keyspace, sinceEpoch, range, participants); + shard.updateReplicatedOffsets(offsets, durable, onHost); } finally { @@ -463,16 +398,15 @@ public void updateReplicatedOffsets(String keyspace, Range range, List { - KeyspaceShards ksShards = getOrCreateShards(keyspace); - if (ksShards != null) - ksShards.recordFullyReconciledOffsets(keyspaceOffsets); - }); + return shard.startWriting(mutation); } finally { @@ -480,13 +414,14 @@ public void recordFullyReconciledOffsets(ReconciledLogSnapshot reconciledSnapsho } } - public boolean startWriting(Mutation mutation) + public void finishWriting(Mutation mutation) { + Preconditions.checkArgument(!mutation.id().isNone()); shardLock.readLock().lock(); try { - Preconditions.checkArgument(!mutation.id().isNone()); - return getOrCreateShards(mutation.getKeyspaceName()).startWriting(mutation); + getShard(mutation.id().asLogId()).finishWriting(mutation); + incomingMutations.invokeListeners(mutation.id()); } finally { @@ -494,14 +429,40 @@ public boolean startWriting(Mutation mutation) } } - public void finishWriting(Mutation mutation) + /** + * Must be called exactly once per {@code nextId()} invocation. + */ + public void completeLocalWrite(MutationId id) + { + Preconditions.checkArgument(!id.isNone()); + Shard shard = getShardNullable(id.asLogId()); + if (null == shard) + throw new IllegalStateException(format("Shard for log %s was not found in log2ShardMap", id.asLogId())); + shard.completeLocalWrite(); + } + + /** + * Check the log-to-shard index first; if the log ID is locally unknown, query all peers for the shard metadata, + * then find or create the matching shard locally under the read lock. + *

+ * The returned shard is safe to write to even if a topology change interleaves between this + * call and the subsequent write, because shard recreation carries existing shards forward rather + * than discarding them (see {@link KeyspaceShards#withNewShards}). + */ + @Nonnull + private Shard getOrCreateShardForMutation(Mutation mutation) { + CoordinatorLogId logId = mutation.id().asLogId(); + + Shard shard = getShardNullable(logId); + if (null != shard) + return shard; + + ShardMetadata metadata = queryPeersForShardMetadata(logId, mutation.getKeyspaceName()); shardLock.readLock().lock(); try { - Preconditions.checkArgument(!mutation.id().isNone()); - getOrCreateShards(mutation.getKeyspaceName()).finishWriting(mutation); - incomingMutations.invokeListeners(mutation.id()); + return getOrCreateShard(metadata.keyspace, metadata.sinceEpoch, metadata.range, metadata.participants); } finally { @@ -509,6 +470,33 @@ public void finishWriting(Mutation mutation) } } + /** + * When we see a new coordinator log id for the first time, we may not know what shard to place it into. + * Queries all peers for the shard metadata and returns it; finding or creating the matching local shard is left to the caller. + * Throws if resolution fails entirely. 
+ */ + private ShardMetadata queryPeersForShardMetadata(CoordinatorLogId logId, String keyspace) + { + // Collect peers: current replicas for the keyspace + the log's originating host + Set peers = new HashSet<>(); + KeyspaceShards shards = keyspaceShards.get(keyspace); + if (shards != null) + shards.forEachShard(s -> peers.addAll(s.remoteReplicas())); + + InetAddressAndPort hostOfOrigin = ClusterMetadata.current().directory.endpoint(new NodeId(logId.hostId)); + if (hostOfOrigin != null) + peers.add(hostOfOrigin); + + peers.remove(FBUtilities.getBroadcastAddressAndPort()); + + AsyncPromise promise = ShardMetadataRequest.queryPeers(logId, peers); + promise.awaitUninterruptibly(DatabaseDescriptor.getWriteRpcTimeout(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS); + ShardMetadata metadata = promise.getNow(); + if (metadata == null) + throw new RuntimeException(String.format("Could not resolve shard metadata for log %s in keyspace %s", logId, keyspace)); + return metadata; + } + /** * Register to be notified to an incoming mutation. * @return true if this is the first active listener added for this id @@ -609,7 +597,7 @@ else if (request.operation == StreamOperation.IMPORT) { if (committed) { - keyspaceShards.get(keyspace).lookUp(request.range).finishActivation(bounds, request); + keyspaceShards.get(keyspace).lookUpForActivation(request.range, request.sinceEpoch).finishActivation(bounds, request); incomingMutations.invokeListeners(request.transferId); } } @@ -682,6 +670,7 @@ public long getUnreconciledMutationCount() return count[0]; } + // TODO (expected): what? 
this is not the way; this should grab an immutable snapshot instead public Iterable getShards() { List shards = new ArrayList<>(); @@ -689,7 +678,7 @@ public Iterable getShards() try { keyspaceShards.forEach((keyspace, ksShards) -> { - ksShards.forEachShard(shards::add); + ksShards.forEachShard((shard, into) -> into.add(shard), shards); }); } finally @@ -793,13 +782,14 @@ public void requestMissingMutations(Offsets offsets, InetAddressAndPort forHost, } @Nullable - private Shard getShardNullable(CoordinatorLogId logId) + Shard getShardNullable(CoordinatorLogId logId) { return log2ShardMap.get(logId); } @Nonnull - private Shard getShard(CoordinatorLogId logId) + @VisibleForTesting + Shard getShard(CoordinatorLogId logId) { return Preconditions.checkNotNull(log2ShardMap.get(logId)); } @@ -821,6 +811,44 @@ private KeyspaceShards getOrCreateShards(String keyspace) return keyspaceShards.computeIfAbsent(keyspace, ignore -> KeyspaceShards.make(ksm, csm, this::nextLogId, this::onNewLog)); } + /** + * Find an existing shard matching the response's (epoch, range), or create a new one from the response metadata. + * TODO (expected): validate if I need to persist the newly created shard, if it was just created here + * TODO (expected): validate if this should be called with a shard lock everywhere it's called + * TODO (expected): log2Shard map updates even on failed CAS. Need to clean up there(?); Register callbacks? 
+ */ + @Nonnull + private Shard getOrCreateShard( + String keyspace, long sinceEpoch, Range range, Participants participants) + { + int localNodeId = ClusterMetadata.current().myNodeId().id(); + + if (!participants.contains(localNodeId)) + throw new IllegalStateException("A shard looked up that this node doesn't participate in"); + + // unlikely, but possible to race here, hence the CAS loop + while (true) + { + KeyspaceShards current = getOrCreateShards(keyspace); + Shard shard = current.get(range, sinceEpoch); + if (shard != null) + return shard; + shard = new Shard(localNodeId, keyspace, sinceEpoch, range, participants, this::nextLogId, this::onNewLog); + KeyspaceShards updated = current.withNewShard(shard); + if (keyspaceShards.replace(keyspace, current, updated)) + return shard; + } + } + + @Nonnull + private Shard getShard(String keyspace, long sinceEpoch, Range range) + { + KeyspaceShards shards = keyspaceShards.get(keyspace); + Shard shard = shards != null ? shards.get(range, sinceEpoch) : null; + if (shard != null) return shard; + throw new IllegalStateException(format("Shard for keyspace %s, epoch %d, range %s cannot be found", keyspace, sinceEpoch, range)); + } + private long nextLogId() { NodeId nodeId = ClusterMetadata.current().myNodeId(); @@ -832,13 +860,13 @@ private long nextLogId() * Allocate and persist the next host log id. * We only do this on startup and when rotating logs. 
*/ - private int nextHostLogId() + private synchronized int nextHostLogId() { int nextHostLogId = ++prevHostLogId; persistHostLogIdToSystemTable(nextHostLogId); return nextHostLogId; } - private int prevHostLogId; + private volatile int prevHostLogId; public boolean isDurablyReconciled(ShortMutationId id) { @@ -900,17 +928,15 @@ private void onNewClusterMetadata(@Nullable ClusterMetadata prev, ClusterMetadat shardLock.readLock().unlock(); } - shardLock.writeLock().lock(); - ConcurrentHashMap originalLog2ShardMap = log2ShardMap; ConcurrentHashMap originalKeyspaceShards = keyspaceShards; + shardLock.writeLock().lock(); try { + if (!shardUpdateNeeded(keyspaceShards, prev, next)) return; - // recalculating the shards will repopulate this via the existing callbacks - log2ShardMap = new ConcurrentHashMap<>(); - keyspaceShards = applyUpdatedMetadata(keyspaceShards, prev, next, this::nextLogId, this::onNewLog); + keyspaceShards = applyUpdatedMetadata(keyspaceShards, prev, next, this::nextLogId, this::onNewLog, this::onDroppedLog); if (!config.background_reconciliation_enabled) { @@ -922,7 +948,6 @@ private void onNewClusterMetadata(@Nullable ClusterMetadata prev, ClusterMetadat } catch (Throwable t) { - log2ShardMap = originalLog2ShardMap; keyspaceShards = originalKeyspaceShards; throw t; } @@ -955,7 +980,7 @@ private static boolean shardUpdateNeeded(Map current, @N return false; } - private static ConcurrentHashMap applyUpdatedMetadata(Map keyspaceShardsMap, @Nullable ClusterMetadata prev, ClusterMetadata next, LongSupplier logIdProvider, BiConsumer onNewLog) + private static ConcurrentHashMap applyUpdatedMetadata(Map keyspaceShardsMap, @Nullable ClusterMetadata prev, ClusterMetadata next, LongSupplier logIdProvider, BiConsumer onNewLog, BiConsumer onDroppedLog) { Preconditions.checkNotNull(next); @@ -982,13 +1007,15 @@ private static ConcurrentHashMap applyUpdatedMetadata(Ma updated.put(keyspace, current); break; case DROP: - // Don't carry forward the state for the 
dropped keyspace + // clean up the log2ShardMap when a keyspace is dropped + if (current != null) + current.forEachShard(s -> s.forEachLog(onDroppedLog)); break; case REPLICA_GROUP: // if there's an existing keyspace shards instance, update it, otherwise fall through to CREATE if (current != null) { - KeyspaceShards ksShards = current.withUpdatedMetadata(next.schema.getKeyspaceMetadata(keyspace), next, logIdProvider, onNewLog); + KeyspaceShards ksShards = current.withNewShards(next.schema.getKeyspaceMetadata(keyspace), next, logIdProvider, onNewLog); updated.put(keyspace, ksShards); break; } @@ -1013,18 +1040,14 @@ private static ConcurrentHashMap applyUpdatedMetadata(Ma return updated; } - // TODO (expected): when topology and state truncation is implemented, implement cleanup of this map as well private void onNewLog(Shard shard, CoordinatorLog log) { - shardLock.readLock().lock(); - try - { - log2ShardMap.put(log.logId, shard); - } - finally - { - shardLock.readLock().unlock(); - } + log2ShardMap.put(log.logId, shard); + } + + private void onDroppedLog(Shard shard, CoordinatorLog log) + { + log2ShardMap.remove(log.logId, shard); } private void truncateMutationJournal() @@ -1046,41 +1069,10 @@ private void collectDurablyReconciledOffsets(Log2OffsetsMap.Mutable into) public SyncTasks alignToShardBoundaries(Keyspace keyspace, List tasks) { - Preconditions.checkArgument(keyspace.getMetadata().replicationStrategy.replicationType.isTracked(), "Keyspace " + keyspace.getName() + " is not tracked"); - + Preconditions.checkArgument(keyspace.getMetadata().replicationStrategy.replicationType.isTracked(), + "Keyspace " + keyspace.getName() + " is not tracked"); KeyspaceShards shards = keyspaceShards.get(keyspace.getName()); - Map> tasksByShard = new HashMap<>(); - - // Shard ranges do not wrap, so unwrap the task ranges before we start comparing them. 
- for (SyncTask task : unwrapped(tasks)) - { - Set intersectingShards = new HashSet<>(); - shards.forEachIntersectingShard(task.rangesToSync, intersectingShards::add); - for (Shard shard : intersectingShards) - { - // Ensure that we don't expand outside the ranges of the original sync tasks. - Set> intersectingSyncRanges = new HashSet<>(); - for (Range syncRange : task.rangesToSync) - intersectingSyncRanges.addAll(syncRange.intersectionWith(shard.range)); - - if (!intersectingSyncRanges.isEmpty()) - tasksByShard.computeIfAbsent(shard, key -> new ArrayList<>()).add(task.withRanges(intersectingSyncRanges)); - } - } - - SyncTasks into = new SyncTasks(); - - for (Map.Entry> entry : tasksByShard.entrySet()) - { - Shard shard = entry.getKey(); - Collection syncTasks = entry.getValue(); - - // Assign a new transfer ID to each sync task and add to the tasks container - for (SyncTask task : syncTasks) - into.add(shard, task.withTransferId(shard.nextId())); - } - - return into; + return shards.alignToShardBoundaries(tasks); } private static List unwrapped(Collection tasks) @@ -1170,26 +1162,44 @@ static UpdateDecision decisionForTopologyChange(String keyspace, ClusterMetadata } private final String keyspace; - private final Map, Shard> shards; - private final ReplicaGroups groups; + private final ShardIntervalBTree shards; - private transient final Map, Shard> ppShards; + private KeyspaceShards(String keyspace, ShardIntervalBTree shards) + { + this.keyspace = keyspace; + this.shards = shards; + } - private static class ParticipantForRange + private static class ParticipantsForRange { final Participants participants; final VersionedEndpoints.ForRange forRange; - public ParticipantForRange(Participants participants, VersionedEndpoints.ForRange forRange) + public ParticipantsForRange(Participants participants, VersionedEndpoints.ForRange forRange) { this.participants = participants; this.forRange = forRange; } + + @Override + public boolean equals(Object o) + { + if (this == o) 
return true; + if (!(o instanceof ParticipantsForRange)) return false; + ParticipantsForRange that = (ParticipantsForRange) o; + return participants.equals(that.participants) && forRange.equals(that.forRange); + } + + @Override + public int hashCode() + { + return 31 * participants.hashCode() + forRange.hashCode(); + } } - private static Map, ParticipantForRange> calculateParticipantsForRange(KeyspaceMetadata keyspace, ClusterMetadata cluster) + private static Map, ParticipantsForRange> calculateParticipantsForRange(KeyspaceMetadata keyspace, ClusterMetadata cluster) { - Map, ParticipantForRange> result = new HashMap<>(); + Map, ParticipantsForRange> result = new HashMap<>(); cluster.placements.get(keyspace.params.replication).writes.forEach((fullTokenRange, forRange) -> { if (!forRange.endpoints().contains(FBUtilities.getBroadcastAddressAndPort())) return; @@ -1199,218 +1209,188 @@ private static Map, ParticipantForRange> calculateParticipantsForRa participantList.add(cluster.directory.peerId(endpoint).id()); Participants participants = new Participants(participantList); - result.put(fullTokenRange, new ParticipantForRange(participants, forRange)); + result.put(fullTokenRange, new ParticipantsForRange(participants, forRange)); }); return result; } - private static Set> splitRange(Range range) - { - Optional splitter = range.left.getPartitioner().splitter(); - return splitter.isPresent() && SHARD_MULTIPLIER > 1 - ? 
splitter.get().split(range, SHARD_MULTIPLIER) - : Collections.singleton(range); - } - static KeyspaceShards make(KeyspaceMetadata keyspace, ClusterMetadata cluster, LongSupplier logIdProvider, BiConsumer onNewLog) { Preconditions.checkArgument(keyspace.params.replicationType.isTracked() || cluster.mutationTrackingMigrationState.isMigrating(keyspace.name)); - Map, Shard> shards = new HashMap<>(); - Map, VersionedEndpoints.ForRange> groups = new HashMap<>(); + List shards = new ArrayList<>(); - calculateParticipantsForRange(keyspace, cluster).forEach((fullTokenRange, participantForRange) -> { - Participants participants = participantForRange.participants; - VersionedEndpoints.ForRange forRange = participantForRange.forRange; - - Set> ranges = splitRange(fullTokenRange); - - for (Range tokenRange : ranges) - { - shards.put(tokenRange, new Shard(cluster.myNodeId().id(), keyspace.name, tokenRange, participants, logIdProvider, onNewLog)); - groups.put(tokenRange, forRange.map(original -> original.withRange(tokenRange))); - } + calculateParticipantsForRange(keyspace, cluster).forEach((tokenRange, participantsForRange) -> { + Participants participants = participantsForRange.participants; + VersionedEndpoints.ForRange forRange = participantsForRange.forRange; + shards.add(new Shard(cluster.myNodeId().id(), keyspace.name, forRange.lastModified().getEpoch(), tokenRange, participants, logIdProvider, onNewLog)); }); - KeyspaceShards keyspaceShards = new KeyspaceShards(keyspace.name, shards, new ReplicaGroups(groups)); + shards.sort(Shard.COMPARATOR); + KeyspaceShards keyspaceShards = new KeyspaceShards(keyspace.name, ShardIntervalBTree.fromSorted(shards)); keyspaceShards.persistToSystemTables(); return keyspaceShards; } - KeyspaceShards(String keyspace, Map, Shard> shards, ReplicaGroups groups) + KeyspaceShards withNewShards(KeyspaceMetadata keyspace, ClusterMetadata cluster, LongSupplier logIdProvider, BiConsumer onNewLog) { - this.keyspace = keyspace; - this.shards = shards; 
- this.groups = groups; - - HashMap, Shard> ppShards = new HashMap<>(); - shards.forEach((range, shard) -> ppShards.put(Range.makeRowRange(range), shard)); - this.ppShards = ppShards; - } - - KeyspaceShards withUpdatedMetadata(KeyspaceMetadata keyspace, ClusterMetadata cluster, LongSupplier logIdProvider, BiConsumer onNewLog) - { - Map, Shard> currentShards = new HashMap<>(shards); - Map, Shard> newShards = new HashMap<>(); - Map, VersionedEndpoints.ForRange> newGroups = new HashMap<>(); - - calculateParticipantsForRange(keyspace, cluster).forEach((fullTokenRange, participantForRange) -> { - Participants participants = participantForRange.participants; - VersionedEndpoints.ForRange forRange = participantForRange.forRange; - - Set> ranges = splitRange(fullTokenRange); + // carry forward all current shards - allow SealingCoordinator to explicitly seal the obsoleted ones + List newShards = new ArrayList<>(); + shards.forEach(newShards::add); - for (Range tokenRange : ranges) - { - Shard currentShard = currentShards.remove(tokenRange); - if (currentShard != null) - { - newShards.put(tokenRange, currentShard.withParticipants(participants)); - newGroups.put(tokenRange, forRange.map(original -> original.withRange(tokenRange))); - } - else - { - newShards.put(tokenRange, new Shard(cluster.myNodeId().id(), keyspace.name, tokenRange, participants, logIdProvider, onNewLog)); - newGroups.put(tokenRange, forRange.map(original -> original.withRange(tokenRange))); - } - } - }); - - newShards.values().forEach(Shard::reportAllLogsToCallback); - - return new KeyspaceShards(keyspace.name, newShards, new ReplicaGroups(newGroups)); - } - - MutationId nextMutationId(Token token) - { - return lookUp(token).nextId(); - } + // add all the new shards for the new topology/epoch + for (Map.Entry, ParticipantsForRange> entry : calculateParticipantsForRange(keyspace, cluster).entrySet()) + { + Range tokenRange = entry.getKey(); + ParticipantsForRange participantsForRange = entry.getValue(); + 
Participants participants = participantsForRange.participants; + VersionedEndpoints.ForRange forRange = participantsForRange.forRange; + + long rangeEpoch = forRange.lastModified().getEpoch(); + Shard existing = shards.get(tokenRange, rangeEpoch); + if (existing == null) + newShards.add(new Shard(cluster.myNodeId().id(), keyspace.name, rangeEpoch, tokenRange, participants, logIdProvider, onNewLog)); + } - void updateReplicatedOffsets(Range range, List offsets, boolean durable, InetAddressAndPort onHost) - { - Shard shard = shards.get(range); - if (shard == null) - return; - shard.updateReplicatedOffsets(offsets, durable, onHost); + newShards.sort(Shard.COMPARATOR); + newShards.forEach(Shard::reportAllLogsToCallback); // TODO (expected): audit + KeyspaceShards keyspaceShards = new KeyspaceShards(keyspace.name, ShardIntervalBTree.fromSorted(newShards)); + keyspaceShards.persistToSystemTables(); + return keyspaceShards; } - boolean startWriting(Mutation mutation) + KeyspaceShards withNewShard(Shard shard) { - return lookUp(mutation).startWriting(mutation); + return new KeyspaceShards(keyspace, shards.with(shard)); } - void finishWriting(Mutation mutation) + MutationId nextMutationId(Token token) { - lookUp(mutation).finishWriting(mutation); + Shard shard = shards.latestShardCovering(token); + if (null == shard) + throw new UnknownShardException(token, keyspace); + return shard.nextMutationId(); } MutationSummary createSummaryForKey(DecoratedKey key, TableId tableId, boolean includePending) { MutationSummary.Builder builder = new MutationSummary.Builder(tableId); - lookUp(key.getToken()).addSummaryForKey(key.getToken(), includePending, builder); + shards.forEachCovering(key.getToken(), shard -> shard.addSummaryForKey(key.getToken(), includePending, builder)); return builder.build(); } MutationSummary createSummaryForRange(AbstractBounds range, TableId tableId, boolean includePending) { MutationSummary.Builder builder = new MutationSummary.Builder(tableId); - 
forEachIntersectingShard(range, shard -> shard.addSummaryForRange(range, includePending, builder)); + shards.forEachIntersecting(range, shard -> shard.addSummaryForRange(range, includePending, builder)); return builder.build(); } - private void forEachIntersectingShard(AbstractBounds bounds, Consumer consumer) + // TODO (expected): I think this should be grabbing the shardLock? (AY) + SyncTasks alignToShardBoundaries(List tasks) { - ppShards.forEach((range, shard) -> { - // TODO (expected): partial workaround - is there a better way to do this? - // SELECT * statements create Bounds[min,min], (PartitionKeyRestrictions.java:L174) not Range(min,min], - // which Ranges generally won't intersect with (Range.java:L148), so contains is used here to make it work - if (bounds.contains(range.right) || range.intersects(bounds)) - consumer.accept(shard); - }); - } + Map> tasksByShard = new HashMap<>(); - private void forEachIntersectingShard(Collection> ranges, Consumer consumer) - { - shards.forEach((range0, shard) -> { - if (shard.range.intersects(ranges)) - consumer.accept(shard); - }); - } + // Shard ranges do not wrap, so unwrap the task ranges before we start comparing them. + for (SyncTask task : unwrapped(tasks)) + { + Set intersectingShards = new HashSet<>(); + shards.forEachIntersecting(task.rangesToSync, intersectingShards::add); + for (Shard shard : intersectingShards) + { + // Ensure that we don't expand outside the ranges of the original sync tasks. 
+ Set> intersectingSyncRanges = new HashSet<>(); + for (Range syncRange : task.rangesToSync) + intersectingSyncRanges.addAll(syncRange.intersectionWith(shard.range)); - void collectShardReconciledOffsetsToBuilder(ReconciledLogSnapshot.Builder builder) - { - ReconciledKeyspaceOffsets.Builder keyspaceBuilder = builder.getKeyspaceBuilder(keyspace); - ppShards.values().forEach(shard -> shard.collectShardReconciledOffsetsToBuilder(keyspaceBuilder)); - } + if (!intersectingSyncRanges.isEmpty()) + tasksByShard.computeIfAbsent(shard, key -> new ArrayList<>()).add(task.withRanges(intersectingSyncRanges)); + } + } - void recordFullyReconciledOffsets(ReconciledKeyspaceOffsets keyspaceOffsets) - { - keyspaceOffsets.forEach((logId, entry) -> { - // Find the shard that should contain this log based on the range - Shard shard = shards.get(entry.range); - if (shard != null) - shard.recordFullyReconciledOffsets(logId, entry.offsets); - }); + SyncTasks into = new SyncTasks(); + + for (Map.Entry> entry : tasksByShard.entrySet()) + { + Shard shard = entry.getKey(); + Collection syncTasks = entry.getValue(); + + // Assign a new transfer ID to each sync task and add to the tasks container + for (SyncTask task : syncTasks) + into.add(shard, task.withTransferId(shard.nextTransferId())); + } + + return into; } void collectDurablyReconciledOffsets(Log2OffsetsMap.Mutable into) { - forEachShard(shard -> shard.collectDurablyReconciledOffsets(into)); + shards.forEach(Shard::collectDurablyReconciledOffsets, into); } + /** + * Invoke {@code consumer} for every Shard in the tree (exactly once for each shard). + */ void forEachShard(Consumer consumer) { - for (Shard shard : shards.values()) - consumer.accept(shard); + shards.forEach(consumer); } - Shard lookUp(Mutation mutation) + /** + * Invoke {@code consumer} for every Shard in the tree (exactly once for each shard). + * Allows one pass-through arg to avoid allocating some capturing lambdas. + */ +

void forEachShard(BiConsumer consumer, P param) { - return lookUp(mutation.key()); + shards.forEach(consumer, param); } - Shard lookUp(DecoratedKey key) + /** + * Note: a range may be a strict subset of the shard's full range, + * so we match by containment rather than exact equality + */ + @Nonnull + Shard lookUpForActivation(Range range, long sinceEpoch) { - return lookUp(key.getToken()); - } + Shard match = shards.foldIntersecting(range, (shard, found) -> { + if (shard.sinceEpoch != sinceEpoch || !shard.range.contains(range)) + return found; + if (found != null) + throw new IllegalStateException(format("Ambiguous shard lookup for keyspace %s, epoch %d, range %s: [%s, %s]", + keyspace, sinceEpoch, range, found, shard)); + return shard; + }, null); - Shard lookUp(Token token) - { - VersionedEndpoints.ForRange forRange = groups.matchToken(token); - if (forRange == null) - throw new UnknownShardException(token, groups); - return shards.get(forRange.range()); + if (match == null) + throw new UnknownShardException(range, keyspace); + + return match; } - Shard lookUp(Range range) + /** + * Look up the shard by *exact* range + sinceEpoch. 
+ */ + @Nullable + Shard get(Range range, long sinceEpoch) { - VersionedEndpoints.ForRange forRange = groups.matchRange(range); - if (forRange == null) - throw new UnknownShardException(range, groups); - return shards.get(forRange.range()); + return shards.get(range, sinceEpoch); } void persistToSystemTables() { - for (Shard shard : shards.values()) shard.persistToSystemTables(); + shards.forEach(Shard::persistToSystemTables); } static List loadFromSystemTables(ClusterMetadata cluster, LongSupplier logIdProvider, BiConsumer onNewLog) { - Map, Shard>> groupedShards = new HashMap<>(); + Map> groupedShards = new HashMap<>(); for (Shard shard : Shard.loadFromSystemTables(cluster.myNodeId().id(), logIdProvider, onNewLog)) - groupedShards.computeIfAbsent(shard.keyspace, k -> new HashMap<>()).put(shard.range, shard); - List keyspaceShards = new ArrayList<>(); - for (Map.Entry, Shard>> entry : groupedShards.entrySet()) - { - ReplicationParams params = cluster.schema.getKeyspaceMetadata(entry.getKey()).params.replication; - ReplicaGroups originalGroups = cluster.placements.get(params).writes; // prior to splitting - - Map, VersionedEndpoints.ForRange> splitGroups = new HashMap<>(); - for (Range splitRange : entry.getValue().keySet()) - splitGroups.put(splitRange, originalGroups.matchRange(splitRange)); + groupedShards.computeIfAbsent(shard.keyspace, k -> new ArrayList<>()).add(shard); - keyspaceShards.add(new KeyspaceShards(entry.getKey(), entry.getValue(), new ReplicaGroups(splitGroups))); - } + List keyspaceShards = new ArrayList<>(); + groupedShards.forEach((keyspace, shards) -> { + shards.sort(Shard.COMPARATOR); + keyspaceShards.add(new KeyspaceShards(keyspace, ShardIntervalBTree.fromSorted(shards))); + }); return keyspaceShards; } } @@ -1437,6 +1417,94 @@ static int loadHostLogIdFromSystemTable() return rows.one().getInt("host_log_id"); } + /* + * Shard sealing + */ + + /** + * Fence id allocation on an obsoleted shard by transitioning state from ACTIVE to SEALING. 
+ */ + void markShardSealing(String keyspace, long sinceEpoch, Range range) + { + shardLock.readLock().lock(); + try + { + getShard(keyspace, sinceEpoch, range).markSealing(); + } + finally + { + shardLock.readLock().unlock(); + } + } + + /** + * @return whether the obsoleted shard has drained its in-flight local writes + */ + boolean isShardDrained(String keyspace, long sinceEpoch, Range range) + { + shardLock.readLock().lock(); + try + { + return getShard(keyspace, sinceEpoch, range).isDrained(); + } + finally + { + shardLock.readLock().unlock(); + } + } + + /** + * @return this node's local-applied ("witnessed by me") offsets for each log of the requested shard + */ + Log2OffsetsMap.Immutable collectLocallyWitnessedOffsets(String keyspace, long sinceEpoch, Range range) + { + shardLock.readLock().lock(); + try + { + return getShard(keyspace, sinceEpoch, range).collectLocallyWitnessedOffsets(); + } + finally + { + shardLock.readLock().unlock(); + } + } + + /** + * @return whether the specified shard has locally applied every offset in {@code offsets}. + */ + boolean hasWitnessed(String keyspace, long sinceEpoch, Range range, Log2OffsetsMap offsets) + { + shardLock.readLock().lock(); + try + { + return getShard(keyspace, sinceEpoch, range).hasWitnessed(offsets); + } + finally + { + shardLock.readLock().unlock(); + } + } + + /** + * Promote an obsoleted shard from SEALING to SEALED. 
+ */ + void markShardSealed(String keyspace, long sinceEpoch, Range range) + { + shardLock.readLock().lock(); + try + { + getShard(keyspace, sinceEpoch, range).markSealed(); + } + finally + { + shardLock.readLock().unlock(); + } + } + + /* + * Background processes + */ + private static class BackgroundReconciler { void start() @@ -1475,6 +1543,9 @@ private void run(KeyspaceShards shards) private void run(Shard shard) { + if (shard.isSealed()) + return; + try { List missing = shard.collectLocallyMissingOffsets(); @@ -1559,11 +1630,14 @@ public void run(boolean durable) private void run(KeyspaceShards shards, boolean durable) { if (!isPaused) - shards.forEachShard(sh -> run(sh, durable)); + shards.forEachShard(this::run, durable); } private void run(Shard shard, boolean durable) { + if (shard.isSealed()) + return; + BroadcastLogOffsets replicatedOffsets = shard.collectReplicatedOffsets(durable); if (replicatedOffsets.isEmpty()) return; @@ -1696,20 +1770,15 @@ public static KeyspaceShards getKeyspaceShards(MutationTrackingService service, */ public static KeyspaceShards createTestKeyspaceShards(String keyspace, Set> shardRanges) { - Map, Shard> shards = new HashMap<>(); - Map, VersionedEndpoints.ForRange> groups = new HashMap<>(); - + List shards = new ArrayList<>(); int localNodeId = 1; AtomicInteger hostLogId = new AtomicInteger(0); LongSupplier logId = () -> CoordinatorLogId.asLong(localNodeId, hostLogId.getAndIncrement()); Participants participants = new Participants(List.of(localNodeId)); for (Range range : shardRanges) - { - shards.put(range, new Shard(localNodeId, keyspace, range, participants, logId, (s, l) -> {})); - groups.put(range, VersionedEndpoints.forRange(Epoch.EMPTY, EndpointsForRange.empty(range))); - } - - return new KeyspaceShards(keyspace, shards, new ReplicaGroups(groups)); + shards.add(new Shard(localNodeId, keyspace, Epoch.EMPTY.getEpoch(), range, participants, logId, (s, l) -> {})); + shards.sort(Shard.COMPARATOR); + return new 
KeyspaceShards(keyspace, ShardIntervalBTree.fromSorted(shards)); } /** diff --git a/src/java/org/apache/cassandra/replication/Offsets.java b/src/java/org/apache/cassandra/replication/Offsets.java index 4430a95556f5..9f4b33dacef7 100644 --- a/src/java/org/apache/cassandra/replication/Offsets.java +++ b/src/java/org/apache/cassandra/replication/Offsets.java @@ -747,7 +747,7 @@ public Immutable create(CoordinatorLogId logId, int[] bounds, int size) return new Immutable(logId, bounds, size); } }; - private static int[] EMPTY = new int[0]; + private static final int[] EMPTY = new int[0]; public Immutable(CoordinatorLogId logId, int[] bounds) { @@ -1569,31 +1569,46 @@ public interface RangeIterator @Override public void serialize(Offsets.Immutable offsets, DataOutputPlus out) throws IOException { - CoordinatorLogId.serializer.serialize(offsets.logId, out); - out.writeInt(offsets.size); - for (int i = 0; i < offsets.size; i++) - out.writeInt(offsets.bounds[i]); + offsets.serialize(out); } @Override public Offsets.Immutable deserialize(DataInputPlus in) throws IOException { - CoordinatorLogId logId = CoordinatorLogId.serializer.deserialize(in); - int size = in.readInt(); - Preconditions.checkArgument(size >= 0 && size % 2 == 0); - int[] bounds = new int[size]; - for (int i = 0; i < size; i++) - bounds[i] = in.readInt(); - return new Offsets.Immutable(logId, bounds); + return Offsets.deserialize(in); } @Override public long serializedSize(Offsets.Immutable offsets) { - long size = CoordinatorLogId.serializer.serializedSize(offsets.logId); - size += TypeSizes.sizeof(offsets.size); - size += (long) TypeSizes.INT_SIZE * offsets.size; - return size; + return offsets.serializedSize(); } }; + + void serialize(DataOutputPlus out) throws IOException + { + CoordinatorLogId.serializer.serialize(logId, out); + out.writeUnsignedVInt32(size); + for (int i = 0; i < size; i++) + out.writeInt(bounds[i]); + } + + static Offsets.Immutable deserialize(DataInputPlus in) throws IOException + { 
+ CoordinatorLogId logId = CoordinatorLogId.serializer.deserialize(in); + int size = in.readUnsignedVInt32(); + Preconditions.checkArgument(size >= 0 && size % 2 == 0); + int[] bounds = new int[size]; + for (int i = 0; i < size; i++) + bounds[i] = in.readInt(); + return new Offsets.Immutable(logId, bounds); + } + + long serializedSize() + { + long size = CoordinatorLogId.serializer.serializedSize(logId); + size += TypeSizes.sizeofUnsignedVInt(this.size); + size += (long) TypeSizes.INT_SIZE * this.size; + return size; + } } diff --git a/src/java/org/apache/cassandra/replication/Participants.java b/src/java/org/apache/cassandra/replication/Participants.java index 5fd0f5bb8d93..502204315d47 100644 --- a/src/java/org/apache/cassandra/replication/Participants.java +++ b/src/java/org/apache/cassandra/replication/Participants.java @@ -17,12 +17,18 @@ */ package org.apache.cassandra.replication; +import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.Set; import org.agrona.collections.IntHashSet; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.ArraySerializers; + public class Participants { private final int[] hosts; @@ -36,6 +42,12 @@ public class Participants this.hosts = hosts; } + private Participants(int[] hosts) + { + Arrays.sort(hosts); + this.hosts = hosts; + } + int size() { return hosts.length; @@ -76,11 +88,38 @@ public boolean equals(Object o) return Arrays.equals(this.hosts, that.hosts); } - Set asSet() + @Override + public int hashCode() + { + return Arrays.hashCode(hosts); + } + + public Set asSet() { IntHashSet set = new IntHashSet(hosts.length); for (int host : hosts) set.add(host); return set; } + + public static final UnversionedSerializer serializer = new UnversionedSerializer<>() + { + @Override + public void serialize(Participants participants, DataOutputPlus out) 
throws IOException + { + ArraySerializers.serializeVIntArray(participants.hosts, out); + } + + @Override + public Participants deserialize(DataInputPlus in) throws IOException + { + return new Participants(ArraySerializers.deserializeVIntArray(in)); + } + + @Override + public long serializedSize(Participants participants) + { + return ArraySerializers.serializedVIntArraySize(participants.hosts); + } + }; } diff --git a/src/java/org/apache/cassandra/replication/ReconciledKeyspaceOffsets.java b/src/java/org/apache/cassandra/replication/ReconciledKeyspaceOffsets.java deleted file mode 100644 index 372c5ce3c519..000000000000 --- a/src/java/org/apache/cassandra/replication/ReconciledKeyspaceOffsets.java +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.cassandra.replication; - -import java.io.IOException; -import java.util.Collection; -import java.util.Objects; -import java.util.function.BiConsumer; - -import org.agrona.collections.Long2ObjectHashMap; - -import org.apache.cassandra.dht.AbstractBounds; -import org.apache.cassandra.dht.IPartitioner; -import org.apache.cassandra.dht.Range; -import org.apache.cassandra.dht.Token; -import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.io.util.DataOutputPlus; -import org.apache.cassandra.utils.CollectionSerializers; -import org.apache.cassandra.utils.Int64Serializer; - -public class ReconciledKeyspaceOffsets -{ - /** - * Simple data holder for offsets and their associated range - */ - static class Entry - { - public final Offsets.Immutable offsets; - public final Range range; - - public Entry(Offsets.Immutable offsets, Range range) - { - this.offsets = offsets; - this.range = range; - } - - @Override - public boolean equals(Object o) - { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Entry entry = (Entry) o; - return Objects.equals(offsets, entry.offsets) && Objects.equals(range, entry.range); - } - - @Override - public int hashCode() - { - return Objects.hash(offsets, range); - } - - @Override - public String toString() - { - return "LogEntry{offsets=" + offsets + ", range=" + range + '}'; - } - } - - private final Long2ObjectHashMap logEntries; - - private ReconciledKeyspaceOffsets(Long2ObjectHashMap logEntries) - { - this.logEntries = logEntries; - } - - public boolean isFullyReconciled(ShortMutationId mutationId) - { - Entry entry = logEntries.get(mutationId.logId()); - return entry != null && entry.offsets.contains(mutationId.offset()); - } - - public boolean isFullyReconciled(long logId, Offsets.Immutable offsets) - { - Entry entry = logEntries.get(logId); - if (entry == null) - return false; - - Offsets.RangeIterator diff = 
Offsets.difference(offsets.rangeIterator(), entry.offsets.rangeIterator()); - return !diff.tryAdvance(); - } - - public Offsets.Immutable get(CoordinatorLogId logId) - { - Entry entry = logEntries.get(logId.asLong()); - return entry != null ? entry.offsets : null; - } - - public Range getRange(CoordinatorLogId logId) - { - Entry entry = logEntries.get(logId.asLong()); - return entry != null ? entry.range : null; - } - - public Entry getLogEntry(CoordinatorLogId logId) - { - return logEntries.get(logId.asLong()); - } - - public Long2ObjectHashMap getAllOffsets() - { - Long2ObjectHashMap result = new Long2ObjectHashMap<>(); - logEntries.forEachLong((logId, entry) -> result.put(logId, entry.offsets)); - return result; - } - - public Long2ObjectHashMap> getAllRanges() - { - Long2ObjectHashMap> result = new Long2ObjectHashMap<>(); - logEntries.forEachLong((logId, entry) -> result.put(logId, entry.range)); - return result; - } - - void forEach(BiConsumer consumer) - { - logEntries.forEachLong((logId, entry) -> consumer.accept(new CoordinatorLogId(logId), entry)); - } - - public boolean isEmpty() - { - return logEntries.isEmpty(); - } - - public int size() - { - return logEntries.size(); - } - - public boolean contains(CoordinatorLogId logId) - { - return logEntries.containsKey(logId.asLong()); - } - - /** - * Selects log entries whose ranges intersect with any of the target ranges - * and adds them to the provided builder. 
- * - * @param targetRanges ranges to intersect with - * @param builder builder to add intersecting entries to - */ - void selectIntersecting(Collection> targetRanges, Builder builder) - { - logEntries.forEachLong((logId, entry) -> { - Range logRange = entry.range; - for (Range targetRange : targetRanges) - { - if (logRange.intersects(targetRange)) - { - builder.put(new CoordinatorLogId(logId), entry.offsets, entry.range); - break; - } - } - }); - } - - @Override - public boolean equals(Object o) - { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - ReconciledKeyspaceOffsets that = (ReconciledKeyspaceOffsets) o; - return Objects.equals(logEntries, that.logEntries); - } - - @Override - public int hashCode() - { - return Objects.hash(logEntries); - } - - @Override - public String toString() - { - return "ReconciledKeyspaceOffsets{" + - "logEntries=" + logEntries + - '}'; - } - - public static class Builder - { - private final Long2ObjectHashMap logEntries = new Long2ObjectHashMap<>(); - - public Builder put(CoordinatorLogId logId, Offsets.Immutable reconciled, Range range) - { - logEntries.put(logId.asLong(), new Entry(reconciled, range)); - return this; - } - - public ReconciledKeyspaceOffsets build() - { - return new ReconciledKeyspaceOffsets(logEntries); - } - } - - public static Builder builder() - { - return new Builder(); - } - - private static final VersionedSerializer entrySerializer = new VersionedSerializer<>() - { - @Override - public void serialize(Entry e, DataOutputPlus out, Version version) throws IOException - { - Offsets.serializer.serialize(e.offsets, out); - AbstractBounds.tokenSerializer.serialize(e.range, out, version.messagingVersion()); - } - - @Override - public Entry deserialize(DataInputPlus in, Version version) throws IOException - { - Offsets.Immutable offsets = Offsets.serializer.deserialize(in); - Range range = (Range) AbstractBounds.tokenSerializer.deserialize(in, IPartitioner.global(), 
version.messagingVersion()); - return new Entry(offsets, range); - } - - @Override - public long serializedSize(Entry e, Version version) - { - return Offsets.serializer.serializedSize(e.offsets) + AbstractBounds.tokenSerializer.serializedSize(e.range, version.messagingVersion()); - } - }; - - public static final VersionedSerializer serializer = new VersionedSerializer<>() - { - @Override - public void serialize(ReconciledKeyspaceOffsets keyspaceOffsets, DataOutputPlus out, Version version) throws IOException - { - CollectionSerializers.serializeMap( - keyspaceOffsets.logEntries, out, version, Int64Serializer.serializer, entrySerializer - ); - } - - @Override - public ReconciledKeyspaceOffsets deserialize(DataInputPlus in, Version version) throws IOException - { - Long2ObjectHashMap logEntries = - CollectionSerializers.deserializeMap(in, version, Int64Serializer.serializer, entrySerializer, i -> new Long2ObjectHashMap<>()); - return new ReconciledKeyspaceOffsets(logEntries); - } - - @Override - public long serializedSize(ReconciledKeyspaceOffsets keyspaceOffsets, Version version) - { - return CollectionSerializers.serializedMapSize(keyspaceOffsets.logEntries, version, Int64Serializer.serializer, entrySerializer); - } - }; -} diff --git a/src/java/org/apache/cassandra/replication/ReconciledLogSnapshot.java b/src/java/org/apache/cassandra/replication/ReconciledLogSnapshot.java deleted file mode 100644 index c4f4d4ea8a56..000000000000 --- a/src/java/org/apache/cassandra/replication/ReconciledLogSnapshot.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.replication; - -import java.io.IOException; -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; -import java.util.function.BiConsumer; - -import com.google.common.collect.ImmutableMap; - -import org.apache.cassandra.dht.Range; -import org.apache.cassandra.dht.Token; -import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.io.util.DataOutputPlus; -import org.apache.cassandra.utils.CollectionSerializers; -import org.apache.cassandra.utils.StringSerializer; - -/** - * Container for reconciled offsets organized by keyspace and shard, including range information for each log. - * This is similar to LogReconciledOffsets but adds range tracking for coordinator logs. - */ -public class ReconciledLogSnapshot -{ - private final ImmutableMap reconciled; - - private ReconciledLogSnapshot(ImmutableMap reconciled) - { - this.reconciled = reconciled; - } - - public boolean isFullyReconciled(String keyspace, ShortMutationId mutationId) - { - ReconciledKeyspaceOffsets keyspaceOffsets = reconciled.get(keyspace); - if (keyspaceOffsets == null) - return true; - return keyspaceOffsets.isFullyReconciled(mutationId); - } - - public void forEach(BiConsumer consumer) - { - reconciled.forEach(consumer); - } - - public Offsets.Immutable get(String keyspace, CoordinatorLogId logId) - { - ReconciledKeyspaceOffsets keyspaceOffsets = reconciled.get(keyspace); - return keyspaceOffsets != null ? 
keyspaceOffsets.get(logId) : null; - } - - public Range getRange(String keyspace, CoordinatorLogId logId) - { - ReconciledKeyspaceOffsets keyspaceOffsets = reconciled.get(keyspace); - return keyspaceOffsets != null ? keyspaceOffsets.getRange(logId) : null; - } - - public ReconciledKeyspaceOffsets getKeyspace(String keyspace) - { - return reconciled.get(keyspace); - } - - public ImmutableMap getAll() - { - return reconciled; - } - - public boolean isEmpty() - { - return size() == 0; - } - - public int size() - { - return reconciled.values().stream().mapToInt(ReconciledKeyspaceOffsets::size).sum(); - } - - /** - * Creates a filtered subset of this snapshot containing only log entries whose ranges - * intersect with the specified keyspace ranges. - * - * @param keyspaceRanges map of keyspace name to ranges to filter by - * @return new ReconciledLogSnapshot containing only intersecting entries - */ - public ReconciledLogSnapshot select(Map>> keyspaceRanges) - { - ReconciledLogSnapshot.Builder builder = ReconciledLogSnapshot.builder(); - - for (Map.Entry>> entry : keyspaceRanges.entrySet()) - { - String keyspace = entry.getKey(); - Collection> targetRanges = entry.getValue(); - - ReconciledKeyspaceOffsets keyspaceOffsets = reconciled.get(keyspace); - if (keyspaceOffsets == null) - continue; - - ReconciledKeyspaceOffsets.Builder keyspaceBuilder = builder.getKeyspaceBuilder(keyspace); - keyspaceOffsets.selectIntersecting(targetRanges, keyspaceBuilder); - } - - return builder.build(); - } - - @Override - public boolean equals(Object o) - { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - ReconciledLogSnapshot that = (ReconciledLogSnapshot) o; - return Objects.equals(reconciled, that.reconciled); - } - - @Override - public int hashCode() - { - return Objects.hash(reconciled); - } - - @Override - public String toString() - { - return "ShardReconciledOffsets{" + - "reconciled=" + reconciled + - '}'; - } - - public static class 
Builder - { - private final Map keyspaceBuilders = new HashMap<>(); - - public Builder put(String keyspace, CoordinatorLogId logId, Offsets.Immutable offsets, Range range) - { - keyspaceBuilders.computeIfAbsent(keyspace, k -> ReconciledKeyspaceOffsets.builder()) - .put(logId, offsets, range); - return this; - } - - ReconciledKeyspaceOffsets.Builder getKeyspaceBuilder(String keyspace) - { - return keyspaceBuilders.computeIfAbsent(keyspace, k -> ReconciledKeyspaceOffsets.builder()); - } - - public ReconciledLogSnapshot build() - { - ImmutableMap.Builder builder = ImmutableMap.builder(); - for (Map.Entry entry : keyspaceBuilders.entrySet()) - { - ReconciledKeyspaceOffsets ks = entry.getValue().build(); - if (!ks.isEmpty()) - builder.put(entry.getKey(), ks); - } - return new ReconciledLogSnapshot(builder.build()); - } - } - - public static Builder builder() - { - return new Builder(); - } - - public static final VersionedSerializer serializer = new VersionedSerializer<>() - { - @Override - public void serialize(ReconciledLogSnapshot offsets, DataOutputPlus out, Version version) throws IOException - { - CollectionSerializers.serializeMap( - offsets.reconciled, out, version, StringSerializer.instance, ReconciledKeyspaceOffsets.serializer - ); - } - - @Override - public ReconciledLogSnapshot deserialize(DataInputPlus in, Version version) throws IOException - { - ImmutableMap.Builder builder = ImmutableMap.builder(); - CollectionSerializers.deserializeMapToConsumer( - in, version, StringSerializer.instance, ReconciledKeyspaceOffsets.serializer, builder::put - ); - return new ReconciledLogSnapshot(builder.build()); - } - - @Override - public long serializedSize(ReconciledLogSnapshot offsets, Version version) - { - return CollectionSerializers.serializedMapSize( - offsets.reconciled, version, StringSerializer.instance, ReconciledKeyspaceOffsets.serializer - ); - } - }; -} diff --git a/src/java/org/apache/cassandra/replication/SealingCoordinator.java 
b/src/java/org/apache/cassandra/replication/SealingCoordinator.java new file mode 100644 index 000000000000..25d72ab174d8 --- /dev/null +++ b/src/java/org/apache/cassandra/replication/SealingCoordinator.java @@ -0,0 +1,1294 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.replication; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.locks.LockSupport; + +import javax.annotation.Nullable; + +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.exceptions.RequestFailure; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.EndpointsByReplica; +import org.apache.cassandra.locator.EndpointsForRange; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.RangesAtEndpoint; +import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.ownership.DataPlacement; +import org.apache.cassandra.tcm.ownership.MovementMap; +import org.apache.cassandra.tcm.ownership.PlacementDeltas; +import org.apache.cassandra.utils.CollectionSerializers; +import 
org.apache.cassandra.utils.FBUtilities; +import org.apache.cassandra.utils.concurrent.AsyncPromise; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.concurrent.FutureCombiner; + +import static java.lang.String.format; +import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.apache.cassandra.utils.Clock.Global.nanoTime; + +/* + * Sealing {@link Shard}s once a topology change obsoletes them. + * + * Happy Path Sequence: + * 1. Bring every participant up to (at least) this seal's epoch and fence allocation, so that everyone + * stops writing to the old shards; then wait until all in-progress mutations have been done. + * 2. Same for tracked transfers + * 3. Wait until all participating replicas have reconciled the union of their witnessed ids + * 4. Mark the shard as fully sealed + * 5. Stop including the shard in mutation summaries for read requests. + * + * Unhappy Path Sequence (not yet implemented): + * 1. Wait until everyone who's up stops writing to the old shards and all in-progress mutations have been done + * 2. Same for tracked transfers + * 3. Wait until all *live* participating replicas have reconciled the union of their witnessed ids + * 4. Mark the shard as partially sealed, label it with the set of participants that were able to seal + * 5. When a down node comes up, have it talk to other nodes and learn about sealing, and what mutations/logs + * they have exchanged. Diff with what we have - in system tables and in the journal (though system table + * plus local replay should give us an up-to-date metadata state for the shard) + */ +public final class SealingCoordinator +{ + /** + * Discover all shards obsoleted by an in-flight bootstrap or host replacement. 
+ */ + private static Set discoverShardsObsoletedByAcquisition(ClusterMetadata metadata, MovementMap movements) + { + List>> futures = new ArrayList<>(); + + for (KeyspaceMetadata ksm : metadata.schema.getKeyspaces()) + { + if (!ksm.params.replicationType.isTracked()) + continue; + + EndpointsByReplica acquired = movements.get(ksm.params.replication); + if (acquired.isEmpty()) // when the keyspace is not replicated to this DC + continue; + + DataPlacement placement = metadata.placements.get(ksm.params.replication); + + for (Map.Entry entry : acquired.entrySet()) + { + Range range = entry.getKey().range(); + Set endpoints = entry.getValue().endpoints(); + // cutoff: the epoch this node was added to the write placement for the range (its START_JOIN epoch); + // shards over the range with sinceEpoch < cutoff are obsoleted by this join. + long beforeEpoch = placement.writes.forRange(range).lastModified().getEpoch(); + futures.add(FetchShards.fetch(ksm.name, beforeEpoch, range, endpoints)); + } + } + + Set shards = new HashSet<>(); + try + { + for (Set fetched : FutureCombiner.allOf(futures).get(DatabaseDescriptor.getRpcTimeout(MILLISECONDS), MILLISECONDS)) + shards.addAll(fetched); + } + catch (InterruptedException | ExecutionException | TimeoutException e) + { + throw new RuntimeException("Failed to fetch shards to seal", e); + } + return shards; + } + + /** + * Discover and seal all shards obsoleted by an in-flight bootstrap (join) before the joining node + * starts streaming. Driven by the joining node from {@code BootstrapAndJoin.bootstrap()}, + * right after paxos repair and before any {@code StreamPlan} is built. + *

+ * Happy path only: assumes every participant of the obsoleted shards is alive. + *

+ * The joining node is acquiring the ranges in {@code movements}. For each acquired range, the + * cutoff epoch is the epoch at which this node was added to the write placement for that range + * (the {@code START_JOIN} epoch, read from the write placement's {@code lastModified}). It equals + * the just-created shard's {@code sinceEpoch}, so every shard over the range with + * {@code sinceEpoch < cutoff} is obsoleted by this join and must be sealed. + * + * @param metadata cluster metadata supplying the tracked keyspaces and their write placements + * @param movements the ranges being acquired, mapped to the endpoints that are asked for obsoleted shards + * @throws RuntimeException if discovering the shards or any of the sealing steps fails or times out + */ + public static void discoverAndSealBootstrapObsoletedShards(ClusterMetadata metadata, MovementMap movements) + { + seal(discoverShardsObsoletedByAcquisition(metadata, movements)); + } + + /** + * Locally collect and globally seal the intermediate shards that were created during START_JOIN + * (adding the new node to write placements) and are now obsoleted by the shards created during + * FINISH_JOIN (removing old owners from write placements). + *

+ * TODO (expected): this MUST eventually run while the affected ranges are still locked, i.e. between the + * removeFromWrites placement change and the range unlock. FinishJoin currently applies removeFromWrites + * and the unlock atomically in a single transformation, so there is no committed "dropped-but-locked" + * epoch to hook. Until then, this runs just after FINISH_JOIN, when the ranges are already unlocked. + * + * @param postFinish cluster metadata as of FINISH_JOIN; its epoch is the cutoff below which shards are collected + * @param finishDelta the FINISH_JOIN placement delta whose write removals identify the affected ranges + */ + public static void collectAndSealStartJoinShards(ClusterMetadata postFinish, PlacementDeltas finishDelta) + { + seal(collectStartJoinShards(postFinish, finishDelta)); + } + + /** + * Collect, from the local {@link MutationTrackingService}, the unsealed shards of every tracked keyspace that + * (a) have {@code sinceEpoch} below the epoch of {@code postFinish}, (b) cover exactly the range of a write + * replica removed by {@code finishDelta}, and (c) have as participants the current write endpoints of that + * range plus the removed endpoint. + */ + private static Set collectStartJoinShards(ClusterMetadata postFinish, PlacementDeltas finishDelta) + { + long finishEpoch = postFinish.epoch.getEpoch(); + Set shards = new HashSet<>(); + for (KeyspaceMetadata ksm : postFinish.schema.getKeyspaces()) + { + if (!ksm.params.replicationType.isTracked()) + continue; + + DataPlacement placement = postFinish.placements.get(ksm.params.replication); + finishDelta.get(ksm.params.replication).writes.removals.flattenValues().forEach(removed -> { + Set nodeIDs = new HashSet<>(); + for (InetAddressAndPort ep : placement.writes.forRange(removed.range()).endpoints()) + nodeIDs.add(postFinish.directory.peerId(ep).id()); + nodeIDs.add(postFinish.directory.peerId(removed.endpoint()).id()); + Participants participants = new Participants(nodeIDs); + + MutationTrackingService.instance().forEachShardInKeyspace(ksm.name, shard -> { + if (shard.sinceEpoch < finishEpoch + && shard.range.equals(removed.range()) + && shard.participants.equals(participants) + && !shard.isSealed()) + shards.add(new ShardMetadata(shard.keyspace, shard.sinceEpoch, shard.range, shard.participants)); + }); + }); + } + return shards; + } + + /** + * Discover and seal the shards obsoleted by an in-flight host replacement before the replacement node + * starts streaming. 
Driven by the replacement node from {@code BootstrapAndJoin.bootstrap()}, + * right after paxos repair and before any {@code StreamPlan} is built. + *

+ * The {@code beingReplaced} node is dead and cannot take part in the sealing, so it needs to be side-stepped: + * its node id is handed to the sealing steps as the participant to leave out, and the surviving replicas + * reconcile and seal among themselves. + * + * @param metadata cluster metadata as of MID_REPLACE (the replaced node is still a member) + * @param movements the ranges the replacement is acquiring, mapped to their (survivor) streaming sources + * @param beingReplaced the (dead) node being replaced + */ + public static void discoverAndSealReplacementObsoletedShards(ClusterMetadata metadata, MovementMap movements, InetAddressAndPort beingReplaced) + { + seal(discoverShardsObsoletedByAcquisition(metadata, movements), metadata.directory.peerId(beingReplaced)); + } + + /** + * Locally collect and globally seal the intermediate shards that were created during START_REPLACE + * (adding the new node to write placements) and are now obsoleted by the shards created during + * FINISH_REPLACE (removing the dead node from write placements). 
+ * + * @param postFinish cluster metadata as of FINISH_REPLACE (the replaced node is already removed) + * @param finishDelta the FINISH_REPLACE placement delta ({@code removeFromWrites}) + * @param replaced the (dead) node that was replaced + */ + public static void collectAndSealStartReplaceShards(ClusterMetadata postFinish, PlacementDeltas finishDelta, NodeId replaced) + { + seal(collectStartReplaceShards(postFinish, finishDelta, replaced), replaced); + } + + /** + * Collect, from the local {@link MutationTrackingService}, the unsealed shards of every tracked keyspace that + * (a) have {@code sinceEpoch} below the epoch of {@code postFinish}, (b) cover exactly the range of a write + * replica removed by {@code finishDelta}, and (c) have as participants the current write endpoints of that + * range plus the {@code replaced} node. + */ + private static Set collectStartReplaceShards(ClusterMetadata postFinish, PlacementDeltas finishDelta, NodeId replaced) + { + long finishEpoch = postFinish.epoch.getEpoch(); + Set shards = new HashSet<>(); + for (KeyspaceMetadata ksm : postFinish.schema.getKeyspaces()) + { + if (!ksm.params.replicationType.isTracked()) + continue; + + DataPlacement placement = postFinish.placements.get(ksm.params.replication); + finishDelta.get(ksm.params.replication).writes.removals.flattenValues().forEach(removed -> { + Set nodeIDs = new HashSet<>(); + for (InetAddressAndPort ep : placement.writes.forRange(removed.range()).endpoints()) + nodeIDs.add(postFinish.directory.peerId(ep).id()); + nodeIDs.add(replaced.id()); + Participants participants = new Participants(nodeIDs); + + MutationTrackingService.instance().forEachShardInKeyspace(ksm.name, shard -> { + if (shard.sinceEpoch < finishEpoch + && shard.range.equals(removed.range()) + && shard.participants.equals(participants) + && !shard.isSealed()) + shards.add(new ShardMetadata(shard.keyspace, shard.sinceEpoch, shard.range, shard.participants)); + }); + }); + } + return shards; + } + + /** + * Discover and seal the shards obsoleted by an in-flight decommission before the leaving node + * starts streaming its data away. Driven by the leaving node from UnbootstrapStreams. + *

+ * Happy path only: assumes every participant of the obsoleted shards is alive. + *

+ * START_LEAVE added new write replicas for each range the leaving node was handing off, creating a new + * shard ({@code sinceEpoch == START_LEAVE epoch}) over each such range. Every shard over those ranges with a + * smaller {@code sinceEpoch} is obsoleted by the leave and must be sealed. The leaving node is a participant + * of all of them (it was a replica of each handed-off range), so they can be discovered locally. + * + * @param postStartLeave cluster metadata as of START_LEAVE (the leaving node is still a member) + * @param startDelta the START_LEAVE placement delta ({@code addToWrites}) + */ + public static void collectAndSealDecommissionObsoletedShards(ClusterMetadata postStartLeave, PlacementDeltas startDelta) + { + seal(collectDecommissionObsoletedShards(postStartLeave, startDelta)); + } + + /** + * Collect, from the local {@link MutationTrackingService}, the unsealed shards of every tracked keyspace that + * cover exactly the range of a write replica added by {@code startDelta}, have {@code sinceEpoch} below that + * range's write-placement {@code lastModified} epoch, and have as participants the range's current write + * endpoints minus the added endpoint. + */ + private static Set collectDecommissionObsoletedShards(ClusterMetadata postStartLeave, PlacementDeltas startDelta) + { + Set shards = new HashSet<>(); + for (KeyspaceMetadata ksm : postStartLeave.schema.getKeyspaces()) + { + if (!ksm.params.replicationType.isTracked()) + continue; + + DataPlacement placement = postStartLeave.placements.get(ksm.params.replication); + startDelta.get(ksm.params.replication).writes.additions.flattenValues().forEach(added -> { + // cutoff: the epoch the new write replica was added for the range (its START_LEAVE epoch, equal to the + // just-created shard's sinceEpoch); shards over the range with sinceEpoch < cutoff are obsoleted. + // Decommission does not split ranges at START_LEAVE, so an addition's range is always an exact key in + // the post-START_LEAVE write placement and matches the obsoleted shard's range exactly. + long beforeEpoch = placement.writes.forRange(added.range()).lastModified().getEpoch(); + + // expected participants of the obsoleted pre-leave shard: the current (over-replicated) write + // replicas of the range minus the replica just added at START_LEAVE. 
This is the mirror of + // collectStartJoinShards, which instead adds the removed replica back to the post-removal set. + // this *might* be unnecessarily paranoid/precise, but I feel better with this extra condition + // in place (AY). + Set nodeIDs = new HashSet<>(); + for (InetAddressAndPort ep : placement.writes.forRange(added.range()).endpoints()) + nodeIDs.add(postStartLeave.directory.peerId(ep).id()); + nodeIDs.remove(postStartLeave.directory.peerId(added.endpoint()).id()); + Participants participants = new Participants(nodeIDs); + + MutationTrackingService.instance().forEachShardInKeyspace(ksm.name, shard -> { + if (shard.sinceEpoch < beforeEpoch + && shard.range.equals(added.range()) + && shard.participants.equals(participants) + && !shard.isSealed()) + shards.add(new ShardMetadata(shard.keyspace, shard.sinceEpoch, shard.range, shard.participants)); + }); + }); + } + return shards; + } + + /** + * Collect and seal the shards obsoleted by decommission's FINISH_LEAVE: the intermediate, over-replicated shards + * created during START_LEAVE. + *

+ * Driven by the leaving node from {@code UnbootstrapAndLeave}, right after FINISH_LEAVE is committed. + *

+ * Shards 'obsoleted' by the range merge folded into a wider final range with the same participants are + * intentionally left alone: the leaving node was never a writer of those ranges, so they are not among + * the leaving node's removed ranges. All of their participants remain alive. + * + * @param postFinish cluster metadata as of FINISH_LEAVE (leaving node already removed) + * @param finishDelta the FINISH_LEAVE placement delta ({@code removeFromWrites}) + */ + public static void collectAndSealFinishLeaveShards(ClusterMetadata postFinish, PlacementDeltas finishDelta) + { + seal(collectFinishLeaveShards(postFinish, finishDelta)); + } + + /** + * Collect, from the local {@link MutationTrackingService}, the unsealed shards of every tracked keyspace that + * have {@code sinceEpoch} below the epoch of {@code postFinish}, cover exactly a range removed from this node's + * writes by {@code finishDelta}, and count the removed endpoint among their participants. + */ + private static Set collectFinishLeaveShards(ClusterMetadata postFinish, PlacementDeltas finishDelta) + { + long finishEpoch = postFinish.epoch.getEpoch(); + Set shards = new HashSet<>(); + for (KeyspaceMetadata ksm : postFinish.schema.getKeyspaces()) + { + if (!ksm.params.replicationType.isTracked()) + continue; + + RangesAtEndpoint writeRemovals = + finishDelta.get(ksm.params.replication).writes.removals.get(FBUtilities.getBroadcastAddressAndPort()); + for (Replica removed : writeRemovals) + { + int leavingId = postFinish.directory.peerId(removed.endpoint()).id(); + + MutationTrackingService.instance().forEachShardInKeyspace(ksm.name, shard -> { + if (shard.sinceEpoch < finishEpoch + && shard.range.equals(removed.range()) + && shard.participants.contains(leavingId) + && !shard.isSealed()) + { + shards.add(new ShardMetadata(shard.keyspace, shard.sinceEpoch, shard.range, shard.participants)); + } + }); + } + } + return shards; + } + + /** + * Seal the given shards with every participant taking part. + */ + private static void seal(Set shards) + { + seal(shards, null); + } + + /** + * Seal the given shards by running the four phases in order: initiate, drain, reconcile, complete. + * + * @param shards the shards to seal + * @param withoutNode a participant to side-step in every phase (e.g. a dead node being replaced), or {@code null} + */ + private static void seal(Set shards, @Nullable NodeId withoutNode) + { + initiate(shards, withoutNode); // ACTIVE -> SEALING for each Shard + drain(shards, withoutNode); // drain in-flight local writes + reconcile(shards, withoutNode); // wait for logs to reconcile + complete(shards, withoutNode); // SEALING -> SEALED for each 
Shard + journal flush + } + + /** + * Start sealing on all shards in parallel and wait, for at most the rpc timeout, until every one is acknowledged. + * + * @throws RuntimeException if any initiation fails, the wait times out, or the calling thread is interrupted + */ + private static void initiate(Set shards, @Nullable NodeId withoutNode) + { + List> futures = new ArrayList<>(shards.size()); + for (ShardMetadata shard : shards) + futures.add(initiate(shard, withoutNode)); + try + { + FutureCombiner.allOf(futures).get(DatabaseDescriptor.getRpcTimeout(MILLISECONDS), MILLISECONDS); + } + catch (InterruptedException e) + { + // restore the interrupt status so callers further up the stack can still observe it + Thread.currentThread().interrupt(); + throw new RuntimeException("Failed to initiate sealing", e); + } + catch (ExecutionException | TimeoutException e) + { + throw new RuntimeException("Failed to initiate sealing", e); + } + } + + /** + * Fence allocation on every participant of the obsoleted shard (ACTIVE -> SEALING) by marking the shard + * SEALING on each. + * TODO (expected): this assumes every participant has already enacted the topology change that obsoleted the shard (so + * it no longer allocates new ids on it). The ProgressBarrier gating bootstrap()/MID_JOIN only waits for + * EACH_QUORUM of the affected replicas, NOT all live replicas, so a live-but-lagging participant could still + * be on the old shard. Bringing each participant up to the seal epoch must be done per-replica (not + * per-shard) and will be added separately. TL;DR - add a step that guarantees the remaining replicas' epochs are up to speed. 
+ */ + private static AsyncPromise initiate(ShardMetadata shard, @Nullable NodeId withoutNode) + { + return InitSealing.initiate(shard.keyspace, shard.sinceEpoch, shard.range, toEndpoints(shard.participants, withoutNode)); + } + + /** + * Poll all shards for drain status roughly once a second until every one reports drained. + * + * @throws RuntimeException if shards are still pending once the rpc timeout has elapsed, or a poll fails + */ + private static void drain(Set shards, @Nullable NodeId withoutNode) + { + long deadlineNanos = nanoTime() + DatabaseDescriptor.getRpcTimeout(NANOSECONDS); + Set pending = new HashSet<>(shards); + while (true) + { + // non-blocking poll: each participant reports its drain status immediately; retry until all drained + pending.removeIf(shard -> drain(shard, withoutNode)); + if (pending.isEmpty()) + return; + if (nanoTime() >= deadlineNanos) + throw new RuntimeException("Timed out draining shards; still pending: " + pending); + LockSupport.parkNanos(SECONDS.toNanos(1)); + } + } + + /** + * Poll every participant of the SEALING shard once for drain status; true iff all report that no mutation + * id allocated before the SEALING fence is still applying locally. + * + * @throws RuntimeException if the poll fails, times out, or the calling thread is interrupted + */ + private static boolean drain(ShardMetadata shard, @Nullable NodeId withoutNode) + { + try + { + return Drain.poll(shard.keyspace, shard.sinceEpoch, shard.range, toEndpoints(shard.participants, withoutNode)) + .get(DatabaseDescriptor.getRpcTimeout(MILLISECONDS), MILLISECONDS); + } + catch (InterruptedException e) + { + // restore the interrupt status so callers further up the stack can still observe it + Thread.currentThread().interrupt(); + throw new RuntimeException(format("Failed to poll drain for shard %s", shard), e); + } + catch (ExecutionException | TimeoutException e) + { + throw new RuntimeException(format("Failed to poll drain for shard %s", shard), e); + } + } + + /** + * For each of the shards, capture the witnessed union then poll each participant until it has + * caught up and witnessed the entire offset union itself. 
+ */ + private static void reconcile(Set shards, @Nullable NodeId withoutNode) + { + for (ShardMetadata shard : shards) + reconcile(shard, withoutNode); + } + + private static void reconcile(ShardMetadata shard, @Nullable NodeId withoutNode) + { + // capture shard's witnessed offsets once + Log2OffsetsMap.Mutable offsets = captureWitnessedOffsets(shard, withoutNode); + // poll every participant until they've each witnessed the union of offsets + for (InetAddressAndPort endpoint : toEndpoints(shard.participants, withoutNode)) + pollUntilWitnesses(shard, offsets, endpoint); + } + + private static Log2OffsetsMap.Mutable captureWitnessedOffsets(ShardMetadata shard, @Nullable NodeId withoutNode) + { + List endpoints = toEndpoints(shard.participants, withoutNode); + try + { + return ReconcileCapture.capture(shard.keyspace, shard.sinceEpoch, shard.range, endpoints) + .get(DatabaseDescriptor.getRpcTimeout(MILLISECONDS), MILLISECONDS); + } + catch (InterruptedException | ExecutionException | TimeoutException e) + { + throw new RuntimeException(format("Failed to capture witnessed offsets for shard %s", shard), e); + } + } + + private static void pollUntilWitnesses(ShardMetadata shard, Log2OffsetsMap.Mutable offsets, InetAddressAndPort endpoint) + { + // TODO (expected): use a longer timeout + long deadlineNanos = nanoTime() + DatabaseDescriptor.getRpcTimeout(NANOSECONDS); + + while (true) + { + if (nanoTime() >= deadlineNanos) + throw new RuntimeException("Timed out reconciling shard: " + shard); + try + { + boolean witnessed = + ReconcilePoll.poll(shard.keyspace, shard.sinceEpoch, shard.range, offsets, endpoint) + .get(DatabaseDescriptor.getRpcTimeout(MILLISECONDS), MILLISECONDS); + if (witnessed) + return; + } + catch (InterruptedException | ExecutionException | TimeoutException e) + { + throw new RuntimeException(format("Failed to poll reconcile for shard %s", shard), e); + } + + LockSupport.parkNanos(SECONDS.toNanos(1)); + } + } + + private static void complete(Set 
shards, @Nullable NodeId withoutNode) + { + List> futures = new ArrayList<>(shards.size()); + for (ShardMetadata shard : shards) + futures.add(complete(shard, withoutNode)); + + try + { + FutureCombiner.allOf(futures).get(DatabaseDescriptor.getRpcTimeout(MILLISECONDS), MILLISECONDS); + } + catch (InterruptedException | ExecutionException | TimeoutException e) // NOTE(review): InterruptedException is wrapped without restoring the thread's interrupt flag + { + throw new RuntimeException("Failed to complete sealing", e); + } + } + + /** + * Promote every participant of the (drained and reconciled) shard from SEALING to SEALED. + */ + private static AsyncPromise complete(ShardMetadata shard, @Nullable NodeId withoutNode) + { + return CompleteSealing.complete(shard.keyspace, shard.sinceEpoch, shard.range, toEndpoints(shard.participants, withoutNode)); + } + + /** + * A bootstrapping node knows nothing about the shards that currently exist, + * so it must collect the list of shards from the existing replicas before + * it can proceed with the rest of the sealing steps. + */ + public static final class FetchShards + { + public static final class Request + { + final String keyspace; + final long beforeEpoch; + final Range range; + + Request(String keyspace, long beforeEpoch, Range range) + { + this.keyspace = keyspace; + this.beforeEpoch = beforeEpoch; + this.range = range; + } + } + + public static final class Response + { + final Set shards; + + Response(Set shards) + { + this.shards = shards; + } + } + + /** + * Query all replicas in a group in parallel for the shards over {@code range} obsoleted by an + * in-flight topology change (those with {@code sinceEpoch < beforeEpoch}). Resembles + * {@link ShardMetadataRequest#queryPeers}, but waits for every (live) replica to respond and + * merges their deduplicated shards rather than taking the first response. 
+ */ + public static AsyncPromise> fetch( + String keyspace, long beforeEpoch, Range range, Set endpoints) + { + AsyncPromise> promise = new AsyncPromise<>(); + Set merged = ConcurrentHashMap.newKeySet(); + + RequestCallback callback = new RequestCallback<>() + { + private final AtomicInteger remaining = new AtomicInteger(endpoints.size()); + + @Override + public void onResponse(Message msg) + { + merged.addAll(msg.payload.shards); + if (remaining.decrementAndGet() == 0) + promise.trySuccess(merged); + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failure) + { + // happy path assumes all replicas are up; fail the whole fetch if any can't respond + promise.tryFailure(new RuntimeException(format("Failed to fetch shards to seal from %s for keyspace %s: %s", from, keyspace, failure))); + } + + @Override + public boolean invokeOnFailure() + { + return true; + } + }; + + Message message = Message.out(Verb.MT_FETCH_SHARDS_REQ, new Request(keyspace, beforeEpoch, range)); + for (InetAddressAndPort peer : endpoints) + MessagingService.instance().sendWithCallback(message, peer, callback); + + return promise; + } + + public static final IVerbHandler verbHandler = message -> + { + MutationTrackingService.ensureEnabled(); + Request request = message.payload; + Set shards = new HashSet<>(); + MutationTrackingService.instance().forEachShardInKeyspace(request.keyspace, shard -> { + if (shard.sinceEpoch < request.beforeEpoch && shard.range.intersects(request.range) && !shard.isSealed()) + shards.add(new ShardMetadata(shard.keyspace, shard.sinceEpoch, shard.range, shard.participants)); + }); + MessagingService.instance().send(message.responseWith(new Response(shards)), message.from()); + }; + + public static final VersionedSerializer requestSerializer = new VersionedSerializer<>() + { + @Override + public void serialize(Request r, DataOutputPlus out, Version version) throws IOException + { + out.writeUTF(r.keyspace); + out.writeLong(r.beforeEpoch); + 
AbstractBounds.tokenSerializer.serialize(r.range, out, version.messagingVersion()); + } + + @Override + public Request deserialize(DataInputPlus in, Version version) throws IOException + { + String keyspace = in.readUTF(); + long beforeEpoch = in.readLong(); + Range range = (Range) AbstractBounds.tokenSerializer.deserialize(in, IPartitioner.global(), version.messagingVersion()); + return new Request(keyspace, beforeEpoch, range); + } + + @Override + public long serializedSize(Request r, Version version) + { + long size = TypeSizes.sizeof(r.keyspace); + size += TypeSizes.sizeof(r.beforeEpoch); + size += AbstractBounds.tokenSerializer.serializedSize(r.range, version.messagingVersion()); + return size; + } + }; + + public static final VersionedSerializer responseSerializer = new VersionedSerializer<>() + { + @Override + public void serialize(Response r, DataOutputPlus out, Version version) throws IOException + { + CollectionSerializers.serializeCollection(r.shards, out, version, ShardMetadata.serializer); + } + + @Override + public Response deserialize(DataInputPlus in, Version version) throws IOException + { + return new Response(CollectionSerializers.deserializeSet(in, version, ShardMetadata.serializer)); + } + + @Override + public long serializedSize(Response r, Version version) + { + return CollectionSerializers.serializedCollectionSize(r.shards, version, ShardMetadata.serializer); + } + }; + } + + public static final class InitSealing + { + public static final class Request + { + final String keyspace; + final long sinceEpoch; + final Range range; + + Request(String keyspace, long sinceEpoch, Range range) + { + this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; + this.range = range; + } + } + + // cannot use NoPayload, because its serializer cannot be wrapped inside mtEmbedded() + public static final class Response + { + private static final Response instance = new Response(); + } + + /** + * Tell every participant of an obsoleted shard to fence id 
allocation (mark it SEALING) + */ + public static AsyncPromise initiate( + String keyspace, long sinceEpoch, Range range, List endpoints) + { + AsyncPromise promise = new AsyncPromise<>(); + + RequestCallback callback = new RequestCallback<>() + { + private final AtomicInteger remaining = new AtomicInteger(endpoints.size()); + + @Override + public void onResponse(Message msg) + { + if (remaining.decrementAndGet() == 0) + promise.trySuccess(null); + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failure) + { + promise.tryFailure(new RuntimeException(format("Failed to initiate sealing on %s for keyspace %s: %s", from, keyspace, failure))); + } + + @Override + public boolean invokeOnFailure() + { + return true; + } + }; + + Message message = Message.out(Verb.MT_INIT_SEALING_REQ, new Request(keyspace, sinceEpoch, range)); + for (InetAddressAndPort peer : endpoints) + MessagingService.instance().sendWithCallback(message, peer, callback); + + return promise; + } + + public static final IVerbHandler verbHandler = message -> { + MutationTrackingService.ensureEnabled(); + Request request = message.payload; + MutationTrackingService.instance().markShardSealing(request.keyspace, request.sinceEpoch, request.range); + MessagingService.instance().send(message.responseWith(Response.instance), message.from()); + }; + + public static final VersionedSerializer requestSerializer = new VersionedSerializer<>() + { + @Override + public void serialize(Request r, DataOutputPlus out, Version version) throws IOException + { + out.writeUTF(r.keyspace); + out.writeLong(r.sinceEpoch); + AbstractBounds.tokenSerializer.serialize(r.range, out, version.messagingVersion()); + } + + @Override + public Request deserialize(DataInputPlus in, Version version) throws IOException + { + String keyspace = in.readUTF(); + long sinceEpoch = in.readLong(); + Range range = (Range) AbstractBounds.tokenSerializer.deserialize(in, IPartitioner.global(), version.messagingVersion()); 
+ return new Request(keyspace, sinceEpoch, range); + } + + @Override + public long serializedSize(Request r, Version version) + { + long size = TypeSizes.sizeof(r.keyspace); + size += TypeSizes.sizeof(r.sinceEpoch); + size += AbstractBounds.tokenSerializer.serializedSize(r.range, version.messagingVersion()); + return size; + } + }; + + public static final UnversionedSerializer responseSerializer = new UnversionedSerializer<>() + { + @Override + public void serialize(Response r, DataOutputPlus out) + { + } + + @Override + public Response deserialize(DataInputPlus in) + { + return Response.instance; + } + + @Override + public long serializedSize(Response r) + { + return 0; + } + }; + } + + public static final class Drain + { + public static final class Request + { + final String keyspace; + final long sinceEpoch; + final Range range; + + Request(String keyspace, long sinceEpoch, Range range) + { + this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; + this.range = range; + } + } + + public static final class Response + { + final boolean drained; + + Response(boolean drained) + { + this.drained = drained; + } + } + + /** + * Poll every participant of a SEALING shard for whether it has drained its in-flight local + * writes. Succeeds with {@code true} iff every participant reports drained. 
+ */ + public static AsyncPromise poll( + String keyspace, long sinceEpoch, Range range, List endpoints) + { + AsyncPromise promise = new AsyncPromise<>(); + + RequestCallback callback = new RequestCallback<>() + { + private final AtomicInteger remaining = new AtomicInteger(endpoints.size()); + private volatile boolean allDrained = true; + + @Override + public void onResponse(Message msg) + { + if (!msg.payload.drained) + allDrained = false; + if (remaining.decrementAndGet() == 0) + promise.trySuccess(allDrained); + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failure) + { + promise.tryFailure(new RuntimeException(format("Failed to poll drain on %s for keyspace %s: %s", from, keyspace, failure))); + } + + @Override + public boolean invokeOnFailure() + { + return true; + } + }; + + Message message = Message.out(Verb.MT_DRAIN_REQ, new Request(keyspace, sinceEpoch, range)); + for (InetAddressAndPort peer : endpoints) + MessagingService.instance().sendWithCallback(message, peer, callback); + + return promise; + } + + public static final IVerbHandler verbHandler = message -> { + MutationTrackingService.ensureEnabled(); + Request request = message.payload; + boolean drained = MutationTrackingService.instance().isShardDrained(request.keyspace, request.sinceEpoch, request.range); + MessagingService.instance().send(message.responseWith(new Response(drained)), message.from()); + }; + + public static final VersionedSerializer requestSerializer = new VersionedSerializer<>() + { + @Override + public void serialize(Request r, DataOutputPlus out, Version version) throws IOException + { + out.writeUTF(r.keyspace); + out.writeLong(r.sinceEpoch); + AbstractBounds.tokenSerializer.serialize(r.range, out, version.messagingVersion()); + } + + @Override + public Request deserialize(DataInputPlus in, Version version) throws IOException + { + String keyspace = in.readUTF(); + long sinceEpoch = in.readLong(); + Range range = (Range) 
AbstractBounds.tokenSerializer.deserialize(in, IPartitioner.global(), version.messagingVersion()); + return new Request(keyspace, sinceEpoch, range); + } + + @Override + public long serializedSize(Request r, Version version) + { + long size = TypeSizes.sizeof(r.keyspace); + size += TypeSizes.sizeof(r.sinceEpoch); + size += AbstractBounds.tokenSerializer.serializedSize(r.range, version.messagingVersion()); + return size; + } + }; + + public static final UnversionedSerializer responseSerializer = new UnversionedSerializer<>() + { + @Override + public void serialize(Response r, DataOutputPlus out) throws IOException + { + out.writeBoolean(r.drained); + } + + @Override + public Response deserialize(DataInputPlus in) throws IOException + { + return new Response(in.readBoolean()); + } + + @Override + public long serializedSize(Response r) + { + return TypeSizes.sizeof(r.drained); + } + }; + } + + public static final class ReconcileCapture + { + public static final class Request + { + final String keyspace; + final long sinceEpoch; + final Range range; + + Request(String keyspace, long sinceEpoch, Range range) + { + this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; + this.range = range; + } + } + + public static final class Response + { + final Log2OffsetsMap witnessed; + + Response(Log2OffsetsMap witnessed) + { + this.witnessed = witnessed; + } + } + + public static AsyncPromise capture( + String keyspace, long sinceEpoch, Range range, List endpoints) + { + AsyncPromise promise = new AsyncPromise<>(); + Log2OffsetsMap.Mutable union = new Log2OffsetsMap.Mutable(); + + RequestCallback callback = new RequestCallback<>() + { + private final AtomicInteger remaining = new AtomicInteger(endpoints.size()); + + @Override + public void onResponse(Message msg) + { + synchronized (union) + { + union.addAll(msg.payload.witnessed); + } + if (remaining.decrementAndGet() == 0) + promise.trySuccess(union); + } + + @Override + public void onFailure(InetAddressAndPort from, 
RequestFailure failure) + { + promise.tryFailure(new RuntimeException(format("Failed to capture witnessed offsets on %s for keyspace %s: %s", from, keyspace, failure))); + } + + @Override + public boolean invokeOnFailure() + { + return true; + } + }; + + Message message = Message.out(Verb.MT_RECONCILE_CAPTURE_REQ, new Request(keyspace, sinceEpoch, range)); + for (InetAddressAndPort peer : endpoints) + MessagingService.instance().sendWithCallback(message, peer, callback); + + return promise; + } + + public static final IVerbHandler verbHandler = message -> { + MutationTrackingService.ensureEnabled(); + Request request = message.payload; + Log2OffsetsMap witnessed = + MutationTrackingService.instance() + .collectLocallyWitnessedOffsets(request.keyspace, request.sinceEpoch, request.range); + MessagingService.instance().send(message.responseWith(new Response(witnessed)), message.from()); + }; + + public static final VersionedSerializer requestSerializer = new VersionedSerializer<>() + { + @Override + public void serialize(Request r, DataOutputPlus out, Version version) throws IOException + { + out.writeUTF(r.keyspace); + out.writeLong(r.sinceEpoch); + AbstractBounds.tokenSerializer.serialize(r.range, out, version.messagingVersion()); + } + + @Override + public Request deserialize(DataInputPlus in, Version version) throws IOException + { + String keyspace = in.readUTF(); + long sinceEpoch = in.readLong(); + Range range = (Range) AbstractBounds.tokenSerializer.deserialize(in, IPartitioner.global(), version.messagingVersion()); + return new Request(keyspace, sinceEpoch, range); + } + + @Override + public long serializedSize(Request r, Version version) + { + long size = TypeSizes.sizeof(r.keyspace); + size += TypeSizes.sizeof(r.sinceEpoch); + size += AbstractBounds.tokenSerializer.serializedSize(r.range, version.messagingVersion()); + return size; + } + }; + + public static final UnversionedSerializer responseSerializer = new UnversionedSerializer<>() + { + @Override + 
public void serialize(Response r, DataOutputPlus out) throws IOException + { + Log2OffsetsMap.serializer.serialize(r.witnessed, out); + } + + @Override + public Response deserialize(DataInputPlus in) throws IOException + { + return new Response(Log2OffsetsMap.serializer.deserialize(in)); + } + + @Override + public long serializedSize(Response r) + { + return Log2OffsetsMap.serializer.serializedSize(r.witnessed); + } + }; + } + + public static final class ReconcilePoll + { + public static final class Request + { + final String keyspace; + final long sinceEpoch; + final Range range; + final Log2OffsetsMap offsets; + + Request(String keyspace, long sinceEpoch, Range range, Log2OffsetsMap offsets) + { + this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; + this.range = range; + this.offsets = offsets; + } + } + + public static final class Response + { + final boolean witnessed; + + Response(boolean witnessed) + { + this.witnessed = witnessed; + } + } + + public static AsyncPromise poll( + String keyspace, long sinceEpoch, Range range, Log2OffsetsMap.Mutable offsets, InetAddressAndPort endpoint) + { + AsyncPromise promise = new AsyncPromise<>(); + + RequestCallback callback = new RequestCallback<>() + { + @Override + public void onResponse(Message msg) + { + promise.trySuccess(msg.payload.witnessed); + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failure) + { + promise.tryFailure(new RuntimeException(format("Failed to reconcile poll %s for keyspace %s: %s", from, keyspace, failure))); + } + + @Override + public boolean invokeOnFailure() + { + return true; + } + }; + + MessagingService.instance() + .sendWithCallback(Message.out(Verb.MT_RECONCILE_POLL_REQ, new Request(keyspace, sinceEpoch, range, offsets)), + endpoint, callback); + + return promise; + } + + public static final IVerbHandler verbHandler = message -> { + MutationTrackingService.ensureEnabled(); + Request request = message.payload; + boolean witnessed = + 
MutationTrackingService.instance().hasWitnessed(request.keyspace, request.sinceEpoch, request.range, request.offsets); + MessagingService.instance().send(message.responseWith(new Response(witnessed)), message.from()); + }; + + public static final VersionedSerializer requestSerializer = new VersionedSerializer<>() + { + @Override + public void serialize(Request r, DataOutputPlus out, Version version) throws IOException + { + out.writeUTF(r.keyspace); + out.writeLong(r.sinceEpoch); + AbstractBounds.tokenSerializer.serialize(r.range, out, version.messagingVersion()); + Log2OffsetsMap.serializer.serialize(r.offsets, out); + } + + @Override + public Request deserialize(DataInputPlus in, Version version) throws IOException + { + String keyspace = in.readUTF(); + long sinceEpoch = in.readLong(); + Range range = (Range) AbstractBounds.tokenSerializer.deserialize(in, IPartitioner.global(), version.messagingVersion()); + Log2OffsetsMap.Immutable offsets = Log2OffsetsMap.serializer.deserialize(in); + return new Request(keyspace, sinceEpoch, range, offsets); + } + + @Override + public long serializedSize(Request r, Version version) + { + long size = TypeSizes.sizeof(r.keyspace); + size += TypeSizes.sizeof(r.sinceEpoch); + size += AbstractBounds.tokenSerializer.serializedSize(r.range, version.messagingVersion()); + size += Log2OffsetsMap.serializer.serializedSize(r.offsets); + return size; + } + }; + + public static final UnversionedSerializer responseSerializer = new UnversionedSerializer<>() + { + @Override + public void serialize(Response r, DataOutputPlus out) throws IOException + { + out.writeBoolean(r.witnessed); + } + + @Override + public Response deserialize(DataInputPlus in) throws IOException + { + return new Response(in.readBoolean()); + } + + @Override + public long serializedSize(Response r) + { + return TypeSizes.sizeof(r.witnessed); + } + }; + } + + public static final class CompleteSealing + { + public static final class Request + { + final String keyspace; + 
final long sinceEpoch; + final Range range; + + Request(String keyspace, long sinceEpoch, Range range) + { + this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; + this.range = range; + } + } + + // cannot use NoPayload, because its serializer cannot be wrapped inside mtEmbedded() + public static final class Response + { + private static final Response instance = new Response(); + } + + /** + * Tell every participant of a sealing shard to promote it from SEALING to SEALED. + */ + public static AsyncPromise complete( + String keyspace, long sinceEpoch, Range range, List endpoints) + { + AsyncPromise promise = new AsyncPromise<>(); + + RequestCallback callback = new RequestCallback<>() + { + private final AtomicInteger remaining = new AtomicInteger(endpoints.size()); + + @Override + public void onResponse(Message msg) + { + if (remaining.decrementAndGet() == 0) + promise.trySuccess(null); + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failure) + { + promise.tryFailure(new RuntimeException(format("Failed to complete sealing on %s for keyspace %s: %s", from, keyspace, failure))); + } + + @Override + public boolean invokeOnFailure() + { + return true; + } + }; + + Message message = Message.out(Verb.MT_COMPLETE_SEALING_REQ, new Request(keyspace, sinceEpoch, range)); + for (InetAddressAndPort peer : endpoints) + MessagingService.instance().sendWithCallback(message, peer, callback); + + return promise; + } + + public static final IVerbHandler verbHandler = message -> { + MutationTrackingService.ensureEnabled(); + Request request = message.payload; + + // flush every table in the keyspace so a just-sealed shard's mutations are all made durable as SSTables + List> flushes = new ArrayList<>(); + for (ColumnFamilyStore cfs : Keyspace.open(request.keyspace).getColumnFamilyStores()) + flushes.add(cfs.forceFlush(ColumnFamilyStore.FlushReason.INTERNALLY_FORCED)); + FBUtilities.waitOnFutures(flushes); + + 
MutationTrackingService.instance().markShardSealed(request.keyspace, request.sinceEpoch, request.range); + MessagingService.instance().send(message.responseWith(Response.instance), message.from()); + }; + + public static final VersionedSerializer requestSerializer = new VersionedSerializer<>() + { + @Override + public void serialize(Request r, DataOutputPlus out, Version version) throws IOException + { + out.writeUTF(r.keyspace); + out.writeLong(r.sinceEpoch); + AbstractBounds.tokenSerializer.serialize(r.range, out, version.messagingVersion()); + } + + @Override + public Request deserialize(DataInputPlus in, Version version) throws IOException + { + String keyspace = in.readUTF(); + long sinceEpoch = in.readLong(); + Range range = (Range) AbstractBounds.tokenSerializer.deserialize(in, IPartitioner.global(), version.messagingVersion()); + return new Request(keyspace, sinceEpoch, range); + } + + @Override + public long serializedSize(Request r, Version version) + { + long size = TypeSizes.sizeof(r.keyspace); + size += TypeSizes.sizeof(r.sinceEpoch); + size += AbstractBounds.tokenSerializer.serializedSize(r.range, version.messagingVersion()); + return size; + } + }; + + public static final UnversionedSerializer responseSerializer = new UnversionedSerializer<>() + { + @Override + public void serialize(Response r, DataOutputPlus out) + { + } + + @Override + public Response deserialize(DataInputPlus in) + { + return Response.instance; + } + + @Override + public long serializedSize(Response r) + { + return 0; + } + }; + } + + private static List toEndpoints(Participants participants, @Nullable NodeId withoutNode) + { + ClusterMetadata metadata = ClusterMetadata.current(); + List endpoints = new ArrayList<>(participants.size()); + for (int i = 0, size = participants.size(); i < size; i++) + { + NodeId nodeId = new NodeId(participants.get(i)); + if (!nodeId.equals(withoutNode)) + endpoints.add(metadata.directory.endpoint(nodeId)); + } + return endpoints; + } +} diff --git 
a/src/java/org/apache/cassandra/replication/Shard.java b/src/java/org/apache/cassandra/replication/Shard.java index dd4e34902376..eff0b0709598 100644 --- a/src/java/org/apache/cassandra/replication/Shard.java +++ b/src/java/org/apache/cassandra/replication/Shard.java @@ -26,6 +26,7 @@ import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; +import java.util.concurrent.atomic.AtomicInteger; import java.util.function.BiConsumer; import java.util.function.LongSupplier; @@ -60,19 +61,40 @@ public class Shard { private static final Logger logger = LoggerFactory.getLogger(Shard.class); + /** + * 3 possible shard states: + * - ACTIVE: a shard that is allocating new mutation ids and participates in read reconciliations + * - SEALING: a shard that is being sealed; doesn't allocate new mutation ids, but participates + * in read reconciliations + * - SEALED: a shard that was fully sealed; no new mutation ids get allocated for it, and it doesn't + * get included into mutation summaries. + *

+ * See {@link SealingCoordinator} for more context. + */ + enum State + { + ACTIVE, SEALING, SEALED + } + final int localNodeId; public final String keyspace; + public final long sinceEpoch; public final Range range; public final Participants participants; private final LongSupplier logIdProvider; + private final BiConsumer onNewLog; private final NonBlockingHashMapLong logs; + private volatile CoordinatorLogPrimary currentLocalLog; + private volatile State state = State.ACTIVE; Shard(int localNodeId, String keyspace, + long sinceEpoch, Range range, Participants participants, + State state, List logs, LongSupplier logIdProvider, BiConsumer onNewLog) @@ -81,8 +103,10 @@ public class Shard this.localNodeId = localNodeId; this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; this.range = range; this.participants = participants; + this.state = state; this.logIdProvider = logIdProvider; this.logs = new NonBlockingHashMapLong<>(); this.onNewLog = onNewLog; @@ -94,13 +118,14 @@ public class Shard this.currentLocalLog = createNewPrimaryLog(); } - Shard(int localNodeId, String keyspace, Range range, Participants participants, LongSupplier logIdProvider, BiConsumer onNewLog) + Shard(int localNodeId, String keyspace, long sinceEpoch, Range range, Participants participants, LongSupplier logIdProvider, BiConsumer onNewLog) { - this(localNodeId, keyspace, range, participants, Collections.emptyList(), logIdProvider, onNewLog); + this(localNodeId, keyspace, sinceEpoch, range, participants, State.ACTIVE, Collections.emptyList(), logIdProvider, onNewLog); } Shard(int localNodeId, String keyspace, + long sinceEpoch, Range range, Participants participants, NonBlockingHashMapLong logs, @@ -110,6 +135,7 @@ public class Shard { this.localNodeId = localNodeId; this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; this.range = range; this.participants = participants; this.logIdProvider = logIdProvider; @@ -123,45 +149,54 @@ public class Shard */ void reportAllLogsToCallback() { - 
logs.values().forEach(log -> { - onNewLog.accept(this, log); - }); + logs.values().forEach(log -> onNewLog.accept(this, log)); } - Shard withParticipants(Participants newParticipants) + void forEachLog(BiConsumer callback) { - if (participants.equals(newParticipants)) - return this; - - if (logger.isTraceEnabled()) - logger.trace("Reconfiguring shard {} participants: {} -> {}", - range, participants, newParticipants); + logs.values().forEach(log -> callback.accept(this, log)); + } - NonBlockingHashMapLong newLogs = new NonBlockingHashMapLong<>(); - CoordinatorLog.CoordinatorLogPrimary newCurrentLocalLog = null; + /** + * Incremented before this shard allocates a MutationId. + * Decremented once the mutation has applied or failed locally. + * Used by shard sealing logic for drain() step. + */ + private final AtomicInteger pendingLocalWrites = new AtomicInteger(); - // FIXME: confirm all new logs are added to the relevant views - for (CoordinatorLog log : logs.values()) + @Nonnull + MutationId nextMutationId() + { + pendingLocalWrites.incrementAndGet(); + try { - CoordinatorLog newLog = log.withParticipants(newParticipants); - newLogs.put(newLog.logId.asLong(), newLog); - - if (log == currentLocalLog) - newCurrentLocalLog = (CoordinatorLog.CoordinatorLogPrimary) newLog; + return nextId(); + } + catch (Throwable t) + { + pendingLocalWrites.decrementAndGet(); + throw t; } + } - Shard shard = new Shard(localNodeId, keyspace, range, newParticipants, - newLogs, newCurrentLocalLog, logIdProvider, onNewLog); - newLogs.values().forEach(log -> onNewLog.accept(shard, log)); - return shard; + /* + * TODO (expected): drain() should handle tracked transfers as well (later) + */ + @Nonnull + MutationId nextTransferId() + { + return nextId(); } - MutationId nextId() + @Nonnull + private MutationId nextId() { + if (state != State.ACTIVE) + throw new IllegalStateException(format("%s cannot assign next id, state: %s", this, state)); MutationId nextId = currentLocalLog.nextId(); if 
(nextId == null) nextId = maybeRotateLocalLogAndGetNextId(); - logger.trace("Issuing next MutationId {}", nextId); + logger.trace("Issuing next id {}", nextId); return nextId; } @@ -174,7 +209,15 @@ synchronized private MutationId maybeRotateLocalLogAndGetNextId() CoordinatorLogId oldLogId = currentLocalLog.logId; currentLocalLog = createNewPrimaryLog(); logger.info("Rotated primary log for {}/{} from {} to {}", keyspace, range, oldLogId, currentLocalLog.logId); - return nextId(); + return currentLocalLog.nextId(); + } + + /** + * Must be called exactly once for every {@code nextMutationId()} call that returned an id. + */ + void completeLocalWrite() + { + pendingLocalWrites.decrementAndGet(); } void receivedWriteResponse(ShortMutationId mutationId, InetAddressAndPort fromHost) @@ -201,17 +244,6 @@ void updateReplicatedOffsets(List offsets, boolean durable, I getOrCreate(logOffsets.logId()).updateReplicatedOffsets(logOffsets, durable, onHostId); } - public void recordFullyReconciledOffsets(CoordinatorLogId logId, Offsets.Immutable reconciled) - { - CoordinatorLog log = logs.get(logId.asLong()); - - // Create the coordinator log if it doesn't exist - if (log == null) - log = getOrCreate(logId); - - log.recordFullyReconciledOffsets(reconciled); - } - boolean startWriting(Mutation mutation) { return getOrCreate(mutation).startWriting(mutation); @@ -224,6 +256,12 @@ void finishWriting(Mutation mutation) void addSummaryForKey(Token token, boolean includePending, MutationSummary.Builder builder) { + // TODO (expected): this is a temporary solution, which is racy *during* a topology change (SEALING -> SEALED transition); + // instead, should be gating on epochs after transition, decided by read coordinator; + // some of those additional TCM transitions are currently missing however (pending for unhappy path) + if (isSealed()) + return; + logs.forEach((id, log) -> { MutationSummary.CoordinatorSummary.Builder summaryBuilder = builder.builderForLog(log.logId); log.collectOffsetsFor(token, 
builder.tableId, includePending, summaryBuilder.unreconciled, summaryBuilder.reconciled); @@ -232,6 +270,12 @@ void addSummaryForKey(Token token, boolean includePending, MutationSummary.Build void addSummaryForRange(AbstractBounds range, boolean includePending, MutationSummary.Builder builder) { + // TODO (expected): this is a temporary solution, which is racy *during* a topology change (SEALING -> SEALED transition); + // instead, should be gating on epochs after transition, decided by read coordinator; + // some of those additional TCM transitions are currently missing however (pending for unhappy path) + if (isSealed()) + return; + logs.forEach((id, log) -> { MutationSummary.CoordinatorSummary.Builder summaryBuilder = builder.builderForLog(log.logId); log.collectOffsetsFor(range, builder.tableId, includePending, summaryBuilder.unreconciled, summaryBuilder.reconciled); @@ -296,7 +340,7 @@ BroadcastLogOffsets collectReplicatedOffsets(boolean durable) offsets.add(logOffsets); } - return new BroadcastLogOffsets(keyspace, range, offsets, durable); + return new BroadcastLogOffsets(keyspace, sinceEpoch, range, participants, offsets, durable); } /** @@ -363,7 +407,7 @@ private CoordinatorLog getOrCreate(long logId) */ private CoordinatorLog createNewLog(long logId) { - CoordinatorLog next = CoordinatorLog.create(keyspace, range, localNodeId, new CoordinatorLogId(logId), participants); + CoordinatorLog next = CoordinatorLog.create(keyspace, sinceEpoch, range, localNodeId, new CoordinatorLogId(logId), participants); CoordinatorLog prev = logs.putIfAbsent(logId, next); if (null == prev) onNewLog.accept(this, next); return null != prev ? 
prev : next; @@ -379,12 +423,12 @@ private CoordinatorLogPrimary createNewPrimaryLog() */ private static final String INSERT_QUERY = - format("INSERT INTO %s.%s (keyspace_name, range_start, range_end, participants) VALUES (?, ?, ?, ?)", + format("INSERT INTO %s.%s (keyspace_name, since_epoch, range_start, range_end, participants, state) VALUES (?, ?, ?, ?, ?, ?)", SchemaConstants.SYSTEM_KEYSPACE_NAME, SystemKeyspace.SHARDS); void persistToSystemTables() { - executeInternal(INSERT_QUERY, keyspace, range.left.toString(), range.right.toString(), participants.asSet()); + executeInternal(INSERT_QUERY, keyspace, sinceEpoch, range.left.toString(), range.right.toString(), participants.asSet(), state.name()); for (CoordinatorLog log : logs.values()) log.persistToSystemTable(); } @@ -397,32 +441,101 @@ void updateLogsInSystemTable() private static final String SELECT_QUERY = format("SELECT * FROM %s.%s", SchemaConstants.SYSTEM_KEYSPACE_NAME, SystemKeyspace.SHARDS); - static ArrayList loadFromSystemTables(int localNodeId, LongSupplier logIdProvider, BiConsumer onNewLog) { Token.TokenFactory factory = ClusterMetadata.current().partitioner.getTokenFactory(); ArrayList shards = new ArrayList<>(); + //noinspection DataFlowIssue for (UntypedResultSet.Row row : executeInternal(SELECT_QUERY)) { String keyspace = row.getString("keyspace_name"); + long sinceEpoch = row.getLong("since_epoch"); String rangeStart = row.getString("range_start"); String rangeEnd = row.getString("range_end"); Range range = new Range<>(factory.fromString(rangeStart), factory.fromString(rangeEnd)); Set participants = row.getFrozenSet("participants", Int32Type.instance); - List logs = CoordinatorLog.loadFromSystemTable(keyspace, range, localNodeId); - shards.add(new Shard(localNodeId, keyspace, range, new Participants(participants), logs, logIdProvider, onNewLog)); + State state = row.has("state") ? 
State.valueOf(row.getString("state")) : State.ACTIVE; + List logs = CoordinatorLog.loadFromSystemTable(keyspace, sinceEpoch, range, localNodeId); + shards.add(new Shard(localNodeId, keyspace, sinceEpoch, range, new Participants(participants), state, logs, logIdProvider, onNewLog)); } return shards; } - public Range tokenRange() + /* + * Sealing + */ + + void markSealing() { - return range; + if (state == State.SEALED) + throw new IllegalStateException(format("%s cannot transition to SEALING from %s", this, state)); + + if (state != State.SEALING) + { + state = State.SEALING; + persistToSystemTables(); + } + } + + boolean isDrained() + { + return state != State.ACTIVE && pendingLocalWrites.get() == 0; } - void collectShardReconciledOffsetsToBuilder(ReconciledKeyspaceOffsets.Builder keyspaceBuilder) + /** + * @return locally-applied offsets for each log of the shard + */ + Log2OffsetsMap.Immutable collectLocallyWitnessedOffsets() { - logs.values().forEach(log -> keyspaceBuilder.put(log.logId, log.collectReconciledOffsets(), range)); + Log2OffsetsMap.Immutable.Builder builder = new Log2OffsetsMap.Immutable.Builder(); + for (CoordinatorLog log : logs.values()) + { + Offsets.Immutable witnessed = log.collectReplicatedOffsets(false); + if (witnessed != null) + builder.add(witnessed); + } + return builder.build(); + } + + /** + * @return whether the shard has locally applied every offset in {@code offsets}. + * TODO (expected): just to be certain, check that there are no local logs not present in passed offsets. 
+ */ + boolean hasWitnessed(Log2OffsetsMap offsets) + { + for (Offsets target : offsets.offsets()) + { + CoordinatorLog log = logs.get(target.logId().asLong()); + if (log == null) + return false; + Offsets.Immutable local = log.collectReplicatedOffsets(false); + // the local offsets must *completely* match target; NOTE(review): collectReplicatedOffsets(false) is null-checked in collectLocallyWitnessedOffsets() but not here - confirm contentsEqual() tolerates a null argument + if (!Offsets.contentsEqual(target, local)) + return false; + } + return true; + } + + void markSealed() + { + if (state == State.ACTIVE) + throw new IllegalStateException(format("%s cannot transition to SEALED from %s", this, state)); + + if (state != State.SEALED) + { + state = State.SEALED; + persistToSystemTables(); + } + } + + public boolean isSealed() + { + return state == State.SEALED; + } + + public Range tokenRange() + { + return range; } /** @@ -498,11 +611,11 @@ public Map collectUnionOfWitnessedOffsetsPe @Override public String toString() { - return "Shard{" + - "participants=" + participants + + return "Shard{keyspace='" + keyspace + '\'' + + ", sinceEpoch=" + sinceEpoch + ", range=" + range + - ", keyspace='" + keyspace + '\'' + - ", localNodeId=" + localNodeId + + ", participants=" + participants + + ", state=" + state + '}'; } @@ -510,27 +623,36 @@ public DebugInfo getDebugInfo() { SortedMap logDebugState = new TreeMap<>(Comparator.comparing(CoordinatorLogId::asLong)); for (CoordinatorLog log : logs.values()) - { logDebugState.put(log.getLogId(), log.getDebugState()); - } - return new DebugInfo(keyspace, range, localNodeId, participants, logDebugState); + return new DebugInfo(keyspace, sinceEpoch, range, localNodeId, participants, logDebugState); } public static class DebugInfo { public final String keyspace; + public final long sinceEpoch; public final Range range; public final int localNodeId; public final Participants participants; public final SortedMap logs; - private DebugInfo(String keyspace, Range range, int localNodeId, Participants participants, SortedMap logs) + private DebugInfo( + String keyspace, long sinceEpoch, Range range, 
int localNodeId, + Participants participants, SortedMap logs) { this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; this.range = range; this.localNodeId = localNodeId; this.participants = participants; this.logs = logs; } } + + static final Comparator COMPARATOR = (a, b) -> { + int c = a.range.left.compareTo(b.range.left); + if (c == 0) c = Range.compareRightToken(a.range.right, b.range.right); + if (c == 0) c = Long.compare(a.sinceEpoch, b.sinceEpoch); + return c; + }; } diff --git a/src/java/org/apache/cassandra/replication/ShardIntervalBTree.java b/src/java/org/apache/cassandra/replication/ShardIntervalBTree.java new file mode 100644 index 000000000000..54a2f69fe8db --- /dev/null +++ b/src/java/org/apache/cassandra/replication/ShardIntervalBTree.java @@ -0,0 +1,337 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.replication; + +import java.util.Collection; +import java.util.Comparator; +import java.util.function.BiConsumer; +import java.util.function.BiFunction; +import java.util.function.Consumer; + +import javax.annotation.Nullable; + +import accord.utils.AsymmetricComparator; +import accord.utils.btree.BTree; +import accord.utils.btree.IntervalBTree; + +import org.apache.cassandra.db.PartitionPosition; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; + +import static accord.utils.btree.IntervalBTree.InclusiveEndHelper.endWithStart; +import static accord.utils.btree.IntervalBTree.InclusiveEndHelper.keyEndWithStart; +import static accord.utils.btree.IntervalBTree.InclusiveEndHelper.keyStartWithEnd; +import static accord.utils.btree.IntervalBTree.InclusiveEndHelper.keyStartWithStart; +import static accord.utils.btree.IntervalBTree.InclusiveEndHelper.startWithEnd; +import static accord.utils.btree.IntervalBTree.InclusiveEndHelper.startWithStart; + +/** + * Immutable interval tree of {@link Shard}s, backed by an augmented + * interval BTree (see {@link IntervalBTree}). + *

+ * Shard ranges can overlap, so multiple shards may cover the same token + * (at different {@link Shard#sinceEpoch epochs}). + */ +final class ShardIntervalBTree +{ + private final Object[] tree; + + ShardIntervalBTree() + { + this(IntervalBTree.empty()); + } + + private ShardIntervalBTree(Object[] tree) + { + this.tree = tree; + } + + /* + * TODO: manipulation method list: + * + * + */ + + /** + * Return a copy of this map with the provided shard added. + * @throws IllegalStateException if it already exists in the map. + */ + ShardIntervalBTree with(Shard shard) + { + if (shard.range.isTrulyWrapAround()) + throw new IllegalArgumentException("Shard's range truly wraps around: " + shard); + + if (BTree.find(tree, Shard.COMPARATOR, shard) != null) + throw new IllegalStateException("Shard is already present: " + shard); + + return new ShardIntervalBTree(IntervalBTree.update(tree, IntervalBTree.singleton(shard), BuildComparators.INSTANCE)); + } + + /** + * Return a copy of this map without the provided shard. + * @throws IllegalStateException if it is not present in the map. + */ + ShardIntervalBTree without(Shard shard) + { + if (shard.range.isTrulyWrapAround()) + throw new IllegalArgumentException("Shard's range truly wraps around: " + shard); + + if (BTree.find(tree, Shard.COMPARATOR, shard) == null) + throw new IllegalStateException("Shard is not present: " + shard); + + return new ShardIntervalBTree(IntervalBTree.subtract(tree, IntervalBTree.singleton(shard), BuildComparators.INSTANCE)); + } + + /** + * Builds {@code ShardIntervalBTree} from an ordered collection of Shards. + * + * @param orderedShards An ordered collection of shards (ordered by {@code Shard.COMPARATOR}). 
+ */ + static ShardIntervalBTree fromSorted(Collection orderedShards) + { + return new ShardIntervalBTree(IntervalBTree.build(orderedShards, BuildComparators.INSTANCE)); + } + + /* + * TODO: lookup method list: + * x find the matching active shard by token (for mutation id generation) + * x forEach() over shards that contain the token (for mutation summaries) + * x forEach() over shards that overlap with the provided token range + * - get (by range and sinceEpoch, exact) + * - forEach() over shards that match the partition position bounds + */ + + /** + * Return the latest {@link Shard} (with the highest {@link Shard#sinceEpoch}) + * responsible for the provided token, or {@code null} if no shard covers it. + */ + @Nullable + Shard latestShardCovering(Token token) + { + return IntervalBTree.accumulate( + tree, PointQueryComparators.INSTANCE, token, + (ignore1, ignore2, shard, acc) -> (acc == null || shard.sinceEpoch > acc.sinceEpoch) ? shard : acc, + null, null, null); + } + + /** + * @return the Shard matching the provided {@code range} and {@code sinceEpoch} exactly (or null if none do). + */ + @Nullable + Shard get(Range range, long sinceEpoch) + { + return IntervalBTree.accumulate( + tree, RangeQueryComparators.INSTANCE, range, + (epoch, p2, shard, acc) -> shard.range.equals(range) && shard.sinceEpoch == epoch ? shard : acc, + sinceEpoch, null, null); + } + + /** + * Apply {@code consumer} to every shard whose range covers {@code token}. + */ + void forEachCovering(Token token, Consumer consumer) + { + IntervalBTree.accumulate( + tree, PointQueryComparators.INSTANCE, token, + (sink, ignore, shard, acc) -> { sink.accept(shard); return null; }, + consumer, null, null); + } + + /** + * Apply {@code consumer} to every shard that intersects with {@code range}. 
+ */ + void forEachIntersecting(Range range, Consumer consumer) + { + // TODO (expected): validate if the range cannot be truly- or regular wrap-around + if (range.isTrulyWrapAround()) + throw new IllegalArgumentException("Query range truly wraps around: " + range); + + IntervalBTree.accumulate( + tree, RangeQueryComparators.INSTANCE, range, + (sink, ignore, shard, acc) -> { sink.accept(shard); return null; }, + consumer, null, null); + } + + /** + * Fold {@code folder} over every shard that intersects {@code range}. + */ + A foldIntersecting(Range range, BiFunction folder, A accumulator) + { + if (range.isTrulyWrapAround()) + throw new IllegalArgumentException("Query range truly wraps around: " + range); + + return IntervalBTree.accumulate( + tree, RangeQueryComparators.INSTANCE, range, + (BiFunction f, Object ignore, Shard shard, A acc) -> f.apply(shard, acc), + folder, null, accumulator); + } + + void forEachIntersecting(Collection> ranges, Consumer consumer) + { + for (Shard shard : BTree.iterable(tree)) + if (shard.range.intersects(ranges)) + consumer.accept(shard); + } + + void forEachIntersecting(AbstractBounds bounds, Consumer consumer) + { + // TODO (expected): partial workaround - is there a better way to do this? + // SELECT * statements create Bounds[min,min], (PartitionKeyRestrictions.java:L174) not Range(min,min], + // which Ranges generally won't intersect with (Range.java:L148), so contains is used here to make it work + for (Shard shard : BTree.iterable(tree)) + { + Range rowRange = Range.makeRowRange(shard.range); + if (bounds.contains(rowRange.right) || rowRange.intersects(bounds)) + consumer.accept(shard); + } + } + + /** + * Invoke {@code consumer} for every Shard in the tree (exactly once for each shard). + */ + void forEach(Consumer consumer) + { + for (Shard shard : BTree.iterable(tree)) + consumer.accept(shard); + } + + /** + * Invoke {@code consumer} for every Shard in the tree (exactly once for each shard). 
+ * Accepts one pass-through argument so that callers can avoid allocating capturing lambdas. + */ +

void forEach(BiConsumer consumer, P param) + { + for (Shard shard : BTree.iterable(tree)) + consumer.accept(shard, param); + } + + boolean isEmpty() + { + return BTree.isEmpty(tree); + } + + /* + * Comparators for shard-vs-shard overlap (used by build/update/subtract). + */ + private static final class BuildComparators implements IntervalBTree.IntervalComparators + { + private static final BuildComparators INSTANCE = new BuildComparators(); + + @Override + public Comparator totalOrder() + { + return Shard.COMPARATOR; + } + + @Override + public Comparator endWithEndSorter() + { + return (a, b) -> Range.compareRightToken(a.range.right, b.range.right); + } + + @Override + public AsymmetricComparator startWithStartSeeker() + { + // required by IntervalComparators; never invoked + return (a, b) -> startWithStart(a.range.left.compareTo(b.range.left)); + } + + @Override + public AsymmetricComparator startWithEndSeeker() + { + // required by IntervalComparators; never invoked + return (a, b) -> startWithEnd(compareTokenToEnd(a.range.left, b.range.right)); + } + + @Override + public AsymmetricComparator endWithStartSeeker() + { + // required by IntervalComparators; never invoked + return (a, b) -> endWithStart(compareEndToToken(a.range.right, b.range.left)); + } + } + + /* + * Comparators for point queries. 
+ */ + private static final class PointQueryComparators implements IntervalBTree.WithIntervalComparators + { + private static final PointQueryComparators INSTANCE = new PointQueryComparators(); + + @Override + public AsymmetricComparator startWithStartSeeker() + { + return (token, shard) -> keyStartWithStart(token.compareTo(shard.range.left)); + } + + @Override + public AsymmetricComparator startWithEndSeeker() + { + return (token, shard) -> keyStartWithEnd(compareTokenToEnd(token, shard.range.right)); + } + + @Override + public AsymmetricComparator endWithStartSeeker() + { + return (token, shard) -> keyEndWithStart(token.compareTo(shard.range.left)); + } + } + + /* + * Comparators for range queries. + */ + private static final class RangeQueryComparators implements IntervalBTree.WithIntervalComparators, Shard> + { + private static final RangeQueryComparators INSTANCE = new RangeQueryComparators(); + + @Override + public AsymmetricComparator, Shard> startWithStartSeeker() + { + // two exclusive starts; + // tie means range starts 'after' for sort-positioning + return (range, shard) -> startWithStart(range.left.compareTo(shard.range.left)); + } + + @Override + public AsymmetricComparator, Shard> startWithEndSeeker() + { + // range.left exclusive, shard.right inclusive (min = +∞): + // tie (range.left == shard.right) means query starts AFTER shard ends, no overlap + return (range, shard) -> startWithEnd(compareTokenToEnd(range.left, shard.range.right)); + } + + @Override + public AsymmetricComparator, Shard> endWithStartSeeker() + { + // range.right inclusive (min = +∞), shard.left exclusive: + // tie (range.right == shard.left) means query ends BEFORE shard starts, no overlap + return (range, shard) -> endWithStart(compareEndToToken(range.right, shard.range.left)); + } + } + + private static int compareTokenToEnd(Token t, Token end) + { + return end.isMinimum() ? 
-1 : t.compareTo(end); + } + + private static int compareEndToToken(Token end, Token t) + { + return end.isMinimum() ? 1 : end.compareTo(t); + } +} diff --git a/src/java/org/apache/cassandra/replication/ShardMetadata.java b/src/java/org/apache/cassandra/replication/ShardMetadata.java new file mode 100644 index 000000000000..35218495b4c3 --- /dev/null +++ b/src/java/org/apache/cassandra/replication/ShardMetadata.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.replication; + +import java.io.IOException; +import java.util.Objects; + +import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.dht.AbstractBounds; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.NullableSerializer; + +public final class ShardMetadata +{ + final String keyspace; + final long sinceEpoch; + final Range range; + final Participants participants; + + ShardMetadata(String keyspace, long sinceEpoch, Range range, Participants participants) + { + this.keyspace = keyspace; + this.sinceEpoch = sinceEpoch; + this.range = range; + this.participants = participants; + } + + @Override + public String toString() + { + return '{' + keyspace + ", " + range + ", sinceEpoch=" + sinceEpoch + ", participants=" + participants + '}'; + + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (!(o instanceof ShardMetadata)) return false; + ShardMetadata that = (ShardMetadata) o; + return sinceEpoch == that.sinceEpoch + && keyspace.equals(that.keyspace) + && range.equals(that.range) + && participants.equals(that.participants); + } + + @Override + public int hashCode() + { + return Objects.hash(keyspace, sinceEpoch, range, participants); + } + + public static final VersionedSerializer serializer = new VersionedSerializer<>() + { + @Override + public void serialize(ShardMetadata response, DataOutputPlus out, Version version) throws IOException + { + out.writeUTF(response.keyspace); + out.writeLong(response.sinceEpoch); + AbstractBounds.tokenSerializer.serialize(response.range, out, version.messagingVersion()); + Participants.serializer.serialize(response.participants, out); + } + + @Override + public ShardMetadata deserialize(DataInputPlus in, Version version) throws 
IOException + { + String keyspace = in.readUTF(); + long sinceEpoch = in.readLong(); + Range range = (Range) AbstractBounds.tokenSerializer.deserialize(in, IPartitioner.global(), version.messagingVersion()); + Participants participants = Participants.serializer.deserialize(in); + return new ShardMetadata(keyspace, sinceEpoch, range, participants); + } + + @Override + public long serializedSize(ShardMetadata response, Version version) + { + long size = 0; + size += TypeSizes.sizeof(response.keyspace); + size += TypeSizes.sizeof(response.sinceEpoch); + size += AbstractBounds.tokenSerializer.serializedSize(response.range, version.messagingVersion()); + size += Participants.serializer.serializedSize(response.participants); + return size; + } + }; + + public static final org.apache.cassandra.io.VersionedSerializer nullableSerializer = + NullableSerializer.wrap(serializer); +} diff --git a/src/java/org/apache/cassandra/replication/ShardMetadataRequest.java b/src/java/org/apache/cassandra/replication/ShardMetadataRequest.java new file mode 100644 index 000000000000..9beafec9e46c --- /dev/null +++ b/src/java/org/apache/cassandra/replication/ShardMetadataRequest.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.replication; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.exceptions.RequestFailure; +import org.apache.cassandra.gms.FailureDetector; +import org.apache.cassandra.io.UnversionedSerializer; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.IVerbHandler; +import org.apache.cassandra.net.Message; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.utils.concurrent.AsyncPromise; + +/** + * Request sent to peers to resolve the shard metadata (epoch, range, participants) + * for an unknown {@link CoordinatorLogId}, to resolve the ambiguity of what shard + * the coordinator log should be placed under. + * Sent to all peers in parallel; the first non-null response is sufficient. + */ +public final class ShardMetadataRequest +{ + private static final Logger logger = LoggerFactory.getLogger(ShardMetadataRequest.class); + + final CoordinatorLogId logId; + + public ShardMetadataRequest(CoordinatorLogId logId) + { + this.logId = logId; + } + + /** + * Query all provided peers in parallel for the shard metadata of a coordinator log. + * Waits for the first peer to reply with a known result. 
+ */ + public static AsyncPromise queryPeers(CoordinatorLogId logId, Set peers) + { + if (peers.isEmpty()) + throw new IllegalArgumentException("Empty peers set to query"); + + Set livePeers = new HashSet<>(peers); + for (InetAddressAndPort peer : peers) + if (!FailureDetector.instance.isAlive(peer)) + livePeers.remove(peer); + + if (livePeers.isEmpty()) + throw new RuntimeException("No peers known or alive to retrieve shard metadata from"); + + AsyncPromise promise = new AsyncPromise<>(); + RequestCallback callback = new RequestCallback<>() + { + private final AtomicInteger remaining = new AtomicInteger(livePeers.size()); + + @Override + public void onResponse(Message msg) + { + if (msg.payload.metadata != null) + promise.trySuccess(msg.payload.metadata); + else if (remaining.decrementAndGet() == 0) + promise.trySuccess(null); + } + + @Override + public void onFailure(InetAddressAndPort from, RequestFailure failure) + { + if (remaining.decrementAndGet() == 0) + promise.trySuccess(null); + } + + @Override + public boolean invokeOnFailure() + { + return true; + } + }; + + Message message = Message.out(Verb.MT_SHARD_METADATA_REQ, new ShardMetadataRequest(logId)); + for (InetAddressAndPort peer : livePeers) + MessagingService.instance().sendWithCallback(message, peer, callback); + + return promise; + } + + public static final IVerbHandler verbHandler = message -> + { + MutationTrackingService.ensureEnabled(); + + ShardMetadataRequest request = message.payload; + logger.trace("Received shard metadata request from {} for log {}", message.from(), request.logId); + + Shard shard = MutationTrackingService.instance().getShardNullable(request.logId); + ShardMetadata metadata = shard != null + ? 
new ShardMetadata(shard.keyspace, shard.sinceEpoch, shard.range, shard.participants) + : null; + Message response = message.responseWith(new ShardMetadataResponse(metadata)); + MessagingService.instance().send(response, message.from()); + }; + + public static final UnversionedSerializer serializer = new UnversionedSerializer<>() + { + @Override + public void serialize(ShardMetadataRequest request, DataOutputPlus out) throws IOException + { + CoordinatorLogId.serializer.serialize(request.logId, out); + } + + @Override + public ShardMetadataRequest deserialize(DataInputPlus in) throws IOException + { + return new ShardMetadataRequest(CoordinatorLogId.serializer.deserialize(in)); + } + + @Override + public long serializedSize(ShardMetadataRequest request) + { + return CoordinatorLogId.serializer.serializedSize(request.logId); + } + }; +} diff --git a/src/java/org/apache/cassandra/replication/ShardMetadataResponse.java b/src/java/org/apache/cassandra/replication/ShardMetadataResponse.java new file mode 100644 index 000000000000..871b01ff001b --- /dev/null +++ b/src/java/org/apache/cassandra/replication/ShardMetadataResponse.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.replication; + +import java.io.IOException; + +import javax.annotation.Nullable; + +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +/** + * Response to a {@link ShardMetadataRequest}, carrying the shard metadata + * (epoch, range, write participants) for a coordinator log, + * or null if the log is unknown to the responder. + */ +public final class ShardMetadataResponse +{ + @Nullable + public final ShardMetadata metadata; + + ShardMetadataResponse(ShardMetadata metadata) + { + this.metadata = metadata; + } + + public static final VersionedSerializer serializer = new VersionedSerializer<>() + { + @Override + public void serialize(ShardMetadataResponse response, DataOutputPlus out, Version version) throws IOException + { + ShardMetadata.nullableSerializer.serialize(response.metadata, out, version); + } + + @Override + public ShardMetadataResponse deserialize(DataInputPlus in, Version version) throws IOException + { + return new ShardMetadataResponse(ShardMetadata.nullableSerializer.deserialize(in, version)); + } + + @Override + public long serializedSize(ShardMetadataResponse response, Version version) + { + return ShardMetadata.nullableSerializer.serializedSize(response.metadata, version); + } + }; +} diff --git a/src/java/org/apache/cassandra/replication/ShortMutationId.java b/src/java/org/apache/cassandra/replication/ShortMutationId.java index 856790a8a1df..aa2431c4ec6c 100644 --- a/src/java/org/apache/cassandra/replication/ShortMutationId.java +++ b/src/java/org/apache/cassandra/replication/ShortMutationId.java @@ -74,11 +74,6 @@ private ShortMutationId(int hostId, int hostLogId, int offset) this.offset = offset; } - public ShortMutationId(MutationId mutationId) - { - this(mutationId.hostLogId(), mutationId.hostId(), mutationId.offset()); - } - public int hostId() { return hostId; diff --git a/src/java/org/apache/cassandra/replication/TrackedImportTransfer.java 
b/src/java/org/apache/cassandra/replication/TrackedImportTransfer.java index 6710317317f0..653c35f31cf8 100644 --- a/src/java/org/apache/cassandra/replication/TrackedImportTransfer.java +++ b/src/java/org/apache/cassandra/replication/TrackedImportTransfer.java @@ -92,14 +92,14 @@ public class TrackedImportTransfer extends CoordinatedTransfer @VisibleForTesting TrackedImportTransfer(Range range, MutationId id) { - super(id, null, range); + super(id, null, 0L, range); this.sstables = Collections.emptyList(); this.cl = null; } - TrackedImportTransfer(String keyspace, Range range, Participants participants, Collection sstables, ConsistencyLevel cl, Supplier nextId) + TrackedImportTransfer(String keyspace, long sinceEpoch, Range range, Participants participants, Collection sstables, ConsistencyLevel cl, Supplier nextId) { - super(nextId.get(), participants, keyspace, range); + super(nextId.get(), participants, keyspace, sinceEpoch, range); this.sstables = sstables; this.cl = cl; @@ -348,7 +348,7 @@ public void onFailure(InetAddressAndPort from, RequestFailure failure) @Override protected ActivationRequest createActivation(Pair pair, Phase phase) { - return new ActivationRequest(StreamOperation.IMPORT, pair, phase, id(), ClusterMetadata.current().myNodeId(), range, keyspace, streamResults.get(pair).planId()); + return new ActivationRequest(StreamOperation.IMPORT, pair, phase, id(), ClusterMetadata.current().myNodeId(), range, sinceEpoch, keyspace, streamResults.get(pair).planId()); } private SingleTransferResult streamTask(InetAddressAndPort to) throws StreamException, ExecutionException, InterruptedException, TimeoutException diff --git a/src/java/org/apache/cassandra/replication/TrackedImportTransfers.java b/src/java/org/apache/cassandra/replication/TrackedImportTransfers.java index 250c240e0ae5..7d9d0b50ef15 100644 --- a/src/java/org/apache/cassandra/replication/TrackedImportTransfers.java +++ b/src/java/org/apache/cassandra/replication/TrackedImportTransfers.java @@ 
-70,7 +70,7 @@ static TrackedImportTransfers create(String keyspace, MutationTrackingService.Ke if (sstablesForRange.isEmpty()) return; - TrackedImportTransfer transfer = new TrackedImportTransfer(keyspace, range, shard.participants, sstablesForRange, cl, shard::nextId); + TrackedImportTransfer transfer = new TrackedImportTransfer(keyspace, shard.sinceEpoch, range, shard.participants, sstablesForRange, cl, shard::nextTransferId); transfers.add(transfer); }); return new TrackedImportTransfers(transfers); diff --git a/src/java/org/apache/cassandra/replication/TrackedRepairTransfer.java b/src/java/org/apache/cassandra/replication/TrackedRepairTransfer.java index 0d0906edc05b..d48f3711d0b6 100644 --- a/src/java/org/apache/cassandra/replication/TrackedRepairTransfer.java +++ b/src/java/org/apache/cassandra/replication/TrackedRepairTransfer.java @@ -55,12 +55,12 @@ public class TrackedRepairTransfer extends CoordinatedTransfer public TrackedRepairTransfer(SyncTasks.ShardedSyncTask shardedTask) { - this(shardedTask.task.getTransferId(), shardedTask.participants, shardedTask.task, shardedTask.keyspace, shardedTask.range); + this(shardedTask.task.getTransferId(), shardedTask.participants, shardedTask.task, shardedTask.keyspace, shardedTask.sinceEpoch, shardedTask.range); } - private TrackedRepairTransfer(ShortMutationId id, Participants participants, SyncTask task, String keyspace, Range range) + private TrackedRepairTransfer(ShortMutationId id, Participants participants, SyncTask task, String keyspace, long sinceEpoch, Range range) { - super(id, keyspace, range); + super(id, keyspace, sinceEpoch, range); Set replicaNodeIds = participants.asSet(); Set participating = new HashSet<>(); @@ -113,7 +113,7 @@ protected void prepare(Collection> @Override protected ActivationRequest createActivation(Pair pair, ActivationRequest.Phase phase) { - return new ActivationRequest(StreamOperation.REPAIR, pair, phase, id(), ClusterMetadata.current().myNodeId(), range, keyspace, 
streamResults.get(pair).planId()); + return new ActivationRequest(StreamOperation.REPAIR, pair, phase, id(), ClusterMetadata.current().myNodeId(), range, sinceEpoch, keyspace, streamResults.get(pair).planId()); } @Override diff --git a/src/java/org/apache/cassandra/replication/TrackedWriteRequest.java b/src/java/org/apache/cassandra/replication/TrackedWriteRequest.java index 9a5ab64157fc..c8b013bd0851 100644 --- a/src/java/org/apache/cassandra/replication/TrackedWriteRequest.java +++ b/src/java/org/apache/cassandra/replication/TrackedWriteRequest.java @@ -206,6 +206,7 @@ public static AbstractWriteResponseHandler perform( writeMetrics.localRequests.mark(); MutationId id = MutationTrackingService.instance().nextMutationId(keyspaceName, token); + mutation = mutation.withMutationId(id); if (logger.isTraceEnabled()) @@ -445,18 +446,18 @@ private static class LocalMutationRunnable implements DebuggableTask.RunnableDeb @Override public final void run() { - long now = MonotonicClock.Global.approxTime.now(); - long deadline = getRequestTime(handler).computeDeadline(MUTATION_REQ.expiresAfterNanos()); - - if (now > deadline) - { - long timeTakenNanos = now - startTimeNanos(); - MessagingService.instance().metrics.recordSelfDroppedMessage(Verb.MUTATION_REQ, timeTakenNanos, NANOSECONDS); - return; - } - try { + long now = MonotonicClock.Global.approxTime.now(); + long deadline = getRequestTime(handler).computeDeadline(MUTATION_REQ.expiresAfterNanos()); + + if (now > deadline) + { + long timeTakenNanos = now - startTimeNanos(); + MessagingService.instance().metrics.recordSelfDroppedMessage(Verb.MUTATION_REQ, timeTakenNanos, NANOSECONDS); + return; + } + mutation.apply(); handler.onResponse(null); } @@ -466,6 +467,10 @@ public final void run() logger.error("Failed to apply mutation locally : ", ex); handler.onFailure(FBUtilities.getBroadcastAddressAndPort(), RequestFailure.forException(ex)); } + finally + { + MutationTrackingService.instance().completeLocalWrite(mutation.id()); 
+ } } @Override @@ -510,18 +515,18 @@ private Dispatcher.RequestTime getReqestTime() @Override public void run() { - long now = MonotonicClock.Global.approxTime.now(); - long deadline = getReqestTime().computeDeadline(COUNTER_MUTATION_REQ.expiresAfterNanos()); - - if (now > deadline) - { - long timeTakenNanos = now - startTimeNanos(); - MessagingService.instance().metrics.recordSelfDroppedMessage(COUNTER_MUTATION_REQ, timeTakenNanos, NANOSECONDS); - return; - } - try { + long now = MonotonicClock.Global.approxTime.now(); + long deadline = getReqestTime().computeDeadline(COUNTER_MUTATION_REQ.expiresAfterNanos()); + + if (now > deadline) + { + long timeTakenNanos = now - startTimeNanos(); + MessagingService.instance().metrics.recordSelfDroppedMessage(COUNTER_MUTATION_REQ, timeTakenNanos, NANOSECONDS); + return; + } + Mutation result = counterMutation.applyCounterMutation(counterMutation.id()); handler.onResponse(null); sendToReplicas(result, plan, handler, null); @@ -532,6 +537,10 @@ public void run() logger.error("Failed to apply counter mutation locally: ", ex); handler.onFailure(FBUtilities.getBroadcastAddressAndPort(), RequestFailure.forException(ex)); } + finally + { + MutationTrackingService.instance().completeLocalWrite(counterMutation.id()); + } } @Override diff --git a/src/java/org/apache/cassandra/replication/TransferFailed.java b/src/java/org/apache/cassandra/replication/TransferFailedRequest.java similarity index 74% rename from src/java/org/apache/cassandra/replication/TransferFailed.java rename to src/java/org/apache/cassandra/replication/TransferFailedRequest.java index 397bdadc80b9..f0a1f1721e35 100644 --- a/src/java/org/apache/cassandra/replication/TransferFailed.java +++ b/src/java/org/apache/cassandra/replication/TransferFailedRequest.java @@ -30,31 +30,31 @@ * @see TrackedImportTransfer * @see PendingLocalTransfer */ -public class TransferFailed +public class TransferFailedRequest { final TimeUUID planId; - public TransferFailed(TimeUUID planId) + 
public TransferFailedRequest(TimeUUID planId) { this.planId = planId; } - public static final VersionedSerializer serializer = new VersionedSerializer<>() + public static final VersionedSerializer serializer = new VersionedSerializer<>() { @Override - public void serialize(TransferFailed t, DataOutputPlus out, Version version) throws IOException + public void serialize(TransferFailedRequest t, DataOutputPlus out, Version version) throws IOException { TimeUUID.Serializer.instance.serialize(t.planId, out, version.messagingVersion()); } @Override - public TransferFailed deserialize(DataInputPlus in, Version version) throws IOException + public TransferFailedRequest deserialize(DataInputPlus in, Version version) throws IOException { - return new TransferFailed(TimeUUID.Serializer.instance.deserialize(in, version.messagingVersion())); + return new TransferFailedRequest(TimeUUID.Serializer.instance.deserialize(in, version.messagingVersion())); } @Override - public long serializedSize(TransferFailed t, Version version) + public long serializedSize(TransferFailedRequest t, Version version) { return TimeUUID.Serializer.instance.serializedSize(t.planId, version.messagingVersion()); } diff --git a/src/java/org/apache/cassandra/replication/TransferFailedResponse.java b/src/java/org/apache/cassandra/replication/TransferFailedResponse.java new file mode 100644 index 000000000000..093aaad1ddd1 --- /dev/null +++ b/src/java/org/apache/cassandra/replication/TransferFailedResponse.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.replication; + +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; + +/** + * {@code NoPayload} cannot be used here because it is special-cased in + * {@code Message.Serializer.payloadSize}, which would clash with the embedded + * version byte written by the {@code mtEmbedded} wrapper. + */ +public class TransferFailedResponse +{ + public static final TransferFailedResponse instance = new TransferFailedResponse(); + + private TransferFailedResponse() {} + + public static final VersionedSerializer serializer = new VersionedSerializer<>() + { + @Override + public void serialize(TransferFailedResponse t, DataOutputPlus out, Version version) + { + } + + @Override + public TransferFailedResponse deserialize(DataInputPlus in, Version version) + { + return instance; + } + + @Override + public long serializedSize(TransferFailedResponse t, Version version) + { + return 0; + } + }; + + @Override + public String toString() + { + return "TransferFailedResponse{}"; + } +} diff --git a/src/java/org/apache/cassandra/replication/TransferTrackingService.java b/src/java/org/apache/cassandra/replication/TransferTrackingService.java index 5729a047f2a6..d78b2e9bc327 100644 --- a/src/java/org/apache/cassandra/replication/TransferTrackingService.java +++ b/src/java/org/apache/cassandra/replication/TransferTrackingService.java @@ -42,7 +42,6 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.net.IVerbHandler; import 
org.apache.cassandra.net.MessagingService; -import org.apache.cassandra.net.NoPayload; import org.apache.cassandra.repair.RepairJob; import org.apache.cassandra.repair.SyncStat; import org.apache.cassandra.repair.SyncTask; @@ -122,7 +121,7 @@ void received(PendingLocalTransfer transfer) /** * Track a repair as a set of {@link TrackedRepairTransfer} instances corresponding to sync tasks prior to task * execution so when the syncs are done, we can activate them via {@link ActivationRequest} or fail by - * sending {@link TransferFailed} to all replicas. In other words, one {@link RepairJob} will have as many + * sending {@link TransferFailedRequest} to all replicas. In other words, one {@link RepairJob} will have as many * transfers as sync tasks. */ public void onRepairSyncExecution(SyncTasks tasks) @@ -307,7 +306,7 @@ private void cleanup() } } - private void purge(TransferFailed failed) + private void purge(TransferFailedRequest failed) { lock.writeLock().lock(); try @@ -424,8 +423,8 @@ CoordinatedTransfer getActivatedTransfer(ShortMutationId transferId) } } - public static IVerbHandler verbHandler = message -> { + public static IVerbHandler verbHandler = message -> { TransferTrackingService.instance().purge(message.payload); - MessagingService.instance().respond(NoPayload.noPayload, message); + MessagingService.instance().respond(TransferFailedResponse.instance, message); }; } diff --git a/src/java/org/apache/cassandra/replication/UnknownShardException.java b/src/java/org/apache/cassandra/replication/UnknownShardException.java index 8aedd2d20f89..f072786fec05 100644 --- a/src/java/org/apache/cassandra/replication/UnknownShardException.java +++ b/src/java/org/apache/cassandra/replication/UnknownShardException.java @@ -15,22 +15,20 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.cassandra.replication; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; -import org.apache.cassandra.tcm.ownership.ReplicaGroups; public class UnknownShardException extends IllegalStateException { - public UnknownShardException(Token token, ReplicaGroups groups) + public UnknownShardException(Token token, String keyspace) { - super(String.format("Could not find token %s in %s", token, groups)); + super(String.format("Could not find token %s in shards for keyspace %s", token, keyspace)); } - public UnknownShardException(Range range, ReplicaGroups groups) + public UnknownShardException(Range range, String keyspace) { - super(String.format("Could not find range %s in %s", range, groups)); + super(String.format("Could not find range %s in shards for keyspace %s", range, keyspace)); } } diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java index ce6fee3b1813..6cf4ddf4e715 100644 --- a/src/java/org/apache/cassandra/service/StorageProxy.java +++ b/src/java/org/apache/cassandra/service/StorageProxy.java @@ -1112,7 +1112,8 @@ public static void commitPaxosTracked(Keyspace keyspace, Commit proposal, Consis // Generate mutation ID for tracked keyspace, preserving the commit subclass MutationId mutationId; - if (proposal.mutation.id().isNone()) + boolean mustAllocateMutationId = proposal.mutation.id().isNone(); + if (mustAllocateMutationId) { mutationId = MutationTrackingService.instance().nextMutationId(keyspaceName, tk); proposal = proposal.withMutationId(mutationId); @@ -1146,21 +1147,26 @@ public static void commitPaxosTracked(Keyspace keyspace, Commit proposal, Consis } // Execute local commit SYNCHRONOUSLY first to ensure journal write completes - if (localReplica != null) + try { - try + if (localReplica != null) { PaxosState.commitDirect(proposal); if (shouldBlock) responseHandler.onResponse(null); } - catch (Exception ex) - { - if (!(ex 
instanceof WriteTimeoutException)) - logger.error("Failed to apply paxos commit locally", ex); - if (shouldBlock) - responseHandler.onFailure(FBUtilities.getBroadcastAddressAndPort(), RequestFailure.forException(ex)); - } + } + catch (Exception ex) + { + if (!(ex instanceof WriteTimeoutException)) + logger.error("Failed to apply paxos commit locally", ex); + if (shouldBlock) + responseHandler.onFailure(FBUtilities.getBroadcastAddressAndPort(), RequestFailure.forException(ex)); + } + finally + { + if (mustAllocateMutationId) // 'release' the allocated mutation id + MutationTrackingService.instance().completeLocalWrite(mutationId); } // Now send to remote replicas diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosCommit.java b/src/java/org/apache/cassandra/service/paxos/PaxosCommit.java index 99311be12c8d..1658a89c8475 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosCommit.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosCommit.java @@ -115,6 +115,7 @@ static class Status final int required; final OnDone onDone; final boolean tracked; + final boolean allocatedMutationId; @Nullable final IntHashSet remoteReplicas; @@ -135,6 +136,7 @@ public PaxosCommit(Agreed commit, boolean allowHints, ConsistencyLevel consisten Agreed commitToUse = commit; IntHashSet remoteReplicas = null; + boolean allocatedMutationId = false; if (isTracked) { // Precondition: for tracked keyspaces, the local node must be a replica @@ -150,6 +152,7 @@ public PaxosCommit(Agreed commit, boolean allowHints, ConsistencyLevel consisten { Token token = commit.partitionKey().getToken(); MutationId mutationId = MutationTrackingService.instance().nextMutationId(commit.metadata().keyspace, token); + allocatedMutationId = true; Mutation mutationWithId = commit.makeMutation(mutationId); commitToUse = new Commit.Agreed(commit.ballot, mutationWithId); } @@ -176,6 +179,7 @@ else if (!commit.mutation.id().isNone()) } this.tracked = isTracked; + this.allocatedMutationId = 
allocatedMutationId; this.commit = commitToUse; this.allowHints = allowHints; this.consistencyForConsensus = consistencyForConsensus; @@ -428,7 +432,17 @@ public void executeOnSelf() { // For tracked keyspaces, local execution MUST succeed and write to journal. // Use direct execution instead of executeOnSelf to ensure we detect failures. - NoPayload response = RequestHandler.execute(commit); + NoPayload response; + try + { + response = RequestHandler.execute(commit); + } + finally + { + if (allocatedMutationId) + MutationTrackingService.instance().completeLocalWrite(commit.mutation.id()); + } + if (response == null) { throw new IllegalStateException(String.format( diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosPrepareRefresh.java b/src/java/org/apache/cassandra/service/paxos/PaxosPrepareRefresh.java index a29c09f15957..6a81cb170454 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosPrepareRefresh.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosPrepareRefresh.java @@ -154,6 +154,10 @@ private boolean generateMutationIdAndPersistLocally(Committed commit, List message) { logger.warn("Failed to execute local commit for tracked keyspace mutation {}", mutationId, e); } + finally + { + MutationTrackingService.instance().completeLocalWrite(mutationId); + } if (localResponse == null) { diff --git a/src/java/org/apache/cassandra/streaming/LogReceiveTask.java b/src/java/org/apache/cassandra/streaming/LogReceiveTask.java deleted file mode 100644 index ba83051b1921..000000000000 --- a/src/java/org/apache/cassandra/streaming/LogReceiveTask.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.streaming; - -import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.streaming.messages.IncomingMutationLogStreamMessage; -import org.apache.cassandra.streaming.messages.MutationLogReceivedMessage; - -/** - * Task for tracking reception of mutation log streams. - */ -public class LogReceiveTask extends LogStreamTask -{ - - public LogReceiveTask(StreamSession session, InetAddressAndPort peer) - { - super(session, peer); - } - - public synchronized void received(IncomingMutationLogStreamMessage message) - { - // TODO: validate message header with expected ranges - if (markCompleted()) - { - session.taskCompleted(this); - // Send acknowledgment on successful completion - session.sendControlMessage(new MutationLogReceivedMessage()).syncUninterruptibly(); - - } - } - - @Override - public void abort() - { - // cleanup if needed - } -} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/streaming/LogStreamHeader.java b/src/java/org/apache/cassandra/streaming/LogStreamHeader.java deleted file mode 100644 index c79e571fcaa1..000000000000 --- a/src/java/org/apache/cassandra/streaming/LogStreamHeader.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.cassandra.streaming; - -import java.io.IOException; -import java.util.Objects; - -import org.apache.cassandra.db.TypeSizes; -import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.io.util.DataOutputPlus; -import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.replication.ReconciledLogSnapshot; -import org.apache.cassandra.replication.Version; -import org.apache.cassandra.replication.VersionedSerializer; -import org.apache.cassandra.utils.TimeUUID; - -import static org.apache.cassandra.locator.InetAddressAndPort.Serializer.inetAddressAndPortSerializer; - -/** - * Header for mutation log stream messages containing both manifest and session metadata. 
- */ -public class LogStreamHeader -{ - public final LogStreamManifest manifest; - public final ReconciledLogSnapshot reconciled; - public final InetAddressAndPort sender; - public final TimeUUID planId; - public final int sessionIndex; - public final boolean sendByFollower; - - public LogStreamHeader(LogStreamManifest manifest, - ReconciledLogSnapshot reconciled, - InetAddressAndPort sender, - TimeUUID planId, - int sessionIndex, - boolean sendByFollower) - { - this.manifest = manifest; - this.reconciled = reconciled; - this.sender = sender; - this.planId = planId; - this.sessionIndex = sessionIndex; - this.sendByFollower = sendByFollower; - } - - public static final VersionedSerializer serializer = new VersionedSerializer<>() - { - @Override - public void serialize(LogStreamHeader header, DataOutputPlus out, Version version) throws IOException - { - LogStreamManifest.serializer.serialize(header.manifest, out, version); - ReconciledLogSnapshot.serializer.serialize(header.reconciled, out, version); - inetAddressAndPortSerializer.serialize(header.sender, out, version.messagingVersion()); - header.planId.serialize(out); - out.writeInt(header.sessionIndex); - out.writeBoolean(header.sendByFollower); - } - - @Override - public LogStreamHeader deserialize(DataInputPlus in, Version version) throws IOException - { - LogStreamManifest manifest = LogStreamManifest.serializer.deserialize(in, version); - ReconciledLogSnapshot reconciled = ReconciledLogSnapshot.serializer.deserialize(in, version); - InetAddressAndPort sender = inetAddressAndPortSerializer.deserialize(in, version.messagingVersion()); - TimeUUID planId = TimeUUID.deserialize(in); - int sessionIndex = in.readInt(); - boolean sendByFollower = in.readBoolean(); - return new LogStreamHeader(manifest, reconciled, sender, planId, sessionIndex, sendByFollower); - } - - @Override - public long serializedSize(LogStreamHeader header, Version version) - { - return 
LogStreamManifest.serializer.serializedSize(header.manifest, version) - + ReconciledLogSnapshot.serializer.serializedSize(header.reconciled, version) - + inetAddressAndPortSerializer.serializedSize(header.sender, version.messagingVersion()) - + TimeUUID.sizeInBytes() - + TypeSizes.sizeof(header.sessionIndex) - + TypeSizes.sizeof(header.sendByFollower); - } - }; - - @Override - public boolean equals(Object o) - { - if (o == null || getClass() != o.getClass()) return false; - LogStreamHeader that = (LogStreamHeader) o; - return sessionIndex == that.sessionIndex && - sendByFollower == that.sendByFollower && - Objects.equals(manifest, that.manifest) && - Objects.equals(sender, that.sender) && - Objects.equals(planId, that.planId) && - Objects.equals(reconciled, that.reconciled); - } - - @Override - public int hashCode() - { - return Objects.hash(manifest, sender, planId, sessionIndex, sendByFollower, reconciled); - } - - @Override - public String toString() - { - return String.format("LogStreamHeader{manifest=%s, sender=%s, planId=%s, sessionIndex=%d, sendByFollower=%s, reconciledOffsets=%s}", - manifest, sender, planId, sessionIndex, sendByFollower, reconciled); - } -} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/streaming/LogStreamManifest.java b/src/java/org/apache/cassandra/streaming/LogStreamManifest.java deleted file mode 100644 index 69f5c83a0969..000000000000 --- a/src/java/org/apache/cassandra/streaming/LogStreamManifest.java +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.cassandra.streaming; - -import java.io.IOException; -import java.util.Map; -import java.util.Objects; -import java.util.Set; - -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; - -import org.apache.cassandra.dht.Range; -import org.apache.cassandra.dht.Token; -import org.apache.cassandra.io.EmbeddedAsymmetricVersionedSerializer; -import org.apache.cassandra.io.IVersionedAsymmetricSerializer; -import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.io.util.DataOutputPlus; -import org.apache.cassandra.replication.Version; -import org.apache.cassandra.replication.VersionedSerializer; -import org.apache.cassandra.utils.StringSerializer; - -import static org.apache.cassandra.utils.CollectionSerializers.deserializeCollectionToConsumer; -import static org.apache.cassandra.utils.CollectionSerializers.deserializeMapToConsumer; -import static org.apache.cassandra.utils.CollectionSerializers.serializeCollection; -import static org.apache.cassandra.utils.CollectionSerializers.serializeMap; -import static org.apache.cassandra.utils.CollectionSerializers.serializedCollectionSize; -import static org.apache.cassandra.utils.CollectionSerializers.serializedMapSize; - -public class LogStreamManifest -{ - public final ImmutableMap>> keyspaceRanges; - - public LogStreamManifest(ImmutableMap>> keyspaceRanges) - { - this.keyspaceRanges = keyspaceRanges; - } - - public static LogStreamManifest create(Map>> keyspaceRanges) - { - ImmutableMap.Builder>> builder = 
ImmutableMap.builder(); - keyspaceRanges.forEach((keyspace, ranges) -> builder.put(keyspace, ImmutableSet.copyOf(ranges))); - return new LogStreamManifest(builder.build()); - } - - public static class Serializer implements VersionedSerializer - { - private static final VersionedSerializer> rangeSerializer = new VersionedSerializer<>() - { - @Override - public void serialize(Range range, DataOutputPlus out, Version version) throws IOException - { - Token.serializer.serialize(range.left, out, version.messagingVersion()); - Token.serializer.serialize(range.right, out, version.messagingVersion()); - } - - @Override - public Range deserialize(DataInputPlus in, Version version) throws IOException - { - return new Range<>( - Token.serializer.deserialize(in, version.messagingVersion()), - Token.serializer.deserialize(in, version.messagingVersion()) - ); - } - - @Override - public long serializedSize(Range range, Version version) - { - return Token.serializer.serializedSize(range.left, version.messagingVersion()) - + Token.serializer.serializedSize(range.right, version.messagingVersion()); - } - }; - - private static final VersionedSerializer>> rangeSetSerializer = new VersionedSerializer<>() - { - @Override - public void serialize(ImmutableSet> t, DataOutputPlus out, Version version) throws IOException - { - serializeCollection(t, out, version, rangeSerializer); - } - - @Override - public ImmutableSet> deserialize(DataInputPlus in, Version version) throws IOException - { - ImmutableSet.Builder> builder = ImmutableSet.builder(); - deserializeCollectionToConsumer(in, version, rangeSerializer, builder::add); - return builder.build(); - } - - @Override - public long serializedSize(ImmutableSet> t, Version version) - { - return serializedCollectionSize(t, version, rangeSerializer); - } - }; - - @Override - public void serialize(LogStreamManifest header, DataOutputPlus out, Version version) throws IOException - { - serializeMap(header.keyspaceRanges, out, version, 
StringSerializer.instance, rangeSetSerializer); - } - - @Override - public LogStreamManifest deserialize(DataInputPlus in, Version version) throws IOException - { - ImmutableMap.Builder>> builder = ImmutableMap.builder(); - deserializeMapToConsumer(in, version, StringSerializer.instance, rangeSetSerializer, builder::put); - return new LogStreamManifest(builder.build()); - } - - @Override - public long serializedSize(LogStreamManifest header, Version version) - { - return serializedMapSize(header.keyspaceRanges, version, StringSerializer.instance, rangeSetSerializer); - } - } - - public static final Serializer serializer = new Serializer(); - public static final IVersionedAsymmetricSerializer embedded = - EmbeddedAsymmetricVersionedSerializer.mtEmbedded(serializer); - - @Override - public boolean equals(Object o) - { - if (o == null || getClass() != o.getClass()) return false; - LogStreamManifest that = (LogStreamManifest) o; - return Objects.equals(keyspaceRanges, that.keyspaceRanges); - } - - @Override - public int hashCode() - { - return Objects.hashCode(keyspaceRanges); - } - - @Override - public String toString() - { - return String.format("MutationLogStreamHeader{keyspaceRanges=%s}", keyspaceRanges); - } -} diff --git a/src/java/org/apache/cassandra/streaming/LogStreamTask.java b/src/java/org/apache/cassandra/streaming/LogStreamTask.java deleted file mode 100644 index 342d3dd334d5..000000000000 --- a/src/java/org/apache/cassandra/streaming/LogStreamTask.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.streaming; - -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; - -import org.apache.cassandra.dht.Range; -import org.apache.cassandra.dht.Token; -import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.locator.RangesAtEndpoint; - -/** - * Base class for log streaming tasks that track mutation log transfers and receives. - */ -public abstract class LogStreamTask -{ - protected final StreamSession session; - protected final InetAddressAndPort peer; - protected boolean completed = false; - - private final Map>> keyspaceRanges = new HashMap<>(); - - public LogStreamTask(StreamSession session, InetAddressAndPort peer) - { - this.session = session; - this.peer = peer; - } - - public synchronized void addKeyspaceRanges(String keyspace, RangesAtEndpoint ranges) - { - addKeyspaceRanges(keyspace, ranges.ranges()); - } - - public synchronized void addKeyspaceRanges(String keyspace, Collection> ranges) - { - keyspaceRanges.computeIfAbsent(keyspace, k -> new HashSet<>()).addAll(ranges); - } - - public abstract void abort(); - - public boolean isCompleted() - { - return completed; - } - - public LogStreamManifest getManifest() - { - return LogStreamManifest.create(keyspaceRanges); - } - - protected boolean markCompleted() - { - if (completed) - return false; - completed = true; - return true; - } - - public InetAddressAndPort getPeer() - { - return peer; - } -} \ No newline at end of file diff --git 
a/src/java/org/apache/cassandra/streaming/LogTransferTask.java b/src/java/org/apache/cassandra/streaming/LogTransferTask.java deleted file mode 100644 index 4075b9b1a0ec..000000000000 --- a/src/java/org/apache/cassandra/streaming/LogTransferTask.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.streaming; - -import java.util.Collection; -import java.util.Collections; -import java.util.concurrent.ScheduledFuture; -import java.util.concurrent.TimeUnit; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.cassandra.concurrent.ScheduledExecutors; -import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.dht.Range; -import org.apache.cassandra.dht.Token; -import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.replication.MutationJournal; -import org.apache.cassandra.replication.ReconciledKeyspaceOffsets; -import org.apache.cassandra.replication.ReconciledLogSnapshot; -import org.apache.cassandra.streaming.messages.OutgoingMutationLogStreamMessage; -import org.apache.cassandra.utils.FBUtilities; - -/** - * Task for tracking sending of mutation log streams. 
- */ -public class LogTransferTask extends LogStreamTask -{ - private static final Logger logger = LoggerFactory.getLogger(LogTransferTask.class); - - private volatile ScheduledFuture timeoutFuture; - private final ReconciledLogSnapshot reconciled; - private final MutationJournal.Snapshot snapshot; - - public LogTransferTask(StreamSession session, InetAddressAndPort peer, ReconciledLogSnapshot reconciled, MutationJournal.Snapshot snapshot) - { - super(session, peer); - this.reconciled = reconciled; - this.snapshot = snapshot; - } - - public ReconciledKeyspaceOffsets reconciled(String keyspace, Collection> ranges) - { - ReconciledLogSnapshot subset = reconciled.select(Collections.singletonMap(keyspace, ranges)); - return subset.getKeyspace(keyspace); - } - - public OutgoingMutationLogStreamMessage getMessage(StreamSession session) - { - LogStreamManifest manifest = getManifest(); - - ReconciledLogSnapshot subset = reconciled.select(manifest.keyspaceRanges); - - LogStreamHeader header = new LogStreamHeader(manifest, - subset, - FBUtilities.getBroadcastAddressAndPort(), - session.planId(), - 0, - session.isFollower()); - logger.trace("[Stream #{}] Creating outgoing mutation log stream message for peer {}", session.planId(), peer); - return new OutgoingMutationLogStreamMessage(header, snapshot); - } - - private synchronized void cancelTimeout() - { - if (timeoutFuture != null) - { - timeoutFuture.cancel(false); - timeoutFuture = null; - } - } - - public synchronized void complete() - { - // Cancel timeout on successful completion - cancelTimeout(); - // TODO: validate message header with expected ranges - logger.trace("[Stream #{}] Log transfer task completed for peer {}", session.planId(), peer); - if (markCompleted()) - session.taskCompleted(this); - } - - public void scheduleTimeout() - { - timeoutFuture = ScheduledExecutors.nonPeriodicTasks.schedule(session::sessionTimeout, DatabaseDescriptor.getStreamTransferTaskTimeout().toMilliseconds(), TimeUnit.MILLISECONDS); 
- } - - public void timeout() - { - session.sessionTimeout(); - } - - @Override - public void abort() - { - cancelTimeout(); - } -} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/streaming/StreamPlan.java b/src/java/org/apache/cassandra/streaming/StreamPlan.java index fcd2e7f332a6..982d6cfe6e8a 100644 --- a/src/java/org/apache/cassandra/streaming/StreamPlan.java +++ b/src/java/org/apache/cassandra/streaming/StreamPlan.java @@ -28,10 +28,8 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.locator.Replica; -import org.apache.cassandra.replication.ReconciledKeyspaceOffsets; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.utils.TimeUUID; import static com.google.common.collect.Iterables.all; @@ -120,9 +118,6 @@ public StreamPlan requestRanges(InetAddressAndPort from, String keyspace, Ranges // TODO: add flag for fully reconciled data only if this is for a tracked keyspace session.addStreamRequest(keyspace, fullRanges, transientRanges, Arrays.asList(columnFamilies)); - if (includeMutationLogs(keyspace, session)) - session.addMutationLogRequest(keyspace, fullRanges, transientRanges); - return this; } @@ -138,20 +133,11 @@ public StreamPlan requestRanges(InetAddressAndPort from, String keyspace, Ranges public StreamPlan transferRanges(InetAddressAndPort to, String keyspace, RangesAtEndpoint replicas, String... columnFamilies) { StreamSession session = coordinator.getOrCreateOutboundSession(to); - ReconciledKeyspaceOffsets reconciledKeyspaceOffsets = includeMutationLogs(keyspace, session) - ? 
session.addMutationLogTransfer(keyspace, replicas) - : null; - - session.addTransferRanges(keyspace, replicas, Arrays.asList(columnFamilies), flushBeforeTransfer, reconciledKeyspaceOffsets); + session.addTransferRanges(keyspace, replicas, Arrays.asList(columnFamilies), flushBeforeTransfer); return this; } - private boolean includeMutationLogs(String keyspace, StreamSession session) - { - return isTrackedReplicationEnabled(keyspace) && session.getStreamOperation() != StreamOperation.REPAIR; - } - /** * Add transfer task to send given streams * @@ -274,15 +260,4 @@ public static boolean hasAccordTables(KeyspaceMetadata ksm) { return ksm.tables.stream().anyMatch(TableMetadata::requiresAccordSupport); } - - /** - * Check if the given keyspace uses tracked replication, which requires mutation log streaming. - * - * @param keyspace the keyspace name - * @return true if the keyspace uses tracked replication - */ - private boolean isTrackedReplicationEnabled(String keyspace) - { - return ClusterMetadata.current().schema.getKeyspaceMetadata(keyspace).useMutationTracking(); - } } diff --git a/src/java/org/apache/cassandra/streaming/StreamSession.java b/src/java/org/apache/cassandra/streaming/StreamSession.java index 941b29568735..9c259009ed5f 100644 --- a/src/java/org/apache/cassandra/streaming/StreamSession.java +++ b/src/java/org/apache/cassandra/streaming/StreamSession.java @@ -40,7 +40,6 @@ import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; @@ -66,19 +65,12 @@ import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.metrics.StreamingMetrics; -import org.apache.cassandra.replication.MutationJournal; -import org.apache.cassandra.replication.MutationTrackingService; -import 
org.apache.cassandra.replication.ReconciledKeyspaceOffsets; -import org.apache.cassandra.replication.ReconciledLogSnapshot; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.streaming.async.StreamingMultiplexedChannel; import org.apache.cassandra.streaming.messages.CompleteMessage; -import org.apache.cassandra.streaming.messages.IncomingMutationLogStreamMessage; import org.apache.cassandra.streaming.messages.IncomingStreamMessage; -import org.apache.cassandra.streaming.messages.MutationLogReceivedMessage; -import org.apache.cassandra.streaming.messages.OutgoingMutationLogStreamMessage; import org.apache.cassandra.streaming.messages.OutgoingStreamMessage; import org.apache.cassandra.streaming.messages.PrepareAckMessage; import org.apache.cassandra.streaming.messages.PrepareSynAckMessage; @@ -209,9 +201,6 @@ public enum PrepareDirection { SEND, ACK } protected final ConcurrentHashMap transfers = new ConcurrentHashMap<>(); // data receivers, filled after receiving prepare message private final Map receivers = new ConcurrentHashMap<>(); - // log streaming tasks - private LogTransferTask logTransfer = null; - private LogReceiveTask logReceive = null; private final StreamingMetrics metrics; final Map>> transferredRangesPerKeyspace = new HashMap<>(); @@ -455,30 +444,6 @@ public void addStreamRequest(String keyspace, RangesAtEndpoint fullRanges, Range requests.add(new StreamRequest(keyspace, fullRanges, transientRanges, columnFamilies)); } - /** - * Request mutation log data from this session. 
- * - * @param keyspace Requesting keyspace - * @param fullRanges Ranges to retrieve mutation logs for - * @param transientRanges Ranges to retrieve mutation logs for - */ - public synchronized void addMutationLogRequest(String keyspace, RangesAtEndpoint fullRanges, RangesAtEndpoint transientRanges) - { - //It should either be a dummy address for repair or if it's a bootstrap/move/rebuild it should be this node - assert all(fullRanges, Replica::isSelf) || RangesAtEndpoint.isDummyList(fullRanges) : fullRanges.toString(); - assert all(transientRanges, Replica::isSelf) || RangesAtEndpoint.isDummyList(transientRanges) : transientRanges.toString(); - - // Create log receive task for the combined ranges - RangesAtEndpoint allRanges = RangesAtEndpoint.concat(fullRanges, transientRanges); - - if (logReceive == null) - { - logReceive = new LogReceiveTask(this, peer); - logger.trace("[Stream #{}] Created log receive task for peer {}", planId(), peer); - } - logReceive.addKeyspaceRanges(keyspace, allRanges); - } - /** * Set up transfer for specific keyspace/ranges/CFs * @@ -487,7 +452,7 @@ public synchronized void addMutationLogRequest(String keyspace, RangesAtEndpoint * @param columnFamilies Transfer ColumnFamilies * @param flushTables flush tables? */ - synchronized void addTransferRanges(String keyspace, RangesAtEndpoint replicas, Collection columnFamilies, boolean flushTables, ReconciledKeyspaceOffsets reconciledKeyspaceOffsets) + synchronized void addTransferRanges(String keyspace, RangesAtEndpoint replicas, Collection columnFamilies, boolean flushTables) { failIfFinished(); Collection stores = getColumnFamilyStores(keyspace, columnFamilies); @@ -498,7 +463,7 @@ synchronized void addTransferRanges(String keyspace, RangesAtEndpoint replicas, //Do we need to unwrap here also or is that just making it worse? 
//Range and if it's transient RangesAtEndpoint unwrappedRanges = replicas.unwrap(); - List streams = getOutgoingStreamsForRanges(unwrappedRanges, stores, pendingRepair, previewKind, reconciledKeyspaceOffsets); + List streams = getOutgoingStreamsForRanges(unwrappedRanges, stores, pendingRepair, previewKind); addTransferStreams(streams); Set> toBeUpdated = transferredRangesPerKeyspace.get(keyspace); @@ -510,46 +475,6 @@ synchronized void addTransferRanges(String keyspace, RangesAtEndpoint replicas, transferredRangesPerKeyspace.put(keyspace, toBeUpdated); } - private LogTransferTask createLogTransferTask() - { - ReconciledLogSnapshot reconciled = MutationTrackingService.instance().snapshotReconciledLogs(); - - // TODO: consider tradeoffs of eagerly reading the index of each segment and filtering out the ones that - // only contain fully reconciled ids vs just filtering out fully reconciled when reading out of the - // snapshot for streaming - MutationJournal.Snapshot snapshot = MutationJournal.instance().snapshot(); - try - { - // TODO: grab references to all current segments and the relevant reconciled sets - // TODO: Journal has a select and reference method we could use - LogTransferTask task = new LogTransferTask(this, peer, reconciled, snapshot); - logger.trace("[Stream #{}] Created log transfer task for peer {}", planId(), peer); - return task; - } - catch (Throwable t) - { - snapshot.close(); - throw t; - } - } - /** - * Set up mutation log transfer for specific keyspace and ranges. 
- * - * @param keyspace Transfer keyspace - * @param replicas Transfer ranges - */ - synchronized ReconciledKeyspaceOffsets addMutationLogTransfer(String keyspace, RangesAtEndpoint replicas) - { - failIfFinished(); - - if (logTransfer == null) - logTransfer = createLogTransferTask(); - - Collection> ranges = replicas.ranges(); - logTransfer.addKeyspaceRanges(keyspace, ranges); - return logTransfer.reconciled(keyspace, ranges); - } - private void failIfFinished() { if (state().isFinalState()) @@ -573,14 +498,14 @@ private Collection getColumnFamilyStores(String keyspace, Col } @VisibleForTesting - public List getOutgoingStreamsForRanges(RangesAtEndpoint replicas, Collection stores, TimeUUID pendingRepair, PreviewKind previewKind, ReconciledKeyspaceOffsets reconciledKeyspaceOffsets) + public List getOutgoingStreamsForRanges(RangesAtEndpoint replicas, Collection stores, TimeUUID pendingRepair, PreviewKind previewKind) { List streams = new ArrayList<>(); try { for (ColumnFamilyStore cfs: stores) { - streams.addAll(cfs.getStreamManager().createOutgoingStreams(this, replicas, pendingRepair, previewKind, reconciledKeyspaceOffsets)); + streams.addAll(cfs.getStreamManager().createOutgoingStreams(this, replicas, pendingRepair, previewKind)); } } catch (Throwable t) @@ -664,8 +589,6 @@ private void abortTasks() { receivers.values().forEach(StreamReceiveTask::abort); transfers.values().forEach(StreamTransferTask::abort); - if (logReceive != null) logReceive.abort(); - if (logTransfer != null) logTransfer.abort(); } catch (Exception e) { @@ -735,7 +658,7 @@ public synchronized void messageReceived(StreamMessage message) case PREPARE_SYN: // at follower PrepareSynMessage msg = (PrepareSynMessage) message; - prepare(msg.requests, msg.summaries, msg.logRequest, msg.logSummary); + prepare(msg.requests, msg.summaries); break; case PREPARE_SYNACK: // at initiator @@ -762,14 +685,6 @@ public synchronized void messageReceived(StreamMessage message) case SESSION_FAILED: sessionFailed(); 
break; - case MUTATION_LOG_STREAM: - MutationTrackingService.ensureEnabled(); - receiveMutationLog((IncomingMutationLogStreamMessage) message); - break; - case MUTATION_LOG_RECEIVED: - MutationTrackingService.ensureEnabled(); - mutationLogReceived((MutationLogReceivedMessage) message); - break; default: throw new AssertionError("unhandled StreamMessage type: " + message.getClass().getName()); } @@ -789,9 +704,6 @@ public void onInitializationComplete() prepare.summaries.add(task.getSummary()); } - prepare.logRequest = logReceive != null ? logReceive.getManifest() : null; - prepare.logSummary = logTransfer != null ? logTransfer.getManifest() : null; - sendControlMessage(prepare).syncUninterruptibly(); } @@ -867,15 +779,14 @@ private void logError(Throwable e) * * @return the prepare future for testing */ - public Future prepare(Collection requests, Collection summaries, - LogStreamManifest logRequest, LogStreamManifest logSummary) + public Future prepare(Collection requests, Collection summaries) { // prepare tasks state(State.PREPARING); return ScheduledExecutors.nonPeriodicTasks.submit(() -> { try { - prepareAsync(requests, summaries, logRequest, logSummary); + prepareAsync(requests, summaries); return null; } catch (Exception e) @@ -896,8 +807,7 @@ public void countStreamedIn(boolean isEntireSSTable) * so the logic should not execute on the main IO thread (read: netty event loop). 
*/ @VisibleForTesting - void prepareAsync(Collection requests, Collection summaries, - LogStreamManifest logRequest, LogStreamManifest logSummary) + void prepareAsync(Collection requests, Collection summaries) { if (StreamOperation.REPAIR == streamOperation()) checkAvailableDiskSpaceAndCompactions(summaries); @@ -905,20 +815,11 @@ void prepareAsync(Collection requests, Collection for (StreamSummary summary : summaries) prepareReceiving(summary); - // Process mutation log manifests - if (logRequest != null) - prepareLogTransferring(logRequest); - if (logSummary != null) - prepareLogReceiving(logSummary); - PrepareSynAckMessage prepareSynAck = new PrepareSynAckMessage(); if (!peer.equals(FBUtilities.getBroadcastAddressAndPort())) for (StreamTransferTask task : transfers.values()) prepareSynAck.summaries.add(task.getSummary()); - // Include mutation log summary if we have log transfer task - prepareSynAck.logSummary = logTransfer != null ? logTransfer.getManifest() : null; - streamResult.handleSessionPrepared(this, PrepareDirection.SEND); // After sending the message the initiator can close the channel which will cause a ClosedChannelException // in buffer logic, this then gets sent to onError which validates the state isFinalState, if not fails @@ -949,10 +850,6 @@ private void prepareSynAck(PrepareSynAckMessage msg) sendControlMessage(new PrepareAckMessage()).syncUninterruptibly(); } - // Process mutation log summary if present - if (msg.logSummary != null) - prepareLogReceiving(msg.logSummary); - if (isPreview()) completePreview(); else @@ -987,13 +884,7 @@ private void processStreamRequests(Collection requests) { RangesAtEndpoint allRangesAtEndpoint = RangesAtEndpoint.concat(req.full, req.transientReplicas); if (ownedRanges.validateRangeRequest(allRangesAtEndpoint.ranges(), "Stream #" + planId(), "stream request", peer)) - { - - ReconciledKeyspaceOffsets reconciledKeyspaceOffsets = null; - if (logTransfer != null) - reconciledKeyspaceOffsets = 
logTransfer.reconciled(req.keyspace, allRangesAtEndpoint.ranges()); - addTransferRanges(req.keyspace, allRangesAtEndpoint, req.columnFamilies, true, reconciledKeyspaceOffsets); // always flush on stream request - } + addTransferRanges(req.keyspace, allRangesAtEndpoint, req.columnFamilies, true); // always flush on stream request else rejectedRequests.add(req); }); @@ -1003,28 +894,6 @@ private void processStreamRequests(Collection requests) throw new StreamRequestOutOfTokenRangeException(rejectedRequests); } - private void prepareLogReceiving(LogStreamManifest manifest) - { - // Create log receive task based on manifest - if (logReceive == null) - logReceive = new LogReceiveTask(this, peer); - - // Add keyspace ranges from manifest - manifest.keyspaceRanges.forEach((keyspace, ranges) -> - logReceive.addKeyspaceRanges(keyspace, ranges)); - } - - private void prepareLogTransferring(LogStreamManifest manifest) - { - // Create log transfer task based on manifest - if (logTransfer == null) - logTransfer = createLogTransferTask(); - - // Add keyspace ranges from manifest - manifest.keyspaceRanges.forEach((keyspace, ranges) -> - logTransfer.addKeyspaceRanges(keyspace, ranges)); - } - /** * In the case where we have an error checking disk space we allow the Operation to continue. * In the case where we do _not_ have available space, this method raises a RTE. @@ -1207,18 +1076,7 @@ public void streamSent(OutgoingStreamMessage message) } } - /** - * Call back after sending OutgoingMutationLogStreamMessage. - * - * @param message sent mutation log stream message - */ - public void logStreamSent(OutgoingMutationLogStreamMessage message) - { - if (logTransfer != null) - logTransfer.scheduleTimeout(); - } - - /** +/** * Call back after receiving a stream. * * @param message received stream @@ -1259,26 +1117,6 @@ public void receive(IncomingStreamMessage message) } } - /** - * Call back after receiving a mutation log stream. 
- * - * @param message received mutation log stream - */ - public void receiveMutationLog(IncomingMutationLogStreamMessage message) - { - if (isPreview()) - { - throw new RuntimeException(String.format("[Stream #%s] Cannot receive mutation log stream for preview session", planId())); - } - - logger.debug("[Stream #{}] Received {}", planId(), message); - // Mutations are already applied during deserialization - - // Create and track the log receive task, then let it handle the message - if (logReceive != null) - logReceive.received(message); - } - public void progress(String filename, ProgressInfo.Direction direction, long bytes, long delta, long total) { if (delta < 0) @@ -1294,12 +1132,6 @@ public void received(TableId tableId, int sequenceNumber) transfers.get(tableId).complete(sequenceNumber); } - public void mutationLogReceived(MutationLogReceivedMessage message) - { - if (logTransfer != null) - logTransfer.complete(); - } - /** * Check if session is completed on receiving {@code StreamMessage.Type.COMPLETE} message. 
*/ @@ -1329,12 +1161,6 @@ private synchronized boolean maybeCompleted() if (!transfers.isEmpty()) return false; - if (logReceive != null && !logReceive.isCompleted()) - return false; - - if (logTransfer != null && !logTransfer.isCompleted()) - return false; - // if already executed once, skip it if (maybeCompleted) return true; @@ -1415,22 +1241,6 @@ public synchronized void taskCompleted(StreamTransferTask completedTask) maybeCompleted(); } - public synchronized void taskCompleted(LogReceiveTask completedTask) - { - Preconditions.checkState(logReceive == completedTask); - logger.trace("[Stream #{}] Log receive task completed, clearing reference", planId()); - logReceive = null; - maybeCompleted(); - } - - public synchronized void taskCompleted(LogTransferTask completedTask) - { - Preconditions.checkState(logTransfer == completedTask); - logger.trace("[Stream #{}] Log transfer task completed, clearing reference", planId()); - logTransfer = null; - maybeCompleted(); - } - private void completePreview() { try @@ -1445,9 +1255,6 @@ private void completePreview() // expected streaming, but don't leak any resources held by the task for (StreamTask task : Iterables.concat(receivers.values(), transfers.values())) task.abort(); - - if (logReceive != null) logReceive.abort(); - if (logTransfer != null) logTransfer.abort(); } } @@ -1478,8 +1285,6 @@ private void startStreamingFiles(@Nullable PrepareDirection prepareDirection) state(State.STREAMING); - startLogStreaming(); - for (StreamTransferTask task : transfers.values()) { Collection messages = task.getFileMessages(); @@ -1501,14 +1306,6 @@ private void startStreamingFiles(@Nullable PrepareDirection prepareDirection) maybeCompleted(); } - private void startLogStreaming() - { - if (logTransfer != null) - { - sendControlMessage(logTransfer.getMessage(this)); - } - } - @VisibleForTesting public int getNumRequests() { diff --git a/src/java/org/apache/cassandra/streaming/TableStreamManager.java 
b/src/java/org/apache/cassandra/streaming/TableStreamManager.java index e4145415d8aa..d19064c9577e 100644 --- a/src/java/org/apache/cassandra/streaming/TableStreamManager.java +++ b/src/java/org/apache/cassandra/streaming/TableStreamManager.java @@ -24,7 +24,6 @@ import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.locator.RangesAtEndpoint; -import org.apache.cassandra.replication.ReconciledKeyspaceOffsets; import org.apache.cassandra.streaming.messages.StreamMessageHeader; import org.apache.cassandra.utils.TimeUUID; @@ -56,6 +55,5 @@ public interface TableStreamManager Collection createOutgoingStreams(StreamSession session, RangesAtEndpoint replicas, TimeUUID pendingRepair, - PreviewKind previewKind, - ReconciledKeyspaceOffsets reconciledKeyspaceOffsets); + PreviewKind previewKind); } diff --git a/src/java/org/apache/cassandra/streaming/async/StreamingMultiplexedChannel.java b/src/java/org/apache/cassandra/streaming/async/StreamingMultiplexedChannel.java index 4c0a2d9bbd72..84841ad43df3 100644 --- a/src/java/org/apache/cassandra/streaming/async/StreamingMultiplexedChannel.java +++ b/src/java/org/apache/cassandra/streaming/async/StreamingMultiplexedChannel.java @@ -45,7 +45,6 @@ import org.apache.cassandra.streaming.StreamingDataOutputPlus; import org.apache.cassandra.streaming.messages.IncomingStreamMessage; import org.apache.cassandra.streaming.messages.KeepAliveMessage; -import org.apache.cassandra.streaming.messages.OutgoingMutationLogStreamMessage; import org.apache.cassandra.streaming.messages.OutgoingStreamMessage; import org.apache.cassandra.streaming.messages.StreamMessage; import org.apache.cassandra.utils.concurrent.ImmediateFuture; @@ -214,7 +213,7 @@ public Future sendMessage(StreamingChannel channel, StreamMessage message) if (closed) throw new RuntimeException("stream has been closed, cannot send " + message); - if (message instanceof OutgoingStreamMessage || message instanceof 
OutgoingMutationLogStreamMessage) + if (message instanceof OutgoingStreamMessage) { if (session.isPreview()) throw new RuntimeException("Cannot send stream data messages for preview streaming sessions"); @@ -222,9 +221,7 @@ public Future sendMessage(StreamingChannel channel, StreamMessage message) logger.debug("{} Sending {}", createLogTag(session), message); InetAddressAndPort connectTo = factory.supportsPreferredIp() ? SystemKeyspace.getPreferredIP(to) : to; - FileStreamTask task = message instanceof OutgoingStreamMessage - ? new FileStreamTask((OutgoingStreamMessage) message, connectTo) - : new FileStreamTask((OutgoingMutationLogStreamMessage) message, connectTo); + FileStreamTask task = new FileStreamTask((OutgoingStreamMessage) message, connectTo); return fileTransferExecutor.submit(task); } @@ -299,12 +296,6 @@ private FileStreamTask(OutgoingStreamMessage ofm, InetAddressAndPort connectTo) this.connectTo = connectTo; } - FileStreamTask(OutgoingMutationLogStreamMessage ofm, InetAddressAndPort connectTo) - { - this.msg = ofm; - this.connectTo = connectTo; - } - /** * For testing purposes */ diff --git a/src/java/org/apache/cassandra/streaming/messages/IncomingMutationLogStreamMessage.java b/src/java/org/apache/cassandra/streaming/messages/IncomingMutationLogStreamMessage.java deleted file mode 100644 index e01e64724e00..000000000000 --- a/src/java/org/apache/cassandra/streaming/messages/IncomingMutationLogStreamMessage.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.streaming.messages; - -import java.io.IOException; -import java.nio.ByteBuffer; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.cassandra.db.Mutation; -import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.replication.MutationTrackingService; -import org.apache.cassandra.replication.Version; -import org.apache.cassandra.streaming.LogStreamHeader; -import org.apache.cassandra.streaming.StreamManager; -import org.apache.cassandra.streaming.StreamReceiveException; -import org.apache.cassandra.streaming.StreamSession; -import org.apache.cassandra.streaming.StreamingChannel; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; -import org.apache.cassandra.utils.ByteBufferUtil; - -/** - * Incoming mutation log stream message for receiving mutation logs during bootstrap. - * - * This message handles deserialization and processing of mutation logs received from - * nodes during the bootstrap process. 
- */ -public class IncomingMutationLogStreamMessage extends MutationLogStreamMessage -{ - private static final Logger logger = LoggerFactory.getLogger(IncomingMutationLogStreamMessage.class); - - public static final StreamMessage.Serializer serializer = new IncomingMutationLogStreamMessageSerializer(); - - public final StreamSession session; - - public IncomingMutationLogStreamMessage(LogStreamHeader header, StreamSession session) - { - super(header); - this.session = session; - } - - @Override - public StreamSession getOrCreateAndAttachInboundSession(StreamingChannel channel, int messagingVersion) - { - session.attachInbound(channel); - return session; - } - - public static class IncomingMutationLogStreamMessageSerializer implements StreamMessage.Serializer - { - @Override - public IncomingMutationLogStreamMessage deserialize(DataInputPlus in, int messagingVersion) throws IOException - { - Version version = Version.serializer.deserialize(in); - LogStreamHeader header = LogStreamHeader.serializer.deserialize(in, version); - - StreamSession session = StreamManager.instance.findSession(header.sender, header.planId, header.sessionIndex, header.sendByFollower); - if (session == null) - throw new IllegalStateException(String.format("unknown stream session: %s - %d", header.planId, header.sessionIndex)); - - try - { - while (in.readBoolean()) - { - int userVersion = in.readInt(); - ByteBuffer buffer = ByteBufferUtil.readWithVIntLength(in); - Mutation mutation = Mutation.serializer.deserialize(buffer, userVersion); - - if (logger.isTraceEnabled()) - logger.trace("Received mutation {}: session={}, keyspace={}, token={}", - mutation.id(), - session.planId(), - mutation.getKeyspaceName(), - mutation.key().getToken()); - - mutation.apply(); - } - - MutationTrackingService.instance().recordFullyReconciledOffsets(header.reconciled); - - return new IncomingMutationLogStreamMessage(header, session); - } - catch (Throwable t) - { - if (t instanceof StreamReceiveException) - throw 
(StreamReceiveException) t; - throw new StreamReceiveException(session, t); - } - } - - @Override - public void serialize(IncomingMutationLogStreamMessage message, StreamingDataOutputPlus out, int version, StreamSession session) - { - throw new UnsupportedOperationException("Not allowed to call serialize on an incoming stream"); - } - - @Override - public long serializedSize(IncomingMutationLogStreamMessage message, int version) - { - throw new UnsupportedOperationException("Not allowed to call serializedSize on an incoming stream"); - } - } -} diff --git a/src/java/org/apache/cassandra/streaming/messages/MutationLogReceivedMessage.java b/src/java/org/apache/cassandra/streaming/messages/MutationLogReceivedMessage.java deleted file mode 100644 index a1f887b042d0..000000000000 --- a/src/java/org/apache/cassandra/streaming/messages/MutationLogReceivedMessage.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.cassandra.streaming.messages; - -import java.io.IOException; - -import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.replication.Version; -import org.apache.cassandra.streaming.StreamSession; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; - -public class MutationLogReceivedMessage extends StreamMessage -{ - public static Serializer serializer = new Serializer<>() - { - @Override - public MutationLogReceivedMessage deserialize(DataInputPlus in, int version) throws IOException - { - Version ignore = Version.serializer.deserialize(in); - return new MutationLogReceivedMessage(); - } - - @Override - public void serialize(MutationLogReceivedMessage message, StreamingDataOutputPlus out, int version, StreamSession session) throws IOException - { - Version.serializer.serialize(Version.CLUSTER_SAFE_VERSION, out); - } - - @Override - public long serializedSize(MutationLogReceivedMessage message, int version) - { - return Version.serializer.serializedSize(Version.CLUSTER_SAFE_VERSION); - } - }; - - public MutationLogReceivedMessage() - { - super(Type.MUTATION_LOG_RECEIVED); - } - - @Override - public String toString() - { - return "MutationLogReceived"; - } -} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/streaming/messages/MutationLogStreamMessage.java b/src/java/org/apache/cassandra/streaming/messages/MutationLogStreamMessage.java deleted file mode 100644 index 4f11fb8b7118..000000000000 --- a/src/java/org/apache/cassandra/streaming/messages/MutationLogStreamMessage.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.streaming.messages; - -import org.apache.cassandra.streaming.LogStreamHeader; - -public abstract class MutationLogStreamMessage extends StreamMessage -{ - public final LogStreamHeader header; - - protected MutationLogStreamMessage(LogStreamHeader header) - { - super(Type.MUTATION_LOG_STREAM); - this.header = header; - } - - @Override - public String toString() - { - return String.format("%s{header=%s}", getClass().getSimpleName(), header); - } -} \ No newline at end of file diff --git a/src/java/org/apache/cassandra/streaming/messages/OutgoingMutationLogStreamMessage.java b/src/java/org/apache/cassandra/streaming/messages/OutgoingMutationLogStreamMessage.java deleted file mode 100644 index 27d334728101..000000000000 --- a/src/java/org/apache/cassandra/streaming/messages/OutgoingMutationLogStreamMessage.java +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.cassandra.streaming.messages; - -import java.io.IOException; -import java.util.Set; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.cassandra.db.DecoratedKey; -import org.apache.cassandra.db.Mutation; -import org.apache.cassandra.db.rows.DeserializationHelper; -import org.apache.cassandra.dht.Range; -import org.apache.cassandra.dht.Token; -import org.apache.cassandra.io.util.DataInputBuffer; -import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.replication.MutationJournal; -import org.apache.cassandra.replication.Version; -import org.apache.cassandra.schema.TableMetadata; -import org.apache.cassandra.streaming.LogStreamHeader; -import org.apache.cassandra.streaming.StreamSession; -import org.apache.cassandra.streaming.StreamingDataOutputPlus; -import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.cassandra.utils.Pair; - -public class OutgoingMutationLogStreamMessage extends MutationLogStreamMessage -{ - private static final Logger logger = LoggerFactory.getLogger(OutgoingMutationLogStreamMessage.class); - - public static final StreamMessage.Serializer serializer = new OutgoingMutationLogStreamMessageSerializer(); - - private final MutationJournal.Snapshot snapshot; - - public OutgoingMutationLogStreamMessage(LogStreamHeader header, MutationJournal.Snapshot snapshot) - { - super(header); - this.snapshot = snapshot; - } - - public void serialize(StreamingDataOutputPlus out, int version, StreamSession session) throws IOException - { - 
Version.serializer.serialize(Version.CLUSTER_SAFE_VERSION, out); - LogStreamHeader.serializer.serialize(header, out, Version.CLUSTER_SAFE_VERSION); - - try - { - // Stream mutations using the journal readAll method and filter by keyspace and token ranges - snapshot.readAll((segment, position, key, buffer, userVersion) -> { - try (DataInputBuffer in = new DataInputBuffer(buffer, true)) - { - Pair keyAndTableMetadata = Mutation.serializer.deserializeKeyAndTableMetadata(in, userVersion, DeserializationHelper.Flag.LOCAL); - DecoratedKey dk = keyAndTableMetadata.left; - String keyspace = keyAndTableMetadata.right.keyspace; - - // don't send fully reconciled mutations - if (header.reconciled.isFullyReconciled(keyspace, key)) - return; - - // Check if the mutation's keyspace and token are in our ranges - Set> ranges = header.manifest.keyspaceRanges.get(keyspace); - - if (ranges == null) - { - if (logger.isTraceEnabled()) - logger.trace("Mutation {} not sent: keyspace {} not in manifest ranges for session {}", key, keyspace, session.planId()); - return; - } - - if (!Range.isInRanges(dk.getToken(), ranges)) - { - if (logger.isTraceEnabled()) - logger.trace("Mutation {} not sent: token {} not in ranges for keyspace {} in session {}", key, dk.getToken(), keyspace, session.planId()); - return; - } - - if (logger.isTraceEnabled()) - logger.trace("Sending mutation {}: keyspace={}, token={}, session={}", key, keyspace, dk.getToken(), session.planId()); - - out.writeBoolean(true); - out.writeInt(userVersion); - ByteBufferUtil.writeWithVIntLength(buffer, out); - } - catch (IOException e) - { - throw new RuntimeException(e); - } - }); - } - finally - { - snapshot.close(); - } - - // end-of-stream marker - out.writeBoolean(false); - - session.logStreamSent(this); - } - - public long serializedSize(int version) - { - return 0; - } - - public static class OutgoingMutationLogStreamMessageSerializer implements StreamMessage.Serializer - { - @Override - public 
OutgoingMutationLogStreamMessage deserialize(DataInputPlus in, int version) - { - throw new UnsupportedOperationException("Not allowed to call deserialize on an outgoing stream"); - } - - @Override - public void serialize(OutgoingMutationLogStreamMessage message, StreamingDataOutputPlus out, int version, StreamSession session) throws IOException - { - message.serialize(out, version, session); - } - - @Override - public long serializedSize(OutgoingMutationLogStreamMessage message, int version) - { - return message.serializedSize(version); - } - } -} diff --git a/src/java/org/apache/cassandra/streaming/messages/PrepareSynAckMessage.java b/src/java/org/apache/cassandra/streaming/messages/PrepareSynAckMessage.java index e6e646371d52..cd437cca8f0f 100644 --- a/src/java/org/apache/cassandra/streaming/messages/PrepareSynAckMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/PrepareSynAckMessage.java @@ -23,8 +23,6 @@ import java.util.Collection; import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.net.MessagingService; -import org.apache.cassandra.streaming.LogStreamManifest; import org.apache.cassandra.streaming.StreamSession; import org.apache.cassandra.streaming.StreamSummary; import org.apache.cassandra.streaming.StreamingDataOutputPlus; @@ -38,13 +36,6 @@ public void serialize(PrepareSynAckMessage message, StreamingDataOutputPlus out, out.writeInt(message.summaries.size()); for (StreamSummary summary : message.summaries) StreamSummary.serializer.serialize(summary, out, version); - // log summary (optional, added in version 52) - if (version >= MessagingService.VERSION_61) - { - out.writeBoolean(message.logSummary != null); - if (message.logSummary != null) - LogStreamManifest.embedded.serialize(message.logSummary, out, version); - } } public PrepareSynAckMessage deserialize(DataInputPlus input, int version) throws IOException @@ -53,12 +44,6 @@ public PrepareSynAckMessage deserialize(DataInputPlus input, int version) throws 
int numSummaries = input.readInt(); for (int i = 0; i < numSummaries; i++) message.summaries.add(StreamSummary.serializer.deserialize(input, version)); - // log summary (optional, added in version 52) - if (version >= MessagingService.VERSION_61) - { - if (input.readBoolean()) - message.logSummary = LogStreamManifest.embedded.deserialize(input, version); - } return message; } @@ -67,13 +52,6 @@ public long serializedSize(PrepareSynAckMessage message, int version) long size = 4; // count of summaries for (StreamSummary summary : message.summaries) size += StreamSummary.serializer.serializedSize(summary, version); - // log summary (optional, added in version 52) - if (version >= MessagingService.VERSION_61) - { - size += 1; // boolean for logSummary presence - if (message.logSummary != null) - size += LogStreamManifest.embedded.serializedSize(message.logSummary, version); - } return size; } }; @@ -83,11 +61,6 @@ public long serializedSize(PrepareSynAckMessage message, int version) */ public final Collection summaries = new ArrayList<>(); - /** - * Optional summary of log stream tx - */ - public LogStreamManifest logSummary = null; - public PrepareSynAckMessage() { super(Type.PREPARE_SYNACK); diff --git a/src/java/org/apache/cassandra/streaming/messages/PrepareSynMessage.java b/src/java/org/apache/cassandra/streaming/messages/PrepareSynMessage.java index ce698d763462..c856f469838f 100644 --- a/src/java/org/apache/cassandra/streaming/messages/PrepareSynMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/PrepareSynMessage.java @@ -22,8 +22,6 @@ import java.util.Collection; import org.apache.cassandra.io.util.DataInputPlus; -import org.apache.cassandra.net.MessagingService; -import org.apache.cassandra.streaming.LogStreamManifest; import org.apache.cassandra.streaming.StreamRequest; import org.apache.cassandra.streaming.StreamSession; import org.apache.cassandra.streaming.StreamSummary; @@ -44,16 +42,6 @@ public PrepareSynMessage deserialize(DataInputPlus 
input, int version) throws IO int numSummaries = input.readInt(); for (int i = 0; i < numSummaries; i++) message.summaries.add(StreamSummary.serializer.deserialize(input, version)); - - if (version >= MessagingService.VERSION_61) - { - if (input.readBoolean()) - message.logRequest = LogStreamManifest.embedded.deserialize(input, version); - - if (input.readBoolean()) - message.logSummary = LogStreamManifest.embedded.deserialize(input, version); - } - return message; } @@ -64,16 +52,6 @@ public long serializedSize(PrepareSynMessage message, int version) size += StreamRequest.serializer.serializedSize(request, version); for (StreamSummary summary : message.summaries) size += StreamSummary.serializer.serializedSize(summary, version); - // log request and summary (optional, added in version 52) - if (version >= MessagingService.VERSION_61) - { - size += 1; // boolean for logRequest presence - if (message.logRequest != null) - size += LogStreamManifest.embedded.serializedSize(message.logRequest, version); - size += 1; // boolean for logSummary presence - if (message.logSummary != null) - size += LogStreamManifest.embedded.serializedSize(message.logSummary, version); - } return size; } @@ -87,16 +65,6 @@ public void serialize(PrepareSynMessage message, StreamingDataOutputPlus out, in out.writeInt(message.summaries.size()); for (StreamSummary summary : message.summaries) StreamSummary.serializer.serialize(summary, out, version); - // log request and summary (optional, added in version 52) - if (version >= MessagingService.VERSION_61) - { - out.writeBoolean(message.logRequest != null); - if (message.logRequest != null) - LogStreamManifest.embedded.serialize(message.logRequest, out, version); - out.writeBoolean(message.logSummary != null); - if (message.logSummary != null) - LogStreamManifest.embedded.serialize(message.logSummary, out, version); - } } }; @@ -110,16 +78,6 @@ public void serialize(PrepareSynMessage message, StreamingDataOutputPlus out, in */ public final 
Collection summaries = new ArrayList<>(); - /** - * Optional request for log stream - */ - public LogStreamManifest logRequest = null; - - /** - * Optional summary of log stream tx - */ - public LogStreamManifest logSummary = null; - public PrepareSynMessage() { super(Type.PREPARE_SYN); diff --git a/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java b/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java index 4bc494bef612..cb43e263b6a8 100644 --- a/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java +++ b/src/java/org/apache/cassandra/streaming/messages/StreamMessage.java @@ -69,9 +69,7 @@ public enum Type KEEP_ALIVE (7, 5, KeepAliveMessage.serializer ), PREPARE_SYNACK (8, 5, PrepareSynAckMessage.serializer), PREPARE_ACK (9, 5, PrepareAckMessage.serializer ), - STREAM_INIT (10, 5, StreamInitMessage.serializer ), - MUTATION_LOG_STREAM (11, 0, IncomingMutationLogStreamMessage.serializer, OutgoingMutationLogStreamMessage.serializer), - MUTATION_LOG_RECEIVED(12, 4, MutationLogReceivedMessage.serializer); + STREAM_INIT (10, 5, StreamInitMessage.serializer ); private static final Map idToTypeMap; diff --git a/src/java/org/apache/cassandra/tcm/Startup.java b/src/java/org/apache/cassandra/tcm/Startup.java index 5a462665cce3..0b4f5f7b82dc 100644 --- a/src/java/org/apache/cassandra/tcm/Startup.java +++ b/src/java/org/apache/cassandra/tcm/Startup.java @@ -488,11 +488,13 @@ public static void startup(Supplier initialTransformation, boole case JOINED: if (StorageService.isReplacingSameAddress()) { + // TODO (required): we need to support a mode that changes the NodeId when replacing the same address + // for accord transaction safety and for Mutation Tracking shard sealing if (DatabaseDescriptor.getAccordTransactionsEnabled()) - { - // TODO (required): we need to support a mode that changes the NodeId when replacing the same address for accord transaction safety throw new IllegalStateException("Cannot replace same address when 
accord transactions are enabled."); - } + + if (MutationTrackingService.isEnabled()) + throw new IllegalStateException("Cannot replace same address with mutation tracking enabled."); ReplaceSameAddress.streamData(self, metadata, shouldBootstrap, finishJoiningRing); } diff --git a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java index d2909bd8737b..fa1f70fcc268 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java +++ b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndJoin.java @@ -48,6 +48,8 @@ import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.repair.autorepair.AutoRepairUtils; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.replication.SealingCoordinator; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.service.accord.AccordService; @@ -300,6 +302,10 @@ public SequenceState executeNext() return continuable(); } ClusterMetadataService.instance().ensureCMSPlacement(metadata); + + // see the comments for SealingCoordinator.collectAndSealStartJoinShards() + if (MutationTrackingService.isEnabled()) + SealingCoordinator.collectAndSealStartJoinShards(metadata, finishJoin.delta()); break; default: return error(new IllegalStateException("Can't proceed with join from " + next)); @@ -380,6 +386,15 @@ public static boolean bootstrap(final Collection tokens, } StorageService.instance.repairPaxosForTopologyChange("bootstrap"); + + if (MutationTrackingService.isEnabled()) + { + if (beingReplaced == null) + SealingCoordinator.discoverAndSealBootstrapObsoletedShards(metadata, movements); + else + SealingCoordinator.discoverAndSealReplacementObsoletedShards(metadata, movements, beingReplaced); + } + List> bootstraps = new ArrayList<>(); if 
(AccordService.instance().isEnabled()) { diff --git a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java index 32afe234916d..84231fe12120 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java +++ b/src/java/org/apache/cassandra/tcm/sequences/BootstrapAndReplace.java @@ -46,6 +46,8 @@ import org.apache.cassandra.locator.EndpointsByReplica; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.repair.autorepair.AutoRepairUtils; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.replication.SealingCoordinator; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.tcm.ClusterMetadata; @@ -290,6 +292,12 @@ public SequenceState executeNext() } ClusterMetadataService.instance().ensureCMSPlacement(metadata); + // Seal the intermediate START_REPLACE shards now obsoleted by FINISH_REPLACE, mirroring the + // post-FINISH_JOIN seal in BootstrapAndJoin. The replaced node is dead and already removed, so it is + // excluded from the seal and the survivors (+ this replacement) reconcile among themselves. 
+ if (MutationTrackingService.isEnabled()) + SealingCoordinator.collectAndSealStartReplaceShards(metadata, finishReplace.delta(), startReplace.replaced()); + break; default: return error(new IllegalStateException("Can't proceed with replacement from " + next)); diff --git a/src/java/org/apache/cassandra/tcm/sequences/SingleNodeSequences.java b/src/java/org/apache/cassandra/tcm/sequences/SingleNodeSequences.java index 3a2fd2221a31..bda9a3e1cbcf 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/SingleNodeSequences.java +++ b/src/java/org/apache/cassandra/tcm/sequences/SingleNodeSequences.java @@ -30,6 +30,7 @@ import org.apache.cassandra.dht.Token; import org.apache.cassandra.gms.Gossiper; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.replication.MutationTrackingService; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; @@ -136,6 +137,10 @@ static void removeNode(NodeId toRemove, boolean force) if (metadata.inProgressSequences.contains(toRemove)) throw new UnsupportedOperationException("Can not remove a node that has an in-progress sequence"); + // TODO (required): mutation tracking shard sealing for removenode (dead participant) is not yet implemented + if (MutationTrackingService.isEnabled()) + throw new IllegalStateException("Cannot removenode with mutation tracking enabled."); + ReconfigureCMS.maybeReconfigureCMS(metadata, endpoint); logger.info("starting removenode with {} {}", metadata.epoch, toRemove); @@ -169,6 +174,10 @@ static void move(Token newToken) if (ClusterMetadata.current().tokenMap.tokens().contains(newToken)) throw new IllegalArgumentException(String.format("target token %s is already owned by another node.", newToken)); + // TODO (required): mutation tracking shard sealing for move is not yet implemented + if (MutationTrackingService.isEnabled()) + throw new IllegalStateException("Cannot move a 
node with mutation tracking enabled."); + // address of the current node ClusterMetadata metadata = ClusterMetadata.current(); NodeId self = metadata.myNodeId(); diff --git a/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapAndLeave.java b/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapAndLeave.java index da8ae74e6a34..f27c083b3245 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapAndLeave.java +++ b/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapAndLeave.java @@ -34,6 +34,8 @@ import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.locator.DynamicEndpointSnitch; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.replication.SealingCoordinator; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; @@ -211,9 +213,10 @@ public SequenceState executeNext() } break; case FINISH_LEAVE: + ClusterMetadata postFinish; try { - ClusterMetadataService.instance().commit(finishLeave); + postFinish = ClusterMetadataService.instance().commit(finishLeave); StorageService.instance.clearTransientMode(); } catch (Throwable t) @@ -221,6 +224,15 @@ public SequenceState executeNext() JVMStabilityInspector.inspectThrowable(t); return continuable(); } + + // Seal the shards obsoleted by FINISH_LEAVE (intermediate START_LEAVE shards and any shard + // obsoleted by the range merge), mirroring SealingCoordinator.collectAndSealStartJoinShards. + // Kept outside the commit try/catch (like the join side) so a seal failure propagates rather than + // re-running FINISH_LEAVE and re-committing the already-committed finishLeave. + // Decommission only: removenode/assassinate may run with down participants, which the happy-path + // seal cannot handle, and the leaving node hasn't streamed its data to the survivors. 
+ if (MutationTrackingService.isEnabled() && streams.kind() == LeaveStreams.Kind.UNBOOTSTRAP) + SealingCoordinator.collectAndSealFinishLeaveShards(postFinish, finishLeave.delta()); break; default: return error(new IllegalStateException("Can't proceed with leave from " + next)); diff --git a/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapStreams.java b/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapStreams.java index e62cf77dfa30..b300caa8c314 100644 --- a/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapStreams.java +++ b/src/java/org/apache/cassandra/tcm/sequences/UnbootstrapStreams.java @@ -39,6 +39,8 @@ import org.apache.cassandra.locator.RangesByEndpoint; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.locator.SystemStrategy; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.replication.SealingCoordinator; import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.service.StorageService; @@ -71,7 +73,7 @@ public void execute(NodeId leaving, PlacementDeltas startLeave, PlacementDeltas started.set(true); try { - unbootstrap(Schema.instance.getNonLocalStrategyKeyspaces(), movements); + unbootstrap(Schema.instance.getNonLocalStrategyKeyspaces(), movements, startLeave); } catch (ExecutionException e) { @@ -111,24 +113,30 @@ private static MovementMap movementMap(InetAddressAndPort leaving, PlacementDelt return allMovements.build(); } - private static void unbootstrap(Keyspaces keyspaces, MovementMap movements) throws ExecutionException, InterruptedException + private static void unbootstrap(Keyspaces keyspaces, MovementMap movements, PlacementDeltas startLeave) throws ExecutionException, InterruptedException { Supplier> startStreaming = prepareUnbootstrapStreaming(keyspaces, movements); StorageService.instance.repairPaxosForTopologyChange("decommission"); + logger.info("replaying batch log and streaming data to other 
nodes"); // Start with BatchLog replay, which may create hints but no writes since this is no longer a valid endpoint. Future batchlogReplay = BatchlogManager.instance.startBatchlogReplay(); - Future streamSuccess = startStreaming.get(); // Wait for batch log to complete before streaming hints. logger.debug("waiting for batch log processing."); batchlogReplay.get(); logger.info("streaming hints to other nodes"); - Future hintsSuccess = StorageService.instance.streamHints(); + // Seal the shards obsoleted by START_LEAVE before streaming this node's data away, mirroring the + // pre-streaming seal that BootstrapAndJoin.bootstrap() performs for joins. + if (MutationTrackingService.isEnabled()) + SealingCoordinator.collectAndSealDecommissionObsoletedShards(ClusterMetadata.current(), startLeave); + + Future streamSuccess = startStreaming.get(); + // wait for the transfer runnables to signal the latch. logger.debug("waiting for stream acks."); streamSuccess.get(); diff --git a/src/java/org/apache/cassandra/tcm/transformations/Assassinate.java b/src/java/org/apache/cassandra/tcm/transformations/Assassinate.java index f1bb6d50d317..86c369123997 100644 --- a/src/java/org/apache/cassandra/tcm/transformations/Assassinate.java +++ b/src/java/org/apache/cassandra/tcm/transformations/Assassinate.java @@ -25,6 +25,7 @@ import org.apache.cassandra.dht.Token; import org.apache.cassandra.gms.Gossiper; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.replication.MutationTrackingService; import org.apache.cassandra.tcm.ClusterMetadata; import org.apache.cassandra.tcm.ClusterMetadataService; import org.apache.cassandra.tcm.Epoch; @@ -65,6 +66,10 @@ public static void assassinateEndpoint(InetAddressAndPort endpoint) if (!metadata.directory.isRegistered(endpoint)) return; + // TODO (required): mutation tracking shard sealing for assassinate (dead participant) is not yet implemented + if (MutationTrackingService.isEnabled()) + throw new 
IllegalStateException("Cannot assassinate a node with mutation tracking enabled."); + ReconfigureCMS.maybeReconfigureCMS(metadata, endpoint); NodeId nodeId = metadata.directory.peerId(endpoint); diff --git a/test/distributed/org/apache/cassandra/distributed/test/TrackedBootstrapTest.java b/test/distributed/org/apache/cassandra/distributed/test/TrackedBootstrapTest.java new file mode 100644 index 000000000000..c299e6495e5d --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/TrackedBootstrapTest.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.distributed.test; + +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import java.util.concurrent.locks.LockSupport; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.Constants; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.IInstanceConfig; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.shared.NetworkTopology; + +import static java.lang.String.format; +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * Bootstrap of a tracked keyspace (single-token, no vnodes). Exercises {@code BootstrapAndJoin.bootstrap()} + * including the (happy-path) sealing of shards obsoleted by the join, and asserts the new node joins the ring. + *

+ */ +public class TrackedBootstrapTest extends TestBaseImpl +{ + /** + * Keyspace / table used across all tests. Each test method uses its own {@link Cluster} + * instance, so re-using the same names is fine. + */ + private static final String KEYSPACE = "tracked_bootstrap_ks"; + private static final String TABLE = "tbl"; + + @Test + public void bootstrapSealsObsoletedShards() throws Throwable + { + bootstrapAndSeal(3, 3); + } + + @Test + public void bootstrapSealsObsoletedShards_largerRing() throws Throwable + { + bootstrapAndSeal(5, 3); + } + + /** + * Start {@code initialNodes} nodes (single token each, evenly distributed over a ring sized for + * {@code initialNodes + 1} nodes), create an RF={@code rf} tracked keyspace, then bootstrap one more node. + */ + private void bootstrapAndSeal(int initialNodes, int rf) throws Throwable + { + int expandedNodes = initialNodes + 1; + // Evenly distribute single tokens across the full Murmur3 ring so each node owns a real, balanced + // fraction of the token space (and the joining node actually acquires a substantial range with data). + TokenSupplier tokenSupplier = TokenSupplier.evenlyDistributedTokens(expandedNodes, 1); + try (Cluster cluster = builder().withNodes(initialNodes) + .withTokenSupplier(tokenSupplier) + .withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(expandedNodes, "dc0", "rack0")) + .withConfig(config -> config.with(NETWORK, GOSSIP) + .set("num_tokens", 1)) + .start()) + { + // RF tracked keyspace (mutation tracking is enabled by default in jvm-dtest) + cluster.schemaChange(format("CREATE KEYSPACE %s WITH replication = {'class':'SimpleStrategy','replication_factor':%d} AND replication_type='tracked'", + KEYSPACE, rf)); + cluster.schemaChange(format("CREATE TABLE %s.%s (pk int PRIMARY KEY, v int)", KEYSPACE, TABLE)); + + // Wait until every initial node sees the whole ring as UP + NORMAL before issuing writes, so the + // tracked-write forwarding paths see all replicas alive rather than racing startup/gossip.
+ for (int n = 1; n <= initialNodes; n++) + ClusterUtils.awaitRingHealthy(cluster.get(n)); + + // Pre-bootstrap writes: exercise the regular and Paxos (LWT) tracked-write chokepoints. + // (Counters are intentionally omitted: with RF < N the tracked counter-leader forward path + // -- ForwardedWrite.forwardCounterMutationInternal -> findCounterLeaderReplica -- fails to + // resolve a remote leader; tracked-counter forwarding is a separate, pre-existing issue.) + int rows = 1000; + // Mirror the write operations (in order) to derive the expected final state for read-back. + Map expected = new ConcurrentHashMap<>(); + for (int i = 0; i < rows; i++) + { + // regular tracked write -> TrackedWriteRequest.perform (regular branch) + cluster.coordinator(1).execute(format("INSERT INTO %s.%s (pk, v) VALUES (?, ?)", KEYSPACE, TABLE), + ConsistencyLevel.QUORUM, i, i); + expected.put(i, i); + // LWT -> Paxos tracked commit path (StorageProxy.commitPaxosTracked) + cluster.coordinator(1).execute(format("INSERT INTO %s.%s (pk, v) VALUES (?, ?) IF NOT EXISTS", KEYSPACE, TABLE), + ConsistencyLevel.QUORUM, 100 + i, i); + expected.putIfAbsent(100 + i, i); + } + + IInstanceConfig config = cluster.newInstanceConfig() + .set("auto_bootstrap", true) + .set(Constants.KEY_DTEST_FULL_STARTUP, true); + IInvokableInstance newNode = cluster.bootstrap(config); + + // Keep client writes running in the background for the whole duration of the bootstrap, through the + // existing (always-alive) coordinators. The joining node must obtain these writes -- already-acked ones + // by streaming from the seeded replicas, in-flight ones via the write set it joins while BOOTSTRAPPING -- + // and the obsoleted shards must still reconcile and seal. Only acked keys are recorded, using a key range + // disjoint from the baseline writes so the read-back is exact. 
+ int concurrentBase = 1_000_000; + AtomicBoolean done = new AtomicBoolean(false); + AtomicInteger nextKey = new AtomicInteger(concurrentBase); + AtomicReference bgError = new AtomicReference<>(); + Thread writer = new Thread(() -> { + while (!done.get()) + { + int k = nextKey.getAndIncrement(); + // Writes can race the in-flight topology change and hit a retryable "ring has changed" error; + // retry the (idempotent) insert until it is acknowledged, as a real client would. + for (int attempt = 1; ; attempt++) + { + try + { + cluster.coordinator((k % 2) + 1) + .execute(format("INSERT INTO %s.%s (pk, v) VALUES (?, ?)", KEYSPACE, TABLE), + ConsistencyLevel.QUORUM, k, k); + expected.put(k, k); + break; + } + catch (Throwable t) + { + if (done.get()) + return; // shutting down; drop this in-flight write + if (attempt >= 100) + { + bgError.set(t); + return; + } + LockSupport.parkNanos(20_000_000L); // 20ms backoff, then retry the same key + } + } + } + }, "background-writer"); + writer.start(); + + newNode.startup(cluster); + + ClusterUtils.awaitRingJoin(cluster.get(1), newNode); + ClusterUtils.awaitRingJoin(newNode, cluster.get(1)); + + // Stop background writes and surface any failure. + done.set(true); + writer.join(); + if (bgError.get() != null) + throw new AssertionError("Background writes failed during bootstrap", bgError.get()); + assertTrue("expected concurrent writes to run during the bootstrap", nextKey.get() > concurrentBase); + + // Read the full dataset back at QUORUM, both through an existing replica and through the freshly + // bootstrapped node, exercising the tracked read path across the post-seal topology. + assertAllRows(cluster.coordinator(1), expected); + assertAllRows(newNode.coordinator(), expected); + } + } + + /** Read the whole table at QUORUM through {@code coordinator} and assert it matches {@code expected}. 
*/ + private static void assertAllRows(ICoordinator coordinator, Map expected) + { + Object[][] result = coordinator.execute(format("SELECT pk, v FROM %s.%s", KEYSPACE, TABLE), + ConsistencyLevel.QUORUM); + Map actual = new HashMap<>(result.length); + for (Object[] row : result) + actual.put((Integer) row[0], (Integer) row[1]); + assertEquals("unexpected row count", expected.size(), actual.size()); + assertEquals("row contents differ", expected, actual); + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/TrackedDecommissionTest.java b/test/distributed/org/apache/cassandra/distributed/test/TrackedDecommissionTest.java new file mode 100644 index 000000000000..9d4a1319d35a --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/TrackedDecommissionTest.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.distributed.test; + +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import java.util.concurrent.locks.LockSupport; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.shared.NetworkTopology; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.replication.Shard; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.membership.NodeState; + +import static java.lang.String.format; +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * Decommission of a node in a tracked keyspace (single-token, no vnodes), exercising the leave-side shard sealing + * in {@link org.apache.cassandra.replication.SealingCoordinator}: + *
<ul> + *
<li>{@code collectAndSealDecommissionObsoletedShards} (pre-streaming, from {@code UnbootstrapStreams}) seals the pre-leave + * shards over the handed-off ranges, and</li>
+ *
<li>{@code collectAndSealFinishLeaveShards} (post-FINISH_LEAVE, from {@code UnbootstrapAndLeave}) seals the + * intermediate over-replicated shards created during START_LEAVE plus any shard obsoleted only by the + * range merge.</li>
+ * </ul>
+ * Uses human-readable single tokens (node N -> token N*100) so the affected ranges and merges are easy to follow.
+ */
+public class TrackedDecommissionTest extends TestBaseImpl
+{
+    private static final String KEYSPACE = "tracked_decommission_ks";
+    private static final String TABLE = "tbl";
+
+    private static final int NODES = 7;
+    private static final int RF = 3;
+    private static final int DECOMMISSION_TARGET = 4; // node 4, token 400
+
+    // Background-writer retry policy: give up on a key after this many failed attempts, pausing between attempts.
+    private static final int MAX_WRITE_ATTEMPTS = 100;
+    private static final long WRITE_RETRY_BACKOFF_NANOS = 20_000_000L; // 20ms
+
+    /**
+     * Decommissions node {@link #DECOMMISSION_TARGET} while a background thread keeps writing through nodes 1 and 2,
+     * then checks that (a) the background writer hit no unrecoverable error, (b) no surviving node still holds an
+     * unsealed shard of {@link #KEYSPACE} that lists a non-JOINED node as a participant, and (c) every acknowledged
+     * row can be read back at QUORUM.
+     */
+    @Test
+    public void decommissionSealsObsoletedShards() throws Throwable
+    {
+        // node N gets the single human-readable token N*100: 100,200,300,400,500,600,700
+        TokenSupplier tokenSupplier = (TokenSupplier) i -> Collections.singleton(String.valueOf(i * 100));
+
+        try (Cluster cluster = builder().withNodes(NODES)
+                                        .withTokenSupplier(tokenSupplier)
+                                        .withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(NODES, "dc0", "rack0"))
+                                        .withConfig(config -> config.with(NETWORK, GOSSIP)
+                                                                    .set("num_tokens", 1))
+                                        .start())
+        {
+            cluster.schemaChange(format("CREATE KEYSPACE %s WITH replication = {'class':'SimpleStrategy','replication_factor':%d} AND replication_type='tracked'",
+                                        KEYSPACE, RF));
+            cluster.schemaChange(format("CREATE TABLE %s.%s (pk int PRIMARY KEY, v int)", KEYSPACE, TABLE));
+
+            for (int n = 1; n <= NODES; n++)
+                ClusterUtils.awaitRingHealthy(cluster.get(n));
+
+            // Baseline data written before the decommission begins.
+            int rows = 200;
+            Map<Integer, Integer> expected = new ConcurrentHashMap<>();
+            for (int i = 0; i < rows; i++)
+            {
+                cluster.coordinator(1).execute(format("INSERT INTO %s.%s (pk, v) VALUES (?, ?)", KEYSPACE, TABLE),
+                                               ConsistencyLevel.QUORUM, i, i);
+                expected.put(i, i);
+            }
+
+            // Keep client writes running in the background for the whole duration of the decommission, through
+            // surviving coordinators (nodes 1 and 2; the leaving node is node DECOMMISSION_TARGET). The obsoleted
+            // shards must still reconcile and seal despite the in-flight writes and the departing participant. Only
+            // acked keys are recorded, continuing past the baseline range so the read-back is exact.
+            AtomicBoolean done = new AtomicBoolean(false);
+            AtomicInteger nextKey = new AtomicInteger(rows);
+            AtomicReference<Throwable> bgError = new AtomicReference<>();
+            Thread writer = startBackgroundWriter(cluster, expected, nextKey, done, bgError);
+            try
+            {
+                cluster.get(DECOMMISSION_TARGET).nodetoolResult("decommission", "--force").asserts().success();
+            }
+            finally
+            {
+                // Always stop and reap the writer, even when the decommission assertion fails, so it cannot keep
+                // issuing writes against a cluster that is being closed.
+                done.set(true);
+                writer.join();
+            }
+
+            // Surface any background failure.
+            if (bgError.get() != null)
+                throw new AssertionError("Background writes failed during decommission", bgError.get());
+            // Count acknowledged writes rather than handed-out keys: nextKey advances before a write is acked, so
+            // it would pass even if no concurrent write ever succeeded. Background keys start at `rows`, so any
+            // growth of `expected` beyond the baseline is an acked concurrent write.
+            assertTrue("expected concurrent writes to run during the decommission", expected.size() > rows);
+
+            assertNoUnsealedShardReferencesDepartedNode(cluster);
+
+            // Data is fully preserved across the decommission, read back at QUORUM through a surviving node.
+            assertAllRows(cluster.coordinator(1), expected);
+        }
+    }
+
+    /**
+     * Starts a daemon thread that inserts consecutive keys (taken from {@code nextKey}) at QUORUM, alternating
+     * between coordinators 1 and 2, until {@code done} is set. A key is added to {@code expected} only after its
+     * insert returns without throwing. A failing insert is retried for the same key; after
+     * {@link #MAX_WRITE_ATTEMPTS} failures the last error is stored in {@code bgError} and the thread exits.
+     */
+    private static Thread startBackgroundWriter(Cluster cluster,
+                                                Map<Integer, Integer> expected,
+                                                AtomicInteger nextKey,
+                                                AtomicBoolean done,
+                                                AtomicReference<Throwable> bgError)
+    {
+        String insert = format("INSERT INTO %s.%s (pk, v) VALUES (?, ?)", KEYSPACE, TABLE);
+        Thread writer = new Thread(() -> {
+            while (!done.get())
+            {
+                int k = nextKey.getAndIncrement();
+                // Writes can race the in-flight topology change and hit a retryable "ring has changed" error;
+                // retry the (idempotent) insert until it is acknowledged, as a real client would.
+                for (int attempt = 1; ; attempt++)
+                {
+                    try
+                    {
+                        cluster.coordinator((k % 2) + 1).execute(insert, ConsistencyLevel.QUORUM, k, k);
+                        expected.put(k, k);
+                        break;
+                    }
+                    catch (Throwable t)
+                    {
+                        if (done.get())
+                            return; // shutting down; drop this in-flight write
+                        if (attempt >= MAX_WRITE_ATTEMPTS)
+                        {
+                            bgError.set(t);
+                            return;
+                        }
+                        LockSupport.parkNanos(WRITE_RETRY_BACKOFF_NANOS); // back off, then retry the same key
+                    }
+                }
+            }
+        }, "background-writer");
+        writer.setDaemon(true); // never keep the test JVM alive on a stuck write
+        writer.start();
+        return writer;
+    }
+
+    /**
+     * Correctness invariant after decommission: no surviving node may keep an *unsealed* shard that still
+     * lists the departed node as a participant. Such a shard could never finish background reconciliation
+     * (it would wait forever on the departed participant), so the leave-side sealing must have reconciled
+     * and sealed all of them (the intermediate START_LEAVE shards). Shards obsoleted only by the range merge
+     * keep only live participants and are allowed to remain unsealed (they reconcile on their own).
+     */
+    private static void assertNoUnsealedShardReferencesDepartedNode(Cluster cluster)
+    {
+        for (int n = 1; n <= NODES; n++)
+        {
+            if (n == DECOMMISSION_TARGET)
+                continue;
+            cluster.get(n).runOnInstance(() -> {
+                // Only JOINED nodes count as members. The decommissioned node lingers in the directory in state
+                // LEFT (still in peers, removed only at a later unregister), so peerIds() would still include it
+                // and rob this check of teeth; filter to JOINED so an unsealed shard still listing the departed
+                // node is actually flagged.
+                ClusterMetadata cm = ClusterMetadata.current();
+                Set<Integer> members = new HashSet<>();
+                for (NodeId id : cm.directory.peerIds())
+                    if (cm.directory.peerState(id) == NodeState.JOINED)
+                        members.add(id.id());
+                for (Shard shard : MutationTrackingService.instance().getShards())
+                {
+                    if (!shard.keyspace.equals(KEYSPACE) || shard.isSealed())
+                        continue;
+                    for (int participant : shard.participants.asSet())
+                        if (!members.contains(participant))
+                            throw new AssertionError(format("Unsealed shard %s@%d references departed node %d (participants %s)",
+                                                            shard.range, shard.sinceEpoch, participant, shard.participants));
+                }
+            });
+        }
+    }
+
+    /**
+     * Reads the whole table at QUORUM through {@code coordinator} and asserts that the (pk, v) pairs returned are
+     * exactly {@code expected}.
+     */
+    private static void assertAllRows(ICoordinator coordinator, Map<Integer, Integer> expected)
+    {
+        Object[][] result = coordinator.execute(format("SELECT pk, v FROM %s.%s", KEYSPACE, TABLE),
+                                                ConsistencyLevel.QUORUM);
+        Map<Integer, Integer> actual = new HashMap<>(result.length);
+        for (Object[] row : result)
+            actual.put((Integer) row[0], (Integer) row[1]);
+        assertEquals("unexpected row count", expected.size(), actual.size());
+        assertEquals("row contents differ", expected, actual);
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/TrackedReplacementTest.java b/test/distributed/org/apache/cassandra/distributed/test/TrackedReplacementTest.java
new file mode 100644
index 000000000000..6e9e99033bca
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/TrackedReplacementTest.java
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.cassandra.distributed.test; + +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import java.util.concurrent.locks.LockSupport; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.ICoordinator; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.shared.ClusterUtils; +import org.apache.cassandra.distributed.shared.NetworkTopology; +import org.apache.cassandra.replication.MutationTrackingService; +import org.apache.cassandra.replication.Shard; +import org.apache.cassandra.tcm.ClusterMetadata; +import org.apache.cassandra.tcm.membership.NodeId; +import org.apache.cassandra.tcm.membership.NodeState; + +import static java.lang.String.format; +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * Host replacement of a (dead) node in a tracked keyspace (single-token, no vnodes), exercising the replace-side + * shard sealing in {@link org.apache.cassandra.replication.SealingCoordinator}. + *

+ * A replacement reuses the dead node's token, so the ring layout is unchanged: only the replica membership of the + * dead node's ranges shifts from the dead node to the (new-NodeId) replacement. The shards over those ranges that + * still list the dead node as a participant are obsoleted by the replacement and must be sealed, mirroring the + * join- and leave-side sealing covered by {@link TrackedBootstrapTest} and {@link TrackedDecommissionTest}. + *

+ * Only same-cluster, different-address replacement is supported (same-address replacement is rejected at startup + * when mutation tracking is enabled). + *

+ * Uses human-readable single tokens (node N -> token N*100) so the affected ranges are easy to follow.
+ */
+public class TrackedReplacementTest extends TestBaseImpl
+{
+    private static final String KEYSPACE = "tracked_replacement_ks";
+    private static final String TABLE = "tbl";
+
+    private static final int NODES = 7;
+    private static final int RF = 3;
+    private static final int REPLACE_TARGET = 4; // node 4, token 400 (a node in the middle of the ring)
+
+    // Background-writer retry policy: give up on a key after this many failed attempts, pausing between attempts.
+    private static final int MAX_WRITE_ATTEMPTS = 100;
+    private static final long WRITE_RETRY_BACKOFF_NANOS = 20_000_000L; // 20ms
+
+    /**
+     * Stops node {@link #REPLACE_TARGET}, replaces it while a background thread keeps writing through nodes 1 and 2,
+     * then checks that (a) the background writer hit no unrecoverable error, (b) the replacement reports the same
+     * tokens as the stopped node, (c) no surviving node still holds an unsealed shard of {@link #KEYSPACE} that lists
+     * a non-JOINED node as a participant, and (d) every acknowledged row can be read back at QUORUM through both a
+     * survivor and the replacement.
+     */
+    @Test
+    public void replacementSealsObsoletedShards() throws Throwable
+    {
+        // node N gets the single human-readable token N*100: 100,200,300,400,500,600,700
+        TokenSupplier tokenSupplier = (TokenSupplier) i -> Collections.singleton(String.valueOf(i * 100));
+
+        try (Cluster cluster = builder().withNodes(NODES)
+                                        .withTokenSupplier(tokenSupplier)
+                                        .withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(NODES, "dc0", "rack0"))
+                                        .withConfig(config -> config.with(NETWORK, GOSSIP)
+                                                                    .set("num_tokens", 1))
+                                        .start())
+        {
+            cluster.schemaChange(format("CREATE KEYSPACE %s WITH replication = {'class':'SimpleStrategy','replication_factor':%d} AND replication_type='tracked'",
+                                        KEYSPACE, RF));
+            cluster.schemaChange(format("CREATE TABLE %s.%s (pk int PRIMARY KEY, v int)", KEYSPACE, TABLE));
+
+            for (int n = 1; n <= NODES; n++)
+                ClusterUtils.awaitRingHealthy(cluster.get(n));
+
+            // Baseline data written before the replacement begins.
+            int baseline = 200;
+            Map<Integer, Integer> expected = new ConcurrentHashMap<>();
+            for (int i = 0; i < baseline; i++)
+            {
+                cluster.coordinator(1).execute(format("INSERT INTO %s.%s (pk, v) VALUES (?, ?)", KEYSPACE, TABLE),
+                                               ConsistencyLevel.QUORUM, i, i);
+                expected.put(i, i);
+            }
+
+            IInvokableInstance victim = cluster.get(REPLACE_TARGET);
+            Set<String> victimTokens = getNodeTokens(victim);
+            ClusterUtils.stopUnchecked(victim);
+
+            // Keep client writes running in the background for the whole duration of the replacement (victim down ->
+            // streaming -> finished), through always-alive surviving coordinators (nodes 1 and 2; the victim is node
+            // REPLACE_TARGET). The replacement must obtain these writes -- already-acked ones by streaming from the
+            // survivors, in-flight ones via the over-replicated write set it joins while BOOT_REPLACING -- and the
+            // obsoleted shards must still reconcile and seal despite the dead participant. Only keys the coordinator
+            // acknowledged are recorded as expected, so the post-replacement read-back is exact.
+            AtomicBoolean done = new AtomicBoolean(false);
+            AtomicInteger nextKey = new AtomicInteger(baseline);
+            AtomicReference<Throwable> bgError = new AtomicReference<>();
+            Thread writer = startBackgroundWriter(cluster, expected, nextKey, done, bgError);
+
+            IInvokableInstance replacement;
+            try
+            {
+                // Replace the dead node (different-address, token reused from the victim's config).
+                replacement = ClusterUtils.replaceHostAndStart(cluster, victim);
+                ClusterUtils.awaitRingJoin(cluster.get(1), replacement);
+                ClusterUtils.awaitRingJoin(replacement, cluster.get(1));
+            }
+            finally
+            {
+                // Always stop and reap the writer, even when the replacement fails, so it cannot keep issuing
+                // writes against a cluster that is being closed.
+                done.set(true);
+                writer.join();
+            }
+
+            // Surface any background failure.
+            if (bgError.get() != null)
+                throw new AssertionError("Background writes failed during replacement", bgError.get());
+            // Count acknowledged writes rather than handed-out keys: nextKey advances before a write is acked, so
+            // it would pass even if no concurrent write ever succeeded. Background keys start at `baseline`, so any
+            // growth of `expected` beyond the baseline is an acked concurrent write.
+            assertTrue("expected concurrent writes to run during the replacement", expected.size() > baseline);
+
+            // The replacement reuses the dead node's tokens.
+            assertEquals("replacement should own the replaced node's tokens", victimTokens, getNodeTokens(replacement));
+
+            assertNoUnsealedShardReferencesDepartedNode(cluster);
+
+            // All baseline + concurrent writes are preserved, read back at QUORUM through a surviving node and
+            // through the freshly started replacement node.
+            assertAllRows(cluster.coordinator(1), expected);
+            assertAllRows(replacement.coordinator(), expected);
+        }
+    }
+
+    /**
+     * Starts a daemon thread that inserts consecutive keys (taken from {@code nextKey}) at QUORUM, alternating
+     * between coordinators 1 and 2, until {@code done} is set. A key is added to {@code expected} only after its
+     * insert returns without throwing. A failing insert is retried for the same key; after
+     * {@link #MAX_WRITE_ATTEMPTS} failures the last error is stored in {@code bgError} and the thread exits.
+     */
+    private static Thread startBackgroundWriter(Cluster cluster,
+                                                Map<Integer, Integer> expected,
+                                                AtomicInteger nextKey,
+                                                AtomicBoolean done,
+                                                AtomicReference<Throwable> bgError)
+    {
+        String insert = format("INSERT INTO %s.%s (pk, v) VALUES (?, ?)", KEYSPACE, TABLE);
+        Thread writer = new Thread(() -> {
+            while (!done.get())
+            {
+                int k = nextKey.getAndIncrement();
+                // Writes can race the in-flight topology change and hit a retryable "ring has changed" error;
+                // retry the (idempotent) insert until it is acknowledged, as a real client would.
+                for (int attempt = 1; ; attempt++)
+                {
+                    try
+                    {
+                        cluster.coordinator((k % 2) + 1).execute(insert, ConsistencyLevel.QUORUM, k, k);
+                        expected.put(k, k);
+                        break;
+                    }
+                    catch (Throwable t)
+                    {
+                        if (done.get())
+                            return; // shutting down; drop this in-flight write
+                        if (attempt >= MAX_WRITE_ATTEMPTS)
+                        {
+                            bgError.set(t);
+                            return;
+                        }
+                        LockSupport.parkNanos(WRITE_RETRY_BACKOFF_NANOS); // back off, then retry the same key
+                    }
+                }
+            }
+        }, "background-writer");
+        writer.setDaemon(true); // never keep the test JVM alive on a stuck write
+        writer.start();
+        return writer;
+    }
+
+    /**
+     * Correctness invariant after replacement: no surviving node may keep an *unsealed* shard that still lists a
+     * departed (non-member) node as a participant. The replaced node is removed from the directory at FINISH_REPLACE,
+     * so its id is no longer a member; any pre-replace shard over its ranges is obsoleted and could never finish
+     * background reconciliation (it would wait forever on the departed participant), so the replace-side sealing must
+     * have reconciled and sealed all of them.
+     */
+    private static void assertNoUnsealedShardReferencesDepartedNode(Cluster cluster)
+    {
+        for (int n = 1; n <= NODES; n++)
+        {
+            if (n == REPLACE_TARGET)
+                continue; // the dead node's slot; the replacement is a separate instance
+            cluster.get(n).runOnInstance(() -> {
+                // Only JOINED nodes count as members. The replaced node may linger in the directory briefly;
+                // filter to JOINED so an unsealed shard still listing a departed node is actually flagged.
+                ClusterMetadata cm = ClusterMetadata.current();
+                Set<Integer> members = new HashSet<>();
+                for (NodeId id : cm.directory.peerIds())
+                    if (cm.directory.peerState(id) == NodeState.JOINED)
+                        members.add(id.id());
+                for (Shard shard : MutationTrackingService.instance().getShards())
+                {
+                    if (!shard.keyspace.equals(KEYSPACE) || shard.isSealed())
+                        continue;
+                    for (int participant : shard.participants.asSet())
+                        if (!members.contains(participant))
+                            throw new AssertionError(format("Unsealed shard %s@%d references departed node %d (participants %s)",
+                                                            shard.range, shard.sinceEpoch, participant, shard.participants));
+                }
+            });
+        }
+    }
+
+    /**
+     * Returns the string form of every token that {@code node}'s current cluster metadata assigns to that node.
+     */
+    private static Set<String> getNodeTokens(IInvokableInstance node)
+    {
+        return node.callOnInstance(() -> {
+            ClusterMetadata metadata = ClusterMetadata.current();
+            Set<String> tokens = new HashSet<>();
+            metadata.tokenMap.tokens(metadata.myNodeId()).forEach(t -> tokens.add(t.toString()));
+            return tokens;
+        });
+    }
+
+    /**
+     * Reads the whole table at QUORUM through {@code coordinator} and asserts that the (pk, v) pairs returned are
+     * exactly {@code expected}.
+     */
+    private static void assertAllRows(ICoordinator coordinator, Map<Integer, Integer> expected)
+    {
+        Object[][] result = coordinator.execute(format("SELECT pk, v FROM %s.%s", KEYSPACE, TABLE),
+                                                ConsistencyLevel.QUORUM);
+        Map<Integer, Integer> actual = new HashMap<>(result.length);
+        for (Object[] row : result)
+            actual.put((Integer) row[0], (Integer) row[1]);
+        assertEquals("unexpected row count", expected.size(), actual.size());
+        assertEquals("row contents differ", expected, actual);
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/hostreplacement/TrackedHostReplacementTest.java b/test/distributed/org/apache/cassandra/distributed/test/hostreplacement/TrackedHostReplacementTest.java
deleted file mode 100644
index b10bb9ff35f6..000000000000
--- a/test/distributed/org/apache/cassandra/distributed/test/hostreplacement/TrackedHostReplacementTest.java
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -package org.apache.cassandra.distributed.test.hostreplacement; - -import java.util.Arrays; -import java.util.List; -import java.util.Set; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; - -import com.google.common.collect.Lists; - -import org.junit.Assert; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.cassandra.distributed.Cluster; -import org.apache.cassandra.distributed.api.ConsistencyLevel; -import org.apache.cassandra.distributed.api.Feature; -import org.apache.cassandra.distributed.api.ICoordinator; -import org.apache.cassandra.distributed.api.IInvokableInstance; -import org.apache.cassandra.distributed.api.TokenSupplier; -import org.apache.cassandra.distributed.test.TestBaseImpl; -import org.apache.cassandra.replication.MutationJournal; -import org.apache.cassandra.replication.MutationSummary; -import org.apache.cassandra.replication.MutationTrackingService; -import org.apache.cassandra.tcm.ClusterMetadata; - -import static org.apache.cassandra.distributed.shared.AssertUtils.assertRows; -import static org.apache.cassandra.distributed.shared.AssertUtils.row; -import static org.apache.cassandra.distributed.shared.ClusterUtils.awaitRingJoin; -import static org.apache.cassandra.distributed.shared.ClusterUtils.replaceHostAndStart; -import static org.apache.cassandra.distributed.shared.ClusterUtils.stopUnchecked; -import static org.apache.cassandra.distributed.test.tracking.MutationTrackingUtils.assertMatchingSummaryForTable; -import static org.apache.cassandra.distributed.test.tracking.MutationTrackingUtils.summaryForTable; -import static org.assertj.core.api.Assertions.assertThat; - -public class TrackedHostReplacementTest extends TestBaseImpl -{ - private static final Logger logger = LoggerFactory.getLogger(TrackedHostReplacementTest.class); - - private static final String KEYSPACE = "test_ks"; - private static final String TABLE = 
"test_table"; - private static final String QUALIFIED_TABLE_NAME = KEYSPACE + '.' + TABLE; - - private static void pauseLogBroadcasts(Cluster cluster, boolean pause) - { - cluster.stream() - .filter(node -> !node.isShutdown()) - .forEach(node -> node.runOnInstance(() -> { - MutationTrackingService.instance().pauseOffsetBroadcast(pause); - })); - } - - private static void awaitFullReconciliation(Cluster cluster, int ids) throws InterruptedException - { - // await full reconciliation - boolean fullyReconciled = false; - for (int i = 0; i < 20; i++) - { - int attempt = i + 1; - fullyReconciled = cluster.stream().filter(node -> !node.isShutdown()).allMatch(node -> { - MutationSummary summary = summaryForTable(node, KEYSPACE, TABLE); - if (summary.unreconciledIds() == 0) - { - Assert.assertEquals(node.toString(), ids, summary.reconciledIds()); - return true; - } - else - { - logger.info("Not yet fully reconciled (reconciled: {}, unreconciled:{}) - attempt {} summary: {}", summary.reconciledIds(), summary.unreconciledIds(), attempt, summary); - } - return false; - }); - - if (!fullyReconciled) - Thread.sleep(1000); - } - - Assert.assertTrue(fullyReconciled); - } - @Test - public void testBasicTrackedHostReplacement() throws Exception - { - TokenSupplier even = TokenSupplier.evenlyDistributedTokens(3); - try (Cluster cluster = init(Cluster.build(3) - .withConfig(config -> config.with(Feature.NETWORK, Feature.GOSSIP)) - .withTokenSupplier(node -> even.token(node == 4 ? 
3 : node)) - .start())) - { - setupTrackedKeyspace(cluster); - - // write some initial data - writeDataRange(cluster, 0, 10); - awaitFullReconciliation(cluster, 10); - verifyLocalDataContents(cluster, 0, 10); - - IInvokableInstance victimNode = cluster.get(3); - Set victimTokens = getNodeTokens(victimNode); - - // Stop the victim node - stopUnchecked(victimNode); - - // Write more data while victim is down - writeDataRange(cluster, 10, 20); - - // Verify remaining nodes have the new data - verifyLocalDataContents(Arrays.asList(cluster.get(1), cluster.get(2)), 10, 20); - - pauseLogBroadcasts(cluster, true); - MutationSummary expectedSummary = summaryForTable(cluster.get(1), KEYSPACE, TABLE); - Assert.assertEquals(10, expectedSummary.reconciledIds()); - Assert.assertEquals(10, expectedSummary.unreconciledIds()); - - // Replace the node - IInvokableInstance replacementNode = replaceHostAndStart(cluster, victimNode); - - // Wait for replacement to complete - awaitRingJoin(cluster.get(1), replacementNode); - awaitRingJoin(cluster.get(2), replacementNode); - awaitRingJoin(replacementNode, cluster.get(1)); - awaitRingJoin(replacementNode, cluster.get(2)); - - // confirm replacement node took over victim's token ranges - Set replacementTokens = getNodeTokens(replacementNode); - assertThat(replacementTokens).as("Replacement node should have same tokens as victim") - .isEqualTo(victimTokens); - - assertMatchingSummaryForTable(replacementNode, KEYSPACE, TABLE, expectedSummary); - - List remainingNodes = List.of(cluster.get(1), cluster.get(2), replacementNode); - // Verify all nodes have all writes - verifyLocalDataContents(remainingNodes, 0, 20); - - - // unpause id broadcast. 
all nodes should now reach full reconciliation, even though the replica set has changed - pauseLogBroadcasts(cluster, false); - awaitFullReconciliation(cluster, 20); - - // Write new data and verify replacement node handles writes for its ranges - writeDataRange(cluster, 20, 25); - - awaitFullReconciliation(cluster, 25); - verifyLocalDataContents(remainingNodes, 20, 25); - } - } - - /** - * Test host replacement with writes to the cluster during replacement bootstrap. - */ - @Test - public void testTrackedHostReplacementWithOngoingWrites() throws Exception - { - TokenSupplier even = TokenSupplier.evenlyDistributedTokens(3); - try (Cluster cluster = init(Cluster.build(3) - .withConfig(config -> config.with(Feature.NETWORK, Feature.GOSSIP)) - .withTokenSupplier(node -> even.token(node == 4 ? 3 : node)) - .start())) - { - final int numInitialWrites = 10; - setupTrackedKeyspace(cluster); - - // Phase 1: Establish baseline mutation tracking state - writeDataRange(cluster, 0, numInitialWrites); - awaitFullReconciliation(cluster, numInitialWrites); - verifyLocalDataContents(cluster, 0, numInitialWrites); - - // Capture victim's exact mutation tracking state for streaming validation - IInvokableInstance victim = cluster.get(3); - Set victimTokens = getNodeTokens(victim); - - // Phase 2: Stop victim and start replacement bootstrap - AtomicBoolean replacementCompleted = new AtomicBoolean(false); - AtomicInteger totalWrites = new AtomicInteger(numInitialWrites); - - - Thread thread = new Thread(() -> { - while (!replacementCompleted.get()) - { - int key = totalWrites.getAndIncrement(); - writeDataToCluster(cluster.coordinator((key % 2) + 1), key, key * 10); - } - }); - thread.start(); - - stopUnchecked(victim); - IInvokableInstance replacementNode = replaceHostAndStart(cluster, victim); - - // Wait for replacement to complete - awaitRingJoin(cluster.get(1), replacementNode); - awaitRingJoin(cluster.get(2), replacementNode); - awaitRingJoin(replacementNode, cluster.get(1)); - 
awaitRingJoin(replacementNode, cluster.get(2)); - - // confirm replacement node took over victim's token ranges - Set replacementTokens = getNodeTokens(replacementNode); - assertThat(replacementTokens).as("Replacement node should have same tokens as victim") - .isEqualTo(victimTokens); - - // stop concurrent writes - replacementCompleted.set(true); - thread.join(); - - if (totalWrites.get() == numInitialWrites) - throw new AssertionError("No concurrent writes were performed during replacement"); - - logger.info("Total writes performed: {} ", totalWrites.get()); - - List remainingNodes = List.of(cluster.get(1), cluster.get(2), replacementNode); - - // wait for all nodes to reach full reconciliation and verify data - awaitFullReconciliation(cluster, totalWrites.get()); - verifyLocalDataContents(remainingNodes, 0, totalWrites.get()); - } - } - - @Test - public void testTrackedHostReplacementWithLargeDataSet() throws Exception - { - TokenSupplier even = TokenSupplier.evenlyDistributedTokens(3); - try (Cluster cluster = init(Cluster.build(3) - .withConfig(config -> config.with(Feature.NETWORK, Feature.GOSSIP)) - .withTokenSupplier(node -> even.token(node == 4 ? 
3 : node)) - .start())) - { - setupTrackedKeyspace(cluster); - - // Phase 1: Create initial SSTable generation (Generation 1) - writeDataRange(cluster, 0, 30); - flushAllNodes(cluster); // Creates first SSTable generation - advanceMutationLogSegment(cluster); - - // Phase 2: Add data and create overlapping SSTable generation (Generation 2) - writeDataRange(cluster, 30, 60); - flushAllNodes(cluster); // Creates second SSTable generation with overlapping keys - advanceMutationLogSegment(cluster); - - // Phase 3: Create final SSTable generation with different key distribution (Generation 3) - writeDataRange(cluster, 60, 90); - flushAllNodes(cluster); // Creates third SSTable generation - advanceMutationLogSegment(cluster); - - // Capture victim mutation tracking baseline before replacement - IInvokableInstance nodeToReplace = cluster.get(3); - Set victimTokens = getNodeTokens(nodeToReplace); - - // Stop victim node and write additional data to create streaming complexity - nodeToReplace.shutdown().get(); - - // Write additional data while victim is down - writeDataRange(cluster, 90, 120); - // don't flush - - - // Replace the node - IInvokableInstance replacementNode = replaceHostAndStart(cluster, nodeToReplace); - - // Wait for replacement to complete - awaitRingJoin(cluster.get(1), replacementNode); - awaitRingJoin(cluster.get(2), replacementNode); - awaitRingJoin(replacementNode, cluster.get(1)); - awaitRingJoin(replacementNode, cluster.get(2)); - - // confirm replacement node took over victim's token ranges - Set replacementTokens = getNodeTokens(replacementNode); - assertThat(replacementTokens).as("Replacement node should have same tokens as victim") - .isEqualTo(victimTokens); - - List remainingNodes = List.of(cluster.get(1), cluster.get(2), replacementNode); - - // wait for all nodes to reach full reconciliation and verify data - awaitFullReconciliation(cluster, 120); - verifyLocalDataContents(remainingNodes, 0, 120); - } - } - - private void 
setupTrackedKeyspace(Cluster cluster) - { - cluster.schemaChange(String.format("CREATE KEYSPACE IF NOT EXISTS %s WITH REPLICATION={'class': 'SimpleStrategy', 'replication_factor': 3} AND replication_type='tracked'", KEYSPACE)); - cluster.schemaChange(String.format("CREATE TABLE IF NOT EXISTS %s (k int PRIMARY KEY, v int)", QUALIFIED_TABLE_NAME)); - } - - /** - * Write data using proper coordinator pattern, not direct node access. - * This ensures replication happens correctly. - */ - private void writeDataToCluster(Cluster cluster, int key, int value) - { - writeDataToCluster(cluster.coordinator(1), key, value); - } - - private void writeDataToCluster(ICoordinator coordinator, int key, int value) - { - // Use coordinator to write with QUORUM - this will replicate properly - coordinator.execute(String.format("INSERT INTO %s (k, v) VALUES (?, ?)", QUALIFIED_TABLE_NAME), ConsistencyLevel.QUORUM, key, value); - } - - /** - * Write multiple keys to establish baseline mutation tracking state - */ - private void writeDataRange(Cluster cluster, int startKey, int endKey) - { - for (int i = startKey; i < endKey; i++) - { - writeDataToCluster(cluster, i, i * 10); - } - } - - private void flushAllNodes(Cluster cluster) - { - for (int i = 1; i <= cluster.size(); i++) - { - cluster.get(i).flush(KEYSPACE); - } - } - - private void advanceMutationLogSegment(Cluster cluster) - { - cluster.stream().filter(node -> !node.isShutdown()).forEach( node -> { - node.runOnInstance(() -> { - MutationJournal.instance().advanceSegment(); - }); - }); - } - - private void verifyLocalDataContents(Cluster cluster, int startKey, int endKey) - { - // Inline simple iteration over cluster nodes instead of using getAllNodes() helper - List nodes = new java.util.ArrayList<>(); - for (int i = 1; i <= cluster.size(); i++) - { - nodes.add(cluster.get(i)); - } - verifyLocalDataContents(Lists.newArrayList(cluster), startKey, endKey); - } - - private void verifyLocalDataContents(List nodes, int startKey, int 
endKey) - { - for (int key = startKey; key < endKey; key++) - { - int expectedValue = key * 10; - for (IInvokableInstance node : nodes) - { - Object[][] result = node.executeInternal(String.format("SELECT k, v FROM %s WHERE k = ?", QUALIFIED_TABLE_NAME), key); - assertRows(result, row(key, expectedValue)); - } - } - } - - private Set getNodeTokens(IInvokableInstance node) - { - return node.callOnInstance(() -> { - ClusterMetadata metadata = ClusterMetadata.current(); - return metadata.tokenMap.tokens(metadata.myNodeId()).stream() - .map(Object::toString) - .collect(java.util.stream.Collectors.toSet()); - }); - } -} diff --git a/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java index 94c52d7bcaa1..36aae3443601 100644 --- a/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java +++ b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java @@ -159,8 +159,7 @@ private Set getReadersForRange(Range range) Collection streams = cfs.getStreamManager().createOutgoingStreams(session(NO_PENDING_REPAIR), RangesAtEndpoint.toDummyList(Collections.singleton(range)), NO_PENDING_REPAIR, - PreviewKind.NONE, - null); + PreviewKind.NONE); return sstablesFromStreams(streams); } @@ -168,7 +167,7 @@ private Set selectReaders(TimeUUID pendingRepair) { IPartitioner partitioner = DatabaseDescriptor.getPartitioner(); Collection> ranges = Lists.newArrayList(new Range(partitioner.getMinimumToken(), partitioner.getMinimumToken())); - Collection streams = cfs.getStreamManager().createOutgoingStreams(session(pendingRepair), RangesAtEndpoint.toDummyList(ranges), pendingRepair, PreviewKind.NONE, null); + Collection streams = cfs.getStreamManager().createOutgoingStreams(session(pendingRepair), RangesAtEndpoint.toDummyList(ranges), pendingRepair, PreviewKind.NONE); return sstablesFromStreams(streams); } diff --git 
a/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java b/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java index 9ebda69d788f..78d101e6b770 100644 --- a/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java +++ b/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java @@ -215,7 +215,7 @@ private void testStreamWithConcurrentComponentMutation(Callable runBeforeStre ByteBuf serializedFile = Unpooled.buffer(8192); InetAddressAndPort peer = FBUtilities.getBroadcastAddressAndPort(); StreamSession session = setupStreamingSessionForTest(); - Collection outgoingStreams = store.getStreamManager().createOutgoingStreams(session, rangesAtEndpoint, NO_PENDING_REPAIR, PreviewKind.NONE, null); + Collection outgoingStreams = store.getStreamManager().createOutgoingStreams(session, rangesAtEndpoint, NO_PENDING_REPAIR, PreviewKind.NONE); CassandraOutgoingFile outgoingFile = (CassandraOutgoingFile) Iterables.getOnlyElement(outgoingStreams); Future streaming = executeAsync(() -> { diff --git a/test/unit/org/apache/cassandra/db/virtual/MutationTrackingShardsTableTest.java b/test/unit/org/apache/cassandra/db/virtual/MutationTrackingShardsTableTest.java index 6f34bdba96c9..89190c3aad6e 100644 --- a/test/unit/org/apache/cassandra/db/virtual/MutationTrackingShardsTableTest.java +++ b/test/unit/org/apache/cassandra/db/virtual/MutationTrackingShardsTableTest.java @@ -20,6 +20,7 @@ import java.util.Arrays; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import com.datastax.driver.core.ColumnDefinitions; @@ -40,6 +41,7 @@ import static org.assertj.core.api.Assertions.assertThat; +// TODO (expected): these tests aren't very useful, should be addressed public class MutationTrackingShardsTableTest extends CQLTester { private static final String KS_NAME = "vts"; @@ -117,7 +119,7 @@ 
public void testSelectAll() String rangeEnd = r.getString("range_end"); String logId = r.getString("log_id"); int localNodeId = r.getInt("local_node_id"); - String participants = r.getString("participants"); + Set participants = r.getSet("participants", Integer.class); String witnessedOffsets = r.getString("witnessed_offsets"); String reconciledOffsets = r.getString("reconciled_offsets"); String persistedOffsets = r.getString("persisted_offsets"); @@ -131,7 +133,7 @@ public void testSelectAll() assertThat(localNodeId).isGreaterThanOrEqualTo(0); - assertThat(participants).isNotNull(); // should show replica node IDs + assertThat(participants).isNotEmpty(); assertThat(witnessedOffsets).isNotNull(); assertThat(reconciledOffsets).isNotNull(); diff --git a/test/unit/org/apache/cassandra/replication/ActivationRequestSerializationTest.java b/test/unit/org/apache/cassandra/replication/ActivationRequestSerializationTest.java index a92348a60952..69fde0df6804 100644 --- a/test/unit/org/apache/cassandra/replication/ActivationRequestSerializationTest.java +++ b/test/unit/org/apache/cassandra/replication/ActivationRequestSerializationTest.java @@ -59,7 +59,7 @@ public void testRoundtripPreparePhase() throws IOException String keyspace = "test_ks"; Range range = new Range<>(new ByteOrderedPartitioner.BytesToken("key1".getBytes()), new ByteOrderedPartitioner.BytesToken("key100".getBytes())); - ActivationRequest activation = new ActivationRequest(StreamOperation.IMPORT, pair, phase, transferId, coordinatorId, range, keyspace, planId); + ActivationRequest activation = new ActivationRequest(StreamOperation.IMPORT, pair, phase, transferId, coordinatorId, range, 42L, keyspace, planId); try (DataOutputBuffer output = new DataOutputBuffer()) { @@ -78,7 +78,7 @@ public void testRoundtripCommitPhase() throws IOException String keyspace = "test_ks"; Range range = new Range<>(new ByteOrderedPartitioner.BytesToken("key1".getBytes()), new 
ByteOrderedPartitioner.BytesToken("key100".getBytes())); - ActivationRequest activation = new ActivationRequest(StreamOperation.IMPORT, pair, phase, transferId, coordinatorId, range, keyspace, planId); + ActivationRequest activation = new ActivationRequest(StreamOperation.IMPORT, pair, phase, transferId, coordinatorId, range, 42L, keyspace, planId); try (DataOutputBuffer output = new DataOutputBuffer()) { diff --git a/test/unit/org/apache/cassandra/replication/CoordinatorLogOffsetsTest.java b/test/unit/org/apache/cassandra/replication/CoordinatorLogOffsetsTest.java index fdf57d5b616f..13cfae32013a 100644 --- a/test/unit/org/apache/cassandra/replication/CoordinatorLogOffsetsTest.java +++ b/test/unit/org/apache/cassandra/replication/CoordinatorLogOffsetsTest.java @@ -20,9 +20,7 @@ import java.util.ArrayList; import java.util.Collections; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -44,8 +42,6 @@ import org.apache.cassandra.db.marshal.Int32Type; import org.apache.cassandra.dht.Bounds; import org.apache.cassandra.dht.Murmur3Partitioner; -import org.apache.cassandra.dht.Range; -import org.apache.cassandra.dht.Token; import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper; import org.apache.cassandra.distributed.test.tracking.MutationTrackingUtils; import org.apache.cassandra.io.util.DataInputBuffer; @@ -291,10 +287,10 @@ public void reconciledBounds() throws InterruptedException, ExecutionException { ImmutableCoordinatorLogOffsets logOffsets = new ImmutableCoordinatorLogOffsets.Builder() .add(mutation.id()) .build(); - Range range = getShardRange(mutation); + Shard shard = getShard(mutation); List offsets = Collections.singletonList(logOffsets.mutations().offsets(mutation.id().logId())); - MutationTrackingService.instance().updateReplicatedOffsets(ks, range, offsets, true, addr2); - 
MutationTrackingService.instance().updateReplicatedOffsets(ks, range, offsets, true, addr3); + MutationTrackingService.instance().updateReplicatedOffsets(ks, 0L, shard.range, shard.participants, offsets, true, addr2); + MutationTrackingService.instance().updateReplicatedOffsets(ks, 0L, shard.range, shard.participants, offsets, true, addr3); Assertions.assertThat(MutationTrackingService.instance().isDurablyReconciled(logOffsets)).isTrue(); } @@ -325,10 +321,10 @@ public void reconciledBounds() throws InterruptedException, ExecutionException { .add(mutation.id()) .build(); - Range range = getShardRange(mutation); + Shard shard = getShard(mutation); List offsets = Collections.singletonList(logOffsets.mutations().offsets(mutation.id().logId())); - MutationTrackingService.instance().updateReplicatedOffsets(ks, range, offsets, true, addr2); - MutationTrackingService.instance().updateReplicatedOffsets(ks, range, offsets, true, addr3); + MutationTrackingService.instance().updateReplicatedOffsets(ks, 0L, shard.range, shard.participants, offsets, true, addr2); + MutationTrackingService.instance().updateReplicatedOffsets(ks, 0L, shard.range, shard.participants, offsets, true, addr3); Assertions.assertThat(MutationTrackingService.instance().isDurablyReconciled(logOffsets)).isFalse(); } @@ -360,13 +356,8 @@ public void reconciledBounds() throws InterruptedException, ExecutionException { CommitLog.instance.stopUnsafe(true); } - private Range getShardRange(Mutation mutation) + private Shard getShard(Mutation mutation) { - Map> ksRanges = new HashMap<>(); - MutationTrackingService.instance().forEachKeyspace(shards -> { - Shard shard = shards.lookUp(mutation); - ksRanges.put(shard.keyspace, shard.range); - }); - return ksRanges.get(mutation.getKeyspaceName()); + return MutationTrackingService.instance().getShard(mutation.id().asLogId()); } } diff --git a/test/unit/org/apache/cassandra/replication/CoordinatorLogTest.java 
b/test/unit/org/apache/cassandra/replication/CoordinatorLogTest.java index 374afab28977..5fdec78ac013 100644 --- a/test/unit/org/apache/cassandra/replication/CoordinatorLogTest.java +++ b/test/unit/org/apache/cassandra/replication/CoordinatorLogTest.java @@ -29,7 +29,6 @@ import org.apache.cassandra.db.RowUpdateBuilder; import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.dht.ByteOrderedPartitioner; -import org.apache.cassandra.dht.Murmur3Partitioner; import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.replication.CoordinatorLog.CoordinatorLogPrimary; @@ -103,7 +102,7 @@ public void remoteReconciliationTest() Token tk = tk("key"); TableMetadata metadata = Schema.instance.getTableMetadata(KEYSPACE, TABLE); TableId tableId = metadata.id; - CoordinatorLogPrimary log = new CoordinatorLogPrimary(KEYSPACE, new Range<>(tk, tk), LOCAL_HOST_ID, LOCAL_LOG_ID, PARTICIPANTS); + CoordinatorLogPrimary log = new CoordinatorLogPrimary(KEYSPACE, 0L, new Range<>(tk, tk), LOCAL_HOST_ID, LOCAL_LOG_ID, PARTICIPANTS); MutationId[] ids = new MutationId[] { log.nextId(), log.nextId(), log.nextId(), }; List mutations = new ArrayList<>(ids.length); @@ -180,7 +179,7 @@ private void testPersistAndLoadRoundtrip(CoordinatorLogId logId) MutationJournal.instance().write(mutation2.id(), mutation2); CoordinatorLog log = - CoordinatorLog.recreate(KEYSPACE, range, LOCAL_HOST_ID, logId, PARTICIPANTS, witnessed, witnessed, unreconciled); + CoordinatorLog.recreate(KEYSPACE, 0L, range, LOCAL_HOST_ID, logId, PARTICIPANTS, witnessed, witnessed, unreconciled); Offsets.Mutable reconciled = new Offsets.Mutable(logId); reconciled.add(3, 4); @@ -193,7 +192,7 @@ private void testPersistAndLoadRoundtrip(CoordinatorLogId logId) private static void validatePersistAndLoadRoundtrip(CoordinatorLog log) { log.persistToSystemTable(); - List logs = CoordinatorLog.loadFromSystemTable(KEYSPACE, log.range, LOCAL_HOST_ID); + List logs = 
CoordinatorLog.loadFromSystemTable(KEYSPACE, 0L, log.range, LOCAL_HOST_ID); assertEquals(1, logs.size()); CoordinatorLog loaded = logs.get(0); @@ -214,92 +213,4 @@ private static void validatePersistAndLoadRoundtrip(CoordinatorLog log) assertTrue(log.unreconciledMutations.equalsForTesting(loaded.unreconciledMutations)); } - - @Test - public void withParticipantsSameParticipantsTest() - { - Token tk = Murmur3Partitioner.instance.getMinimumToken(); - CoordinatorLogPrimary log = new CoordinatorLogPrimary(KEYSPACE, new Range<>(tk, tk), LOCAL_HOST_ID, LOCAL_LOG_ID, PARTICIPANTS); - - CoordinatorLog newLog = log.withParticipants(PARTICIPANTS); - - assertSame("Same participants should return same instance", log, newLog); - } - - @Test - public void withParticipantsReplicaTest() - { - Token tk = Murmur3Partitioner.instance.getMinimumToken(); - CoordinatorLogPrimary log = new CoordinatorLogPrimary(KEYSPACE, new Range<>(tk, tk), LOCAL_HOST_ID, LOCAL_LOG_ID, PARTICIPANTS); - - log.witnessedOffsets.get(2).add(300); - - Participants newParticipants = new Participants(List.of(LOCAL_HOST_ID, 5, 6)); - CoordinatorLog newLog = log.withParticipants(newParticipants); - - assertTrue("Should be CoordinatorLogPrimary", newLog instanceof CoordinatorLog.CoordinatorLogPrimary); - assertTrue("Should have empty witness state for new participants", newLog.witnessedOffsets.get(5).isEmpty()); - assertTrue("Should have empty witness state for new participants", newLog.witnessedOffsets.get(6).isEmpty()); - } - - private static Offsets offsetsWithOffsets(CoordinatorLogId logId, int... offsets) - { - Offsets.Mutable result = new Offsets.Mutable(logId); - for (int offset : offsets) - result.add(offset); - return result; - } - - private static Offsets offsetsWithOffsets(int... 
offsets) - { - return offsetsWithOffsets(LOCAL_LOG_ID, offsets); - } - - @Test - public void withParticipantsAddNewParticipantTest() - { - Token tk = Murmur3Partitioner.instance.getMinimumToken(); - CoordinatorLogPrimary log = new CoordinatorLogPrimary(KEYSPACE, new Range<>(tk, tk), LOCAL_HOST_ID, LOCAL_LOG_ID, PARTICIPANTS); - - log.witnessedOffsets.get(LOCAL_HOST_ID).add(1); - log.witnessedOffsets.get(2).add(2); - log.witnessedOffsets.get(3).add(3); - - log.witnessedOffsets.get(LOCAL_HOST_ID).add(5); - log.witnessedOffsets.get(2).add(5); - log.witnessedOffsets.get(3).add(5); - log.reconciledOffsets.add(5); - - Participants expandedParticipants = new Participants(List.of(LOCAL_HOST_ID, 2, 3, 4)); - CoordinatorLog newLog = log.withParticipants(expandedParticipants); - - assertEquals(offsetsWithOffsets(1, 5), newLog.witnessedOffsets.get(LOCAL_HOST_ID)); - assertEquals(offsetsWithOffsets(2, 5), newLog.witnessedOffsets.get(2)); - assertEquals(offsetsWithOffsets(3, 5), newLog.witnessedOffsets.get(3)); - assertEquals(offsetsWithOffsets( 5), newLog.witnessedOffsets.get(4)); - assertEquals(offsetsWithOffsets(5), newLog.reconciledOffsets); - } - - @Test - public void withParticipantsRemoveParticipantTest() - { - Token tk = Murmur3Partitioner.instance.getMinimumToken(); - CoordinatorLogPrimary log = new CoordinatorLogPrimary(KEYSPACE, new Range<>(tk, tk), LOCAL_HOST_ID, LOCAL_LOG_ID, PARTICIPANTS); - - log.witnessedOffsets.get(LOCAL_HOST_ID).addAll(offsetsWithOffsets(10, 40)); - log.witnessedOffsets.get(2).addAll(offsetsWithOffsets(10, 40)); - log.witnessedOffsets.get(3).addAll(offsetsWithOffsets(30, 40)); - log.reconciledOffsets.add(40); - - Participants reducedParticipants = new Participants(List.of(LOCAL_HOST_ID, 2)); - CoordinatorLog newLog = log.withParticipants(reducedParticipants); - - assertEquals(2, newLog.participants.size()); - assertEquals(offsetsWithOffsets(10, 40), newLog.witnessedOffsets.get(LOCAL_HOST_ID)); - assertEquals(offsetsWithOffsets(10, 40), 
newLog.witnessedOffsets.get(2)); - - // offset 10 should be promoted to reconciled since the node without it (3) has been removed - assertEquals(offsetsWithOffsets(10, 40), newLog.reconciledOffsets); - assertTrue(newLog.unreconciledMutations.isEmpty()); - } } diff --git a/test/unit/org/apache/cassandra/replication/ShardIntervalBTreeTest.java b/test/unit/org/apache/cassandra/replication/ShardIntervalBTreeTest.java new file mode 100644 index 000000000000..67c2385c3099 --- /dev/null +++ b/test/unit/org/apache/cassandra/replication/ShardIntervalBTreeTest.java @@ -0,0 +1,598 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.replication; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Consumer; + +import org.jctools.maps.NonBlockingHashMapLong; +import org.junit.Test; + +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; +import org.apache.cassandra.dht.Range; +import org.apache.cassandra.dht.Token; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +public class ShardIntervalBTreeTest +{ + private static final Participants PARTICIPANTS = new Participants(List.of(1)); + private static final Token MIN = Murmur3Partitioner.MINIMUM; + + private static Token tk(long token) + { + return new LongToken(token); + } + + private static Range range(long left, long right) + { + return new Range<>(tk(left), tk(right)); + } + + /** Range whose right bound is the +∞ sentinel (Murmur3 MINIMUM). */ + private static Range rangeToMax(long left) + { + return new Range<>(tk(left), MIN); + } + + /** Range whose left bound is the min-token sentinel (Murmur3 MINIMUM, acting as -∞ on the left); with a finite right bound this is the bottom slice. */ + private static Range rangeFromMin(long right) + { + return new Range<>(MIN, tk(right)); + } + + /** Collect the set of shards returned by a forEach-like method.
*/ + private static Set collect(java.util.function.Consumer> producer) + { + Set hits = new HashSet<>(); + producer.accept(hits::add); + return hits; + } + + private static Set intersecting(ShardIntervalBTree map, Range query) + { + return collect(c -> map.forEachIntersecting(query, c)); + } + + private static Set covering(ShardIntervalBTree map, Token token) + { + return collect(c -> map.forEachCovering(token, c)); + } + + /** + * Build a {@link Shard} skeleton suitable for indexing - enough state for + * {@code ShardIntervalBTree} (range + sinceEpoch + identity) without requiring + * a running mutation journal. + */ + private static Shard shard(Range range, long sinceEpoch) + { + return new Shard(/* localNodeId */ 1, + /* keyspace */ "ks", + /* sinceEpoch */ sinceEpoch, + /* range */ range, + /* participants */ PARTICIPANTS, + /* logs */ new NonBlockingHashMapLong<>(), + /* currentLocal */ null, + /* logIdProvider */ () -> 0L, + /* onNewLog */ (s, l) -> {}); + } + + @Test + public void testWithSingleShardLookup() + { + Shard s = shard(range(0, 100), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(s); + + assertFalse(map.isEmpty()); + + // (left, right] semantics + assertNull(map.latestShardCovering(tk(0))); // exclusive on left + assertSame(s, map.latestShardCovering(tk(1))); + assertSame(s, map.latestShardCovering(tk(50))); + assertSame(s, map.latestShardCovering(tk(100))); // inclusive on right + assertNull(map.latestShardCovering(tk(101))); + } + + @Test + public void testNonOverlappingShards() + { + Shard a = shard(range(0, 100), 1L); + Shard b = shard(range(100, 200), 1L); + Shard c = shard(range(200, 300), 1L); + + ShardIntervalBTree map = new ShardIntervalBTree().with(a).with(b).with(c); + + assertSame(a, map.latestShardCovering(tk(50))); + assertSame(a, map.latestShardCovering(tk(100))); // (0, 100] wins on the boundary + assertSame(b, map.latestShardCovering(tk(150))); + assertSame(b, map.latestShardCovering(tk(200))); + assertSame(c,
map.latestShardCovering(tk(250))); + assertNull(map.latestShardCovering(tk(301))); + } + + @Test + public void testNewestShardWinsOnEpoch() + { + Shard older = shard(range(0, 100), 1L); + Shard newer = shard(range(0, 100), 5L); + + ShardIntervalBTree map = new ShardIntervalBTree().with(older).with(newer); + + assertSame(newer, map.latestShardCovering(tk(50))); + + // both are reachable via forEach + Set hits = new HashSet<>(); + map.forEachCovering(tk(50), hits::add); + assertEquals(Set.of(older, newer), hits); + } + + @Test + public void testForEachAll() + { + Shard a = shard(range(0, 100), 1L); + Shard b = shard(range(100, 200), 1L); + + ShardIntervalBTree map = new ShardIntervalBTree().with(a).with(b); + + List seen = new ArrayList<>(); + map.forEach(seen::add); + assertEquals(Set.of(a, b), new HashSet<>(seen)); + } + + @Test + public void testWithoutShard() + { + Shard a = shard(range(0, 100), 1L); + Shard b = shard(range(100, 200), 1L); + + ShardIntervalBTree map = new ShardIntervalBTree().with(a).with(b); + ShardIntervalBTree less = map.without(a); + + // the original snapshot is untouched (immutability) + assertSame(a, map.latestShardCovering(tk(50))); + + assertNull(less.latestShardCovering(tk(50))); + assertSame(b, less.latestShardCovering(tk(150))); + } + + @Test + public void testWithDuplicateThrows() + { + Shard a = shard(range(0, 100), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(a); + try + { + map.with(a); + fail("expected IllegalStateException for duplicate shard"); + } + catch (IllegalStateException expected) {} + } + + @Test + public void testWithoutMissingThrows() + { + ShardIntervalBTree map = new ShardIntervalBTree(); + Shard a = shard(range(0, 100), 1L); + try + { + map.without(a); + fail("expected IllegalStateException for missing shard"); + } + catch (IllegalStateException expected) {} + } + + @Test + public void testImmutabilityOfWithAndWithout() + { + Shard a = shard(range(0, 100), 1L); + Shard b = shard(range(100, 200), 
1L); + + ShardIntervalBTree empty = new ShardIntervalBTree(); + ShardIntervalBTree one = empty.with(a); + ShardIntervalBTree two = one.with(b); + ShardIntervalBTree back = two.without(b); + + assertTrue(empty.isEmpty()); + assertNotNull(one.latestShardCovering(tk(50))); + assertNotNull(two.latestShardCovering(tk(150))); + assertNull(back.latestShardCovering(tk(150))); + assertSame(a, back.latestShardCovering(tk(50))); + } + + // --------------------------------------------------------------------- + // Same-range / multi-epoch handling + // --------------------------------------------------------------------- + + /** Two shards sharing a range at different epochs coexist and are distinguished by sinceEpoch. */ + @Test + public void testSameRangeDifferentEpochBothStored() + { + Shard olderS = shard(range(0, 100), 1L); + Shard newerS = shard(range(0, 100), 5L); + + ShardIntervalBTree map = new ShardIntervalBTree().with(olderS).with(newerS); + + assertSame(newerS, map.latestShardCovering(tk(50))); + assertEquals(Set.of(olderS, newerS), covering(map, tk(50))); + assertEquals(Set.of(olderS, newerS), intersecting(map, range(10, 90))); + + // Removing one leaves the other intact. 
+ ShardIntervalBTree lessNew = map.without(newerS); + assertSame(olderS, lessNew.latestShardCovering(tk(50))); + assertEquals(Set.of(olderS), covering(lessNew, tk(50))); + + ShardIntervalBTree lessOld = map.without(olderS); + assertSame(newerS, lessOld.latestShardCovering(tk(50))); + assertEquals(Set.of(newerS), covering(lessOld, tk(50))); + } + + // --------------------------------------------------------------------- + // forEachIntersecting: empty tree + // --------------------------------------------------------------------- + + @Test + public void testIntersectingEmptyTree() + { + ShardIntervalBTree map = new ShardIntervalBTree(); + assertEquals(Set.of(), intersecting(map, range(0, 100))); + assertEquals(Set.of(), intersecting(map, rangeToMax(0))); + } + + // --------------------------------------------------------------------- + // forEachIntersecting: basic overlap shapes + // --------------------------------------------------------------------- + + @Test + public void testIntersectingOverlapShapes() + { + Shard a = shard(range( 0, 100), 1L); + Shard b = shard(range(100, 200), 1L); + Shard c = shard(range(200, 300), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(a).with(b).with(c); + + // query entirely within a single shard + assertEquals(Set.of(a), intersecting(map, range(10, 50))); + + // query spanning two shards + assertEquals(Set.of(a, b), intersecting(map, range(50, 150))); + + // query spanning all three shards + assertEquals(Set.of(a, b, c), intersecting(map, range(50, 250))); + + // query equal to one of the shards + assertEquals(Set.of(a), intersecting(map, range(0, 100))); + + // query strictly to the right of all shards + assertEquals(Set.of(), intersecting(map, range(300, 400))); + + // query strictly to the left (note: tk -1 is a perfectly valid finite token) + assertEquals(Set.of(), intersecting(map, range(-100, -10))); + } + + // --------------------------------------------------------------------- + // forEachIntersecting: 
tie cases on (left, right] semantics + // --------------------------------------------------------------------- + + /** + * Regression for bug 1: query.left (exclusive) == shard.right (inclusive) is NOT overlap. + * Before the fix, RangeQueryComparators.startWithEndSeeker used keyStartWithEnd which + * returned 0 on a tie (point-query convention), causing a false positive here. + */ + @Test + public void testIntersectingBoundary_QueryLeftEqualsShardRight() + { + Shard s = shard(range(5, 10), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(s); + + // query (10, 20] starts strictly after shard ends at 10; they do NOT overlap + assertEquals(Set.of(), intersecting(map, range(10, 20))); + + // But a query that still contains 10 on its right-inclusive side DOES overlap + assertEquals(Set.of(s), intersecting(map, range(9, 20))); + } + + /** + * Mirror of the above: query.right (inclusive) == shard.left (exclusive) is NOT overlap. + * This was already correct in the original code; the test guards against regressions. + */ + @Test + public void testIntersectingBoundary_QueryRightEqualsShardLeft() + { + Shard s = shard(range(10, 20), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(s); + + // query (0, 10] ends exactly at shard's exclusive left; they do NOT overlap + assertEquals(Set.of(), intersecting(map, range(0, 10))); + + // Shifting right by one makes them overlap (at token 11). + assertEquals(Set.of(s), intersecting(map, range(0, 11))); + } + + /** Adjacent shards both boundary-cased at once. */ + @Test + public void testIntersectingAdjacentShardBoundary() + { + Shard a = shard(range( 0, 100), 1L); + Shard b = shard(range(100, 200), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(a).with(b); + + // Query (99, 100] sits entirely inside a (includes token 100 which is a's inclusive right). 
+ assertEquals(Set.of(a), intersecting(map, range(99, 100))); + + // Query (100, 101] sits entirely inside b (excludes token 100 via its exclusive left). + assertEquals(Set.of(b), intersecting(map, range(100, 101))); + + // Query (99, 101] straddles the boundary: both shards match. + assertEquals(Set.of(a, b), intersecting(map, range(99, 101))); + } + + // --------------------------------------------------------------------- + // forEachIntersecting: min-token (+∞) sentinel handling + // --------------------------------------------------------------------- + + /** + * Regression for bug 2: a query whose right bound is the min-token sentinel (= +∞) + * must still correctly intersect shards whose ranges lie above the query's left. + * Before the fix, endWithStartSeeker used raw Token.compareTo which treats min as + * -∞, producing a false negative. + */ + @Test + public void testIntersectingQueryRightIsMin() + { + Shard a = shard(range( 0, 100), 1L); + Shard b = shard(range(100, 200), 1L); + Shard c = shard(range(200, 300), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(a).with(b).with(c); + + // Query (10, +∞] must return all three shards. + assertEquals(Set.of(a, b, c), intersecting(map, rangeToMax(10))); + + // Query (150, +∞] starts inside b and continues past everything on the right. + assertEquals(Set.of(b, c), intersecting(map, rangeToMax(150))); + + // Query (300, +∞] starts strictly above all shards. + assertEquals(Set.of(), intersecting(map, rangeToMax(300))); + + // Boundary: query (100, +∞] must exclude a (whose right is 100, inclusive but + // coincident with the query's exclusive left). + assertEquals(Set.of(b, c), intersecting(map, rangeToMax(100))); + } + + /** + * Shards whose right bound is the min-token sentinel (= +∞) must be matched + * by queries that fall anywhere in their range. This exercises the storage side + * of the +∞ convention through compareTokenToEnd / endWithEndSorter. 
+ */ + @Test + public void testIntersectingShardRightIsMin() + { + // shard covers (100, +∞] + Shard s = shard(rangeToMax(100), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(s); + + // finite window strictly above the shard's left + assertEquals(Set.of(s), intersecting(map, range(200, 300))); + // window that starts before the shard and reaches inside it + assertEquals(Set.of(s), intersecting(map, range(50, 150))); + // window entirely below the shard + assertEquals(Set.of(), intersecting(map, range(0, 50))); + // window whose right bound equals shard's exclusive left + assertEquals(Set.of(), intersecting(map, range(0, 100))); + // query reaching to +∞ + assertEquals(Set.of(s), intersecting(map, rangeToMax(200))); + + // point queries: + assertEquals(Set.of(s), covering(map, tk(200))); + assertEquals(Set.of(), covering(map, tk(100))); // exclusive left + assertEquals(Set.of(), covering(map, tk(50))); + } + + /** + * Full-ring shard (min, min] must be accepted by with() (it is not "truly" + * wrap-around) and match every finite query. + */ + @Test + public void testIntersectingFullRingShard() + { + Shard s = shard(new Range<>(MIN, MIN), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(s); + + assertEquals(Set.of(s), intersecting(map, range(0, 100))); + assertEquals(Set.of(s), intersecting(map, range(-100, -1))); + assertEquals(Set.of(s), intersecting(map, rangeToMax(0))); + assertEquals(Set.of(s), intersecting(map, rangeFromMin(100))); + } + + // --------------------------------------------------------------------- + // forEachIntersecting: wrap-around guard + // --------------------------------------------------------------------- + + @Test + public void testIntersectingRejectsTrulyWrapAroundQuery() + { + ShardIntervalBTree map = new ShardIntervalBTree().with(shard(range(0, 100), 1L)); + // (200, 100] truly wraps around (right is finite, smaller than left). 
+ try + { + map.forEachIntersecting(new Range<>(tk(200), tk(100)), s -> fail("should have thrown")); + fail("expected IllegalArgumentException for wrap-around query"); + } + catch (IllegalArgumentException expected) {} + } + + @Test + public void testIntersectingAcceptsQueryWithMinRight() + { + // (50, +∞] is not "truly" wrap-around and must be accepted. + ShardIntervalBTree map = new ShardIntervalBTree().with(shard(range(0, 100), 1L)); + assertEquals(Set.of(map.latestShardCovering(tk(50))), intersecting(map, rangeToMax(50))); + } + + // --------------------------------------------------------------------- + // forEachCovering: boundary pin-downs (regression guards) + // --------------------------------------------------------------------- + + @Test + public void testCoveringBoundaries() + { + Shard s = shard(range(0, 100), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(s); + + assertEquals(Set.of(), covering(map, tk(0))); // exclusive left + assertEquals(Set.of(s), covering(map, tk(1))); + assertEquals(Set.of(s), covering(map, tk(100))); // inclusive right + assertEquals(Set.of(), covering(map, tk(101))); + } + + @Test + public void testCoveringAtMinToken() + { + // For both a finite shard and a +∞-right shard, the min token is never + // covered (conceptually it's a sentinel outside the ring). 
+ Shard finite = shard(range(0, 100), 1L); + Shard toMax = shard(rangeToMax(100), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(finite).with(toMax); + + assertEquals(Set.of(), covering(map, MIN)); + assertNull(map.latestShardCovering(MIN)); + } + + // --------------------------------------------------------------------- + // get(Range, sinceEpoch): exact (range, sinceEpoch) lookup + // --------------------------------------------------------------------- + + @Test + public void testGetEmptyTreeReturnsNull() + { + ShardIntervalBTree map = new ShardIntervalBTree(); + assertNull(map.get(range(0, 100), 1L)); + assertNull(map.get(rangeToMax(0), 1L)); + } + + @Test + public void testGetExactMatch() + { + Shard a = shard(range( 0, 100), 1L); + Shard b = shard(range(100, 200), 2L); + Shard c = shard(range(200, 300), 3L); + ShardIntervalBTree map = new ShardIntervalBTree().with(a).with(b).with(c); + + assertSame(a, map.get(range( 0, 100), 1L)); + assertSame(b, map.get(range(100, 200), 2L)); + assertSame(c, map.get(range(200, 300), 3L)); + } + + @Test + public void testGetRangeMatchesButEpochDiffersReturnsNull() + { + Shard s = shard(range(0, 100), 5L); + ShardIntervalBTree map = new ShardIntervalBTree().with(s); + + // exact range, wrong sinceEpoch + assertNull(map.get(range(0, 100), 1L)); + assertNull(map.get(range(0, 100), 4L)); + assertNull(map.get(range(0, 100), 6L)); + + // sanity: the right epoch still works + assertSame(s, map.get(range(0, 100), 5L)); + } + + @Test + public void testGetEpochMatchesButRangeDiffersReturnsNull() + { + Shard s = shard(range(0, 100), 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(s); + + // overlapping but not equal: subset, superset, shifted, adjacent + assertNull(map.get(range( 10, 90), 1L)); // strict subset + assertNull(map.get(range(-10, 110), 1L)); // strict superset + assertNull(map.get(range( 50, 150), 1L)); // shifted right, partial overlap + assertNull(map.get(range(-50, 50), 1L)); // shifted left, 
partial overlap + assertNull(map.get(range(100, 200), 1L)); // adjacent (no overlap on (left, right] semantics) + assertNull(map.get(range(200, 300), 1L)); // disjoint + + // sanity: exact equality still works + assertSame(s, map.get(range(0, 100), 1L)); + } + + @Test + public void testGetSameRangeDifferentEpochs() + { + Shard older = shard(range(0, 100), 1L); + Shard newer = shard(range(0, 100), 5L); + ShardIntervalBTree map = new ShardIntervalBTree().with(older).with(newer); + + assertSame(older, map.get(range(0, 100), 1L)); + assertSame(newer, map.get(range(0, 100), 5L)); + // No shard exists at this epoch even though the range matches. + assertNull(map.get(range(0, 100), 3L)); + } + + @Test + public void testGetWithRangeReachingMin() + { + // shard covers (100, +∞] + Shard s = shard(rangeToMax(100), 7L); + ShardIntervalBTree map = new ShardIntervalBTree().with(s); + + assertSame(s, map.get(rangeToMax(100), 7L)); + assertNull(map.get(rangeToMax(100), 6L)); // wrong epoch + assertNull(map.get(rangeToMax( 99), 7L)); // different left + assertNull(map.get(range(100, 200), 7L)); // different right (not +∞) + } + + @Test + public void testGetFullRingShard() + { + Range fullRing = new Range<>(MIN, MIN); + Shard s = shard(fullRing, 1L); + ShardIntervalBTree map = new ShardIntervalBTree().with(s); + + assertSame(s, map.get(fullRing, 1L)); + assertNull(map.get(fullRing, 2L)); // wrong epoch + assertNull(map.get(rangeToMax(0), 1L)); // not the same range + assertNull(map.get(rangeFromMin(100), 1L)); // not the same range + } + + @Test + public void testGetAfterWithoutReturnsNull() + { + Shard a = shard(range(0, 100), 1L); + Shard b = shard(range(100, 200), 2L); + ShardIntervalBTree map = new ShardIntervalBTree().with(a).with(b); + + assertSame(a, map.get(range(0, 100), 1L)); + + ShardIntervalBTree less = map.without(a); + assertNull(less.get(range(0, 100), 1L)); + // the original snapshot is untouched (immutability) + assertSame(a, map.get(range(0, 100), 1L)); + // 
unrelated entries are still reachable + assertSame(b, less.get(range(100, 200), 2L)); + } +} diff --git a/test/unit/org/apache/cassandra/replication/ShardTest.java b/test/unit/org/apache/cassandra/replication/ShardTest.java index daddc631767f..6309dbdde520 100644 --- a/test/unit/org/apache/cassandra/replication/ShardTest.java +++ b/test/unit/org/apache/cassandra/replication/ShardTest.java @@ -72,7 +72,7 @@ public void testPersistAndLoadSingleShard() MutableInteger logId = new MutableInteger(); LongSupplier logIdProvider = () -> CoordinatorLogId.asLong(LOCAL_HOST_ID, logId.getAndIncrement()); - Shard original = new Shard(LOCAL_HOST_ID, KEYSPACE, range, participants, logIdProvider, (s, l) -> {}); + Shard original = new Shard(LOCAL_HOST_ID, KEYSPACE, 1L, range, participants, logIdProvider, (s, l) -> {}); original.persistToSystemTables(); ArrayList loadedShards = Shard.loadFromSystemTables(LOCAL_HOST_ID, logIdProvider, (s, l) -> {}); @@ -96,13 +96,13 @@ public void testLogRotation() Participants participants = new Participants(List.of(LOCAL_HOST_ID, REMOTE_HOST_ID_1, REMOTE_HOST_ID_2)); MutableInteger logId = new MutableInteger(); LongSupplier logIdProvider = () -> CoordinatorLogId.asLong(LOCAL_HOST_ID, logId.getAndIncrement()); - Shard shard = new Shard(LOCAL_HOST_ID, KEYSPACE, range, participants, logIdProvider, (s, l) -> { + Shard shard = new Shard(LOCAL_HOST_ID, KEYSPACE, 1L, range, participants, logIdProvider, (s, l) -> { }); - MutationId firstId = shard.nextId(); + MutationId firstId = shard.nextMutationId(); for (int i = 0; i < 100; i++) - assertEquals(firstId.hostLogId, shard.nextId().hostLogId); - assertEquals(firstId.hostLogId + 1, shard.nextId().hostLogId); + assertEquals(firstId.hostLogId, shard.nextMutationId().hostLogId); + assertEquals(firstId.hostLogId + 1, shard.nextMutationId().hostLogId); } finally { diff --git a/test/unit/org/apache/cassandra/streaming/EntireSSTableStreamingCorrectFilesCountTest.java 
b/test/unit/org/apache/cassandra/streaming/EntireSSTableStreamingCorrectFilesCountTest.java index 82c1f13bc6aa..4cfc469da808 100644 --- a/test/unit/org/apache/cassandra/streaming/EntireSSTableStreamingCorrectFilesCountTest.java +++ b/test/unit/org/apache/cassandra/streaming/EntireSSTableStreamingCorrectFilesCountTest.java @@ -123,8 +123,7 @@ public void test() throws Exception Collection outgoingStreams = store.getStreamManager().createOutgoingStreams(session, rangesAtEndpoint, NO_PENDING_REPAIR, - PreviewKind.NONE, - null); + PreviewKind.NONE); session.addTransferStreams(outgoingStreams); AsyncStreamingOutputPlus out = constructDataOutputStream(); diff --git a/test/unit/org/apache/cassandra/streaming/StreamSessionOwnedRangesTest.java b/test/unit/org/apache/cassandra/streaming/StreamSessionOwnedRangesTest.java index b5d22a16459c..41ab4f119732 100644 --- a/test/unit/org/apache/cassandra/streaming/StreamSessionOwnedRangesTest.java +++ b/test/unit/org/apache/cassandra/streaming/StreamSessionOwnedRangesTest.java @@ -150,7 +150,7 @@ private static void tryPrepareExpectingSuccess(Collection request long startMetricCount = StorageMetrics.totalOpsForInvalidToken.getCount(); session.state(StreamSession.State.PREPARING); - session.prepareAsync(requests, Collections.emptySet(), null, null); + session.prepareAsync(requests, Collections.emptySet()); assertEquals(2, sent.size()); assertEquals(PREPARE_SYNACK, sent.get(0).type); @@ -167,7 +167,7 @@ private static void tryPrepareExpectingFailure(Collection request long startMetricCount = StorageMetrics.totalOpsForInvalidToken.getCount(); session.state(StreamSession.State.PREPARING); - java.util.concurrent.Future f = session.prepare(requests, Collections.emptySet(), null, null); + java.util.concurrent.Future f = session.prepare(requests, Collections.emptySet()); Exception ex = f.get(); assertNotNull(ex); if (!(ex instanceof StreamRequestOutOfTokenRangeException))