From 2b5dc7e60ca05536d8d141e1c6a82a69dd4c9bff Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Thu, 9 Oct 2025 11:44:02 +0100 Subject: [PATCH 1/3] kvserver: fix race in split application This commit fixes the replica storage race that may occur when applying a split in the rare case there is a concurrent creation of a higher-ReplicaID RHS. The race stems from the fact that a higher-ReplicaID uninitialized replica is not locked for the entirety of splitPreApply, so it can be created and make some limited progress concurrently. We remove the clearing/rewriting of the unreplicated state which belongs to that RHS, to let it progress untouched. This also removes a blocker towards raft/state-machine storage separation: we don't want to be touching raft state of the RHS in the same batch with the state machine updates. Epic: none Release note (bug fix): fixed a race in range splits that can result in a regressed raft state of a post-split range. The race conditions are very rare / nearly impossible, and we haven't seen it in the wild. --- pkg/kv/kvserver/kvstorage/destroy.go | 20 +---- pkg/kv/kvserver/logstore/stateloader.go | 5 ++ pkg/kv/kvserver/store_split.go | 97 +++++++++++++------------ 3 files changed, 60 insertions(+), 62 deletions(-) diff --git a/pkg/kv/kvserver/kvstorage/destroy.go b/pkg/kv/kvserver/kvstorage/destroy.go index 71038dfc990f..5bf90719cbb7 100644 --- a/pkg/kv/kvserver/kvstorage/destroy.go +++ b/pkg/kv/kvserver/kvstorage/destroy.go @@ -220,9 +220,6 @@ func SubsumeReplica( // is used in a situation when the RHS replica is already known to have been // removed from our store, so any pending writes that were supposed to // initialize the RHS replica should be dropped from the write batch. -// -// TODO(#152199): do not remove the unreplicated state which can belong to a -// newer (uninitialized) replica. func RemoveStaleRHSFromSplit( ctx context.Context, reader storage.Reader, @@ -237,18 +234,9 @@ func RemoveStaleRHSFromSplit( // staged in the batch. clearReplicatedBySpan: keys, clearReplicatedByRangeID: true, - // TODO(tbg): we don't actually want to touch the raft state of the RHS - // replica since it's absent or a more recent one than in the split. Now - // that we have a bool targeting unreplicated RangeID-local keys, we can set - // it to false and remove the HardState+ReplicaID write-back in the caller. - // However, there can be historical split proposals with the - // RaftTruncatedState key set in splitTriggerHelper[^1]. We must first make - // sure that such proposals no longer exist, e.g. with a below-raft - // migration. - // - // [^1]: https://github.com/cockroachdb/cockroach/blob/f263a765d750e41f2701da0a923a6e92d09159fa/pkg/kv/kvserver/batcheval/cmd_end_transaction.go#L1109-L1149 - // - // See also: https://github.com/cockroachdb/cockroach/issues/94933 - clearUnreplicatedByRangeID: true, + // Leave the unreplicated keys intact. The unreplicated space belongs to a + // newer (uninitialized) replica, or is empty and only contains a + // RangeTombstone with a higher ReplicaID than the RHS in the split trigger. + clearUnreplicatedByRangeID: false, }) } diff --git a/pkg/kv/kvserver/logstore/stateloader.go b/pkg/kv/kvserver/logstore/stateloader.go index 9c4000b24208..8c460862476f 100644 --- a/pkg/kv/kvserver/logstore/stateloader.go +++ b/pkg/kv/kvserver/logstore/stateloader.go @@ -135,6 +135,11 @@ func (sl StateLoader) SetRaftTruncatedState( ) } +// ClearRaftTruncatedState clears the RaftTruncatedState. 
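+//
+// A minimal usage sketch (illustrative only; NewStateLoader, rangeID and the
+// batch are assumed from the surrounding code, and error handling is
+// simplified; the real call site added in store_split.go below reaches this
+// method through kvstorage.MakeStateLoader):
+//
+//	sl := NewStateLoader(rangeID)
+//	if err := sl.ClearRaftTruncatedState(batch); err != nil {
+//		return err // the unreplicated RaftTruncatedState key was not cleared
+//	}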
+func (sl StateLoader) ClearRaftTruncatedState(writer storage.Writer) error { + return writer.ClearUnversioned(sl.RaftTruncatedStateKey(), storage.ClearOptions{}) +} + // LoadHardState loads the HardState. func (sl StateLoader) LoadHardState( ctx context.Context, reader storage.Reader, diff --git a/pkg/kv/kvserver/store_split.go b/pkg/kv/kvserver/store_split.go index 2feebaaa4c6b..225c33ce3205 100644 --- a/pkg/kv/kvserver/store_split.go +++ b/pkg/kv/kvserver/store_split.go @@ -12,7 +12,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/load" - "github.com/cockroachdb/cockroach/pkg/raft/raftpb" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" @@ -51,82 +50,88 @@ func splitPreApply( // raftMu is locked. // // In the less common case, the ReplicaID is already removed from this Store, - // and rightRepl is either nil or an uninitialized replica with a higher - // ReplicaID. Its raftMu is not locked. + // and rightRepl is either nil (though the replica may be created concurrently + // after we got this nil), or an uninitialized replica with a higher + // ReplicaID. Its raftMu is not locked, so this replica might as well be in + // the process of destruction or being replaced with another higher-ReplicaID + // uninitialized replica. + // + // NB: in any case, the RHS, if exists (one or multiple replicas, throughout + // this splitPreApply call), is uninitialized. Such replicas don't have + // replicated state, and only have non-empty RaftReplicaID and RaftHardState + // keys in storage. + // TODO(pav-kv): possibly some other non-essential unreplicated keys too. But + // importantly, we don't touch any of those here and let them be. As a rule of + // thumb, all unreplicated keys belong to the *current* ReplicaID in the + // store, rather than the ReplicaID in the split trigger (which can be stale). rightRepl := r.store.GetReplicaIfExists(split.RightDesc.RangeID) + + rsl := kvstorage.MakeStateLoader(split.RightDesc.RangeID) + // After PR #149620, the split trigger batch may only contain replicated state + // machine keys, and never contains unreplicated / raft keys. One exception: + // there can still be historical split proposals that write the initial + // RaftTruncatedState of the RHS. Remove this key (if exists), and set it + // below only if necessary. + // + // Note that if the RHS range is already present or being created concurrently + // on this Store, it doesn't have a RaftTruncatedState, so this write will not + // conflict with or corrupt it. + // + // TODO(#152847): remove this line when there are no historical proposals with + // RaftTruncatedState, e.g. after a below-raft migration. + if err := rsl.ClearRaftTruncatedState(readWriter); err != nil { + log.KvExec.Fatalf(ctx, "cannot clear RaftTruncatedState: %v", err) + } + // Check to see if we know that the RHS has already been removed from this // store at the replica ID implied by the split. if rightRepl == nil || rightRepl.isNewerThanSplit(&split) { - // We're in the rare case where we know that the RHS has been removed - // and re-added with a higher replica ID (and then maybe removed again). + // We're in the rare case where we know that the RHS has been removed or + // re-added with a higher replica ID (one or more times). // // If rightRepl is not nil, we are *not* holding raftMu. 
// // To apply the split, we need to "throw away" the data that would belong to // the RHS, i.e. we clear the user data the RHS would have inherited from - // the LHS due to the split and additionally clear all of the range ID local - // state that the split trigger writes into the RHS. At the time of writing, - // unfortunately that means that we'll also delete any data that might - // already be present in the RHS: the HardState and RaftReplicaID. It is - // important to preserve the HardState because we might however have already - // voted at a higher term. In general this shouldn't happen because we add - // learners and then promote them only after they apply a snapshot but we're - // going to be extra careful in case future versions of cockroach somehow - // promote replicas without ensuring that a snapshot has been received. So - // we write it back (and the RaftReplicaID too, since it's an invariant that - // it's always present). - var hs raftpb.HardState + // the LHS due to the split. + // + // Leave the RangeID-local state intact, since it belongs to a newer replica + // or does not exist. At the time of writing, it can be a non-empty + // HardState and RaftReplicaID. It is important to preserve the HardState + // because the replica might have already voted at a higher term. In general + // this shouldn't happen because we add learners and then promote them only + // after they apply a snapshot, but we're going to be extra careful in case + // future versions of cockroach somehow promote replicas without ensuring + // that a snapshot has been received. + // + // NB: the rightRepl == nil condition is flaky, in a sense that the RHS + // replica can be created concurrently here, one or more times. But we only + // use it for a best effort assertion, so this is not critical. if rightRepl != nil { - // TODO(pav-kv): rightRepl could have been destroyed by the time we get to - // lock it here. The HardState read-then-write appears risky in this case. - rightRepl.raftMu.Lock() - defer rightRepl.raftMu.Unlock() // Assert that the rightRepl is not initialized. We're about to clear out // the data of the RHS of the split; we cannot have already accepted a // snapshot to initialize this newer RHS. if rightRepl.IsInitialized() { log.KvExec.Fatalf(ctx, "unexpectedly found initialized newer RHS of split: %v", rightRepl.Desc()) } - var err error - hs, err = rightRepl.raftMu.stateLoader.LoadHardState(ctx, readWriter) - if err != nil { - log.KvExec.Fatalf(ctx, "failed to load hard state for removed rhs: %v", err) - } } - // TODO(#152199): the rightRepl == nil condition is flaky. There can be a - // racing replica creation for a higher ReplicaID, and it can subsequently - // update its HardState. Here, we can accidentally clear the HardState of - // that new replica. if err := kvstorage.RemoveStaleRHSFromSplit( ctx, readWriter, readWriter, split.RightDesc.RangeID, split.RightDesc.RSpan(), ); err != nil { log.KvExec.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err) } - if rightRepl != nil { - // Cleared the HardState and RaftReplicaID, so rewrite them to the current - // values. NB: rightRepl.raftMu is still locked since HardState was read, - // so it can't have been rewritten in the meantime (fixed in #75918). 
- if err := rightRepl.raftMu.stateLoader.SetHardState(ctx, readWriter, hs); err != nil { - log.KvExec.Fatalf(ctx, "failed to set hard state with 0 commit index for removed rhs: %v", err) - } - if err := rightRepl.raftMu.stateLoader.SetRaftReplicaID( - ctx, readWriter, rightRepl.ReplicaID()); err != nil { - log.KvExec.Fatalf(ctx, "failed to set RaftReplicaID for removed rhs: %v", err) - } - } return } // The RHS replica exists and is uninitialized. We are initializing it here. // This is the common case. // - // Update the raft HardState with the new Commit index (taken from the - // applied state in the write batch), and use existing[*] or default Term - // and Vote. Also write the initial RaftTruncatedState. + // Update the raft HardState with the new Commit index (taken from the applied + // state in the write batch), and use existing[*] or default Term and Vote. + // Also write the initial RaftTruncatedState. // // [*] Note that uninitialized replicas may cast votes, and if they have, we // can't load the default Term and Vote values. - rsl := kvstorage.MakeStateLoader(split.RightDesc.RangeID) if err := rsl.SynthesizeRaftState(ctx, readWriter, kvstorage.TODORaft(readWriter)); err != nil { log.KvExec.Fatalf(ctx, "%v", err) } From 6d5e451fd96218a08381454def6223b8f42cf322 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Mon, 13 Oct 2025 13:03:52 +0100 Subject: [PATCH 2/3] kvserver: refresh isNewerThanSplit comment Epic: none Release note: none --- pkg/kv/kvserver/replica.go | 40 +++++++------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go index 7311cd397b7a..0701176bcba5 100644 --- a/pkg/kv/kvserver/replica.go +++ b/pkg/kv/kvserver/replica.go @@ -2461,44 +2461,18 @@ func shouldWaitForPendingMerge( return &kvpb.MergeInProgressError{} } -// isNewerThanSplit is a helper used in split(Pre|Post)Apply to -// determine whether the Replica on the right hand side of the split must -// have been removed from this store after the split. +// isNewerThanSplit is a helper used in split(Pre|Post)Apply to determine +// whether the RHS replica of the split must have been removed from this store +// after the split, and the given Replica has a higher ID. // -// TODO(tbg): the below is true as of 22.2: we persist any Replica's ReplicaID -// under RaftReplicaIDKey, so the below caveats should be addressed now. -// -// TODO(ajwerner): There is one false negative where false will be returned but -// the hard state may be due to a newer replica which is outlined below. It -// should be safe. -// Ideally if this store had ever learned that the replica created by the split -// were removed it would not forget that fact. There exists one edge case where -// the store may learn that it should house a replica of the same range with a -// higher replica ID and then forget. If the first raft message this store ever -// receives for the this range contains a replica ID higher than the replica ID -// in the split trigger then an in-memory replica at that higher replica ID will -// be created and no tombstone at a lower replica ID will be written. If the -// server then crashes it will forget that it had ever been the higher replica -// ID. The server may then proceed to process the split and initialize a replica -// at the replica ID implied by the split. 
This is potentially problematic as -// the replica may have voted as this higher replica ID and when it rediscovers -// the higher replica ID it will delete all of the state corresponding to the -// older replica ID including its hard state which may have been synthesized -// with votes as the newer replica ID. This case tends to be handled safely in -// practice because the replica should only be receiving messages as the newer -// replica ID after it has been added to the range as a learner. -// -// Despite the safety due to the change replicas protocol explained above it'd -// be good to know for sure that a replica ID for a range on a store is always -// monotonically increasing, even across restarts. +// NB: from v22.2, we persist any Replica's ID under RaftReplicaIDKey. A +// complementary mechanism, RangeTombstone, ensures that replica deletions are +// persistent as well. As a result, the ReplicaID existence and non-existence is +// monotonic and survives restarts. // // See TestProcessSplitAfterRightHandSideHasBeenRemoved. func (r *Replica) isNewerThanSplit(split *roachpb.SplitTrigger) bool { rightDesc, _ := split.RightDesc.GetReplicaDescriptor(r.StoreID()) - // If the first raft message we received for the RHS range was for a replica - // ID which is above the replica ID of the split then we would not have - // written a tombstone but we will have a replica ID that will exceed the - // split replica ID. return r.replicaID > rightDesc.ReplicaID } From 148dcbb5ce0b45202ced0ff1e4e8c97b86d62e28 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Mon, 13 Oct 2025 13:17:50 +0100 Subject: [PATCH 3/3] kvserver: refresh split comment in post-add triggers Epic: none Release note: none --- pkg/kv/kvserver/replica_app_batch.go | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pkg/kv/kvserver/replica_app_batch.go b/pkg/kv/kvserver/replica_app_batch.go index 9f3cd46407bf..164e88668f83 100644 --- a/pkg/kv/kvserver/replica_app_batch.go +++ b/pkg/kv/kvserver/replica_app_batch.go @@ -309,15 +309,18 @@ func (b *replicaAppBatch) runPostAddTriggersReplicaOnly( } if res.Split != nil { - // Splits require a new HardState to be written to the new RHS - // range (and this needs to be atomic with the main batch). This - // cannot be constructed at evaluation time because it differs - // on each replica (votes may have already been cast on the - // uninitialized replica). Write this new hardstate to the batch too. + // Splits require a new HardState to be written for the new RHS replica, + // atomically with the main batch. This cannot be constructed at evaluation + // time because it differs on each replica (votes may have already been cast + // on the uninitialized replica). Write this new HardState to the batch too. // See https://github.com/cockroachdb/cockroach/issues/20629. // - // Alternatively if we discover that the RHS has already been removed - // from this store, clean up its data. + // Alternatively if we discover that the RHS has already been removed from + // this store, clean up its data. + // + // NB: another reason why we shouldn't write HardState at evaluation time is + // that it belongs to the log engine, whereas the evaluated batch must + // contain only state machine updates. 
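+		//
+		// Roughly, the per-replica synthesis at apply time looks like this
+		// (an illustrative sketch; the helper names below are hypothetical,
+		// not real functions in this package):
+		//
+		//	hs := loadOrEmptyHardState(rhsRangeID) // Term/Vote may be non-zero if the
+		//	                                       // uninitialized RHS already voted
+		//	hs.Commit = rhsAppliedIndexFromBatch   // taken from the applied state staged above
+		//	stageHardState(batch, rhsRangeID, hs)
+		//
+		// Term and Vote differ from store to store, which is why this cannot be
+		// computed once at evaluation time on the proposer.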
splitPreApply(ctx, b.r, b.batch, res.Split.SplitTrigger, cmd.Cmd.ClosedTimestamp) // The rangefeed processor will no longer be provided logical ops for @@ -330,9 +333,7 @@ func (b *replicaAppBatch) runPostAddTriggersReplicaOnly( if res.Split.SplitTrigger.ManualSplit { reason = kvpb.RangeFeedRetryError_REASON_MANUAL_RANGE_SPLIT } - b.r.disconnectRangefeedWithReason( - reason, - ) + b.r.disconnectRangefeedWithReason(reason) } if merge := res.Merge; merge != nil {
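
For review convenience, here is a condensed sketch of the splitPreApply flow as it
stands after the first patch. It only restates the diff above; the function
signature and surrounding plumbing (closed timestamp handling, the uninitialized-RHS
assertion, rangefeed interaction) are simplified, so treat it as an illustration
rather than the authoritative code.

	func splitPreApply(ctx context.Context, r *Replica, readWriter storage.ReadWriter, split roachpb.SplitTrigger) {
		rightRepl := r.store.GetReplicaIfExists(split.RightDesc.RangeID)
		rsl := kvstorage.MakeStateLoader(split.RightDesc.RangeID)
		// Defensively drop a RaftTruncatedState that a historical split proposal
		// may have staged for the RHS; it is rewritten below only if needed.
		if err := rsl.ClearRaftTruncatedState(readWriter); err != nil {
			log.KvExec.Fatalf(ctx, "cannot clear RaftTruncatedState: %v", err)
		}
		if rightRepl == nil || rightRepl.isNewerThanSplit(&split) {
			// Stale RHS: clear only the replicated state it would have inherited
			// from the LHS, and leave all unreplicated RangeID-local keys (e.g.
			// HardState, RaftReplicaID) to the current, newer RHS replica.
			if err := kvstorage.RemoveStaleRHSFromSplit(
				ctx, readWriter, readWriter, split.RightDesc.RangeID, split.RightDesc.RSpan(),
			); err != nil {
				log.KvExec.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err)
			}
			return
		}
		// Common case: the RHS replica exists and is uninitialized. Synthesize its
		// raft state: HardState with the new Commit index, existing or default
		// Term/Vote, and the initial RaftTruncatedState.
		if err := rsl.SynthesizeRaftState(ctx, readWriter, kvstorage.TODORaft(readWriter)); err != nil {
			log.KvExec.Fatalf(ctx, "%v", err)
		}
	}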