
Commit d1c685a

kvserver: fix race in split application
This commit fixes a replica storage race that may occur when applying a split
in the rare case that there is a concurrent creation of a higher-ReplicaID
RHS. The race stems from the fact that a higher-ReplicaID uninitialized
replica is not locked for the entirety of splitPreApply, so it can be created
and make some limited progress concurrently. We remove the clearing/rewriting
of the unreplicated state which belongs to that RHS, to let it progress
untouched.

This also removes a blocker towards raft / state machine storage separation:
we don't want to be touching the raft state of the RHS in the same batch as
the state machine updates.

Epic: none

Release note (bug fix): fixed a race in range splits that could result in a
regressed raft state of a post-split range. This race is very rare / nearly
impossible to hit, and we haven't seen it in the wild.
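For orientation, the fix hinges on the split between the replicated and the
unreplicated RangeID-local key spaces: the split trigger writes the former,
while the latter is owned by whatever RHS replica currently lives on the
store. Below is a minimal illustrative Go sketch (not part of this commit; it
assumes only the standard pkg/keys helpers, and the RangeID is hypothetical)
that prints the two prefixes:

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
)

func main() {
	// Hypothetical RangeID of the post-split RHS.
	const rhs = roachpb.RangeID(42)
	// Replicated RangeID-local keys: written by the split trigger, and still
	// cleared by splitPreApply when the RHS replica is stale.
	fmt.Println("replicated prefix:  ", keys.MakeRangeIDReplicatedPrefix(rhs))
	// Unreplicated RangeID-local keys (e.g. HardState, RaftReplicaID): owned by
	// the current RHS replica on this store, and left intact after this commit.
	fmt.Println("unreplicated prefix:", keys.MakeRangeIDUnreplicatedPrefix(rhs))
	fmt.Println("HardState key:      ", keys.RaftHardStateKey(rhs))
}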
1 parent 792292c

3 files changed: +66 -62 lines

pkg/kv/kvserver/kvstorage/destroy.go

Lines changed: 4 additions & 16 deletions
@@ -199,9 +199,6 @@ func SubsumeReplica(
 // is used in a situation when the RHS replica is already known to have been
 // removed from our store, so any pending writes that were supposed to
 // initialize the RHS replica should be dropped from the write batch.
-//
-// TODO(#152199): do not remove the unreplicated state which can belong to a
-// newer (uninitialized) replica.
 func RemoveStaleRHSFromSplit(
 	ctx context.Context,
 	reader storage.Reader,
@@ -216,19 +213,10 @@ func RemoveStaleRHSFromSplit(
 		// staged in the batch.
 		ReplicatedByRangeID: true,
 		Ranged:              rditer.SelectAllRanged(keys),
-		// TODO(tbg): we don't actually want to touch the raft state of the RHS
-		// replica since it's absent or a more recent one than in the split. Now
-		// that we have a bool targeting unreplicated RangeID-local keys, we can set
-		// it to false and remove the HardState+ReplicaID write-back in the caller.
-		// However, there can be historical split proposals with the
-		// RaftTruncatedState key set in splitTriggerHelper[^1]. We must first make
-		// sure that such proposals no longer exist, e.g. with a below-raft
-		// migration.
-		//
-		// [^1]: https://github.com/cockroachdb/cockroach/blob/f263a765d750e41f2701da0a923a6e92d09159fa/pkg/kv/kvserver/batcheval/cmd_end_transaction.go#L1109-L1149
-		//
-		// See also: https://github.com/cockroachdb/cockroach/issues/94933
-		UnreplicatedByRangeID: true,
+		// Leave the unreplicated keys intact. The unreplicated space either belongs
+		// to a newer (uninitialized) replica, or is empty and only contains a
+		// RangeTombstone with a higher ReplicaID than the RHS in the split trigger.
+		UnreplicatedByRangeID: false,
 	}) {
 		if err := storage.ClearRangeWithHeuristic(
 			ctx, reader, writer, span.Key, span.EndKey, ClearRangeThresholdPointKeys(),
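Read on its own, the selection above now amounts to the following caller-side
pattern (a sketch reconstructed from just this hunk; rangeID and keys stand in
for the surrounding function's parameters):

// Select only the replicated RangeID-local spans and the ranged spans.
// The unreplicated RangeID-local span is skipped entirely, so the raft
// state of a newer RHS replica is never touched.
for _, span := range rditer.Select(rangeID, rditer.SelectOpts{
	ReplicatedByRangeID:   true,
	UnreplicatedByRangeID: false,
	Ranged:                rditer.SelectAllRanged(keys),
}) {
	_ = span // each selected span is cleared via ClearRangeWithHeuristic above
}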

pkg/kv/kvserver/logstore/stateloader.go

Lines changed: 5 additions & 0 deletions
@@ -135,6 +135,11 @@ func (sl StateLoader) SetRaftTruncatedState(
 	)
 }
 
+// ClearRaftTruncatedState clears the RaftTruncatedState.
+func (sl StateLoader) ClearRaftTruncatedState(writer storage.Writer) error {
+	return writer.ClearUnversioned(sl.RaftTruncatedStateKey(), storage.ClearOptions{})
+}
+
 // LoadHardState loads the HardState.
 func (sl StateLoader) LoadHardState(
 	ctx context.Context, reader storage.Reader,
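The new helper mirrors the existing Set/Load accessors on StateLoader. Its
intended call pattern appears in store_split.go below: load the value, and
clear it only when it is non-empty. As a standalone sketch of that pattern
(names as in the diff, with rsl and readWriter being the caller's state loader
and write batch; error handling reduced to plain returns):

// Drop a RaftTruncatedState that a historical split proposal may have
// staged in the write batch; a no-op in the common (empty) case.
if ts, err := rsl.LoadRaftTruncatedState(ctx, readWriter); err != nil {
	return err
} else if ts != (kvserverpb.RaftTruncatedState{}) {
	if err := rsl.ClearRaftTruncatedState(readWriter); err != nil {
		return err
	}
}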

pkg/kv/kvserver/store_split.go

Lines changed: 57 additions & 46 deletions
@@ -12,7 +12,6 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/load"
-	"github.com/cockroachdb/cockroach/pkg/raft/raftpb"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/storage"
 	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
@@ -51,82 +50,94 @@ func splitPreApply(
 	// raftMu is locked.
 	//
 	// In the less common case, the ReplicaID is already removed from this Store,
-	// and rightRepl is either nil or an uninitialized replica with a higher
-	// ReplicaID. Its raftMu is not locked.
+	// and rightRepl is either nil (though the replica may be created concurrently
+	// after we got this nil), or an uninitialized replica with a higher
+	// ReplicaID. Its raftMu is not locked, so this replica might well be in
+	// the process of destruction or being replaced with another higher-ReplicaID
+	// uninitialized replica.
+	//
+	// In any case, the RHS, if it exists (as one or multiple replicas throughout
+	// this splitPreApply call), is uninitialized. This is due to the Store
+	// invariant that initialized replicas don't overlap, so the RHS replica can
+	// only be uninitialized while we still own the RHS part of the keyspace.
+	//
+	// Uninitialized replicas don't have replicated state, and only have non-empty
+	// RaftReplicaID and RaftHardState keys in storage (at the time of writing),
+	// or, more generally, only unreplicated keys. As a rule of thumb, all
+	// unreplicated keys belong to the *current* ReplicaID in the store, rather
+	// than the ReplicaID in the split trigger (which can be stale).
 	rightRepl := r.store.GetReplicaIfExists(split.RightDesc.RangeID)
+
+	rsl := kvstorage.MakeStateLoader(split.RightDesc.RangeID)
+	// After PR #149620, the split trigger batch may only contain replicated state
+	// machine keys, and never contains unreplicated / raft keys. One exception:
+	// there can still be historical split proposals that write the initial
+	// RaftTruncatedState of the RHS. Remove this key (if it exists), and set it
+	// below only if necessary.
+	//
+	// Note that if the RHS range is already present or being created concurrently
+	// on this Store, it doesn't have a RaftTruncatedState (which only initialized
+	// replicas can have), so this deletion will not conflict with or corrupt it.
+	//
+	// TODO(#152847): remove this workaround when there are no historical
+	// proposals with RaftTruncatedState, e.g. after a below-raft migration.
+	if ts, err := rsl.LoadRaftTruncatedState(ctx, readWriter); err != nil {
+		log.KvExec.Fatalf(ctx, "cannot load RaftTruncatedState: %v", err)
+	} else if ts == (kvserverpb.RaftTruncatedState{}) {
+		// Common case. Do nothing.
+	} else if err := rsl.ClearRaftTruncatedState(readWriter); err != nil {
+		log.KvExec.Fatalf(ctx, "cannot clear RaftTruncatedState: %v", err)
+	}
+
 	// Check to see if we know that the RHS has already been removed from this
 	// store at the replica ID implied by the split.
 	if rightRepl == nil || rightRepl.isNewerThanSplit(&split) {
-		// We're in the rare case where we know that the RHS has been removed
-		// and re-added with a higher replica ID (and then maybe removed again).
+		// We're in the rare case where we know that the RHS has been removed or
+		// re-added with a higher replica ID (one or more times).
 		//
 		// If rightRepl is not nil, we are *not* holding raftMu.
 		//
 		// To apply the split, we need to "throw away" the data that would belong to
 		// the RHS, i.e. we clear the user data the RHS would have inherited from
-		// the LHS due to the split and additionally clear all of the range ID local
-		// state that the split trigger writes into the RHS. At the time of writing,
-		// unfortunately that means that we'll also delete any data that might
-		// already be present in the RHS: the HardState and RaftReplicaID. It is
-		// important to preserve the HardState because we might however have already
-		// voted at a higher term. In general this shouldn't happen because we add
-		// learners and then promote them only after they apply a snapshot but we're
-		// going to be extra careful in case future versions of cockroach somehow
-		// promote replicas without ensuring that a snapshot has been received. So
-		// we write it back (and the RaftReplicaID too, since it's an invariant that
-		// it's always present).
-		var hs raftpb.HardState
+		// the LHS due to the split.
+		//
+		// Leave the RangeID-local state intact, since it either belongs to a newer
+		// replica or does not exist. At the time of writing, it can be a non-empty
+		// HardState and RaftReplicaID. It is important to preserve the HardState
+		// because the replica might have already voted at a higher term. In general
+		// this shouldn't happen because we add learners and then promote them only
+		// after they apply a snapshot, but we're going to be extra careful in case
+		// future versions of cockroach somehow promote replicas without ensuring
+		// that a snapshot has been received.
+		//
+		// NB: the rightRepl == nil condition is flaky, in the sense that the RHS
+		// replica can be created concurrently here, one or more times. But we only
+		// use it for a best-effort assertion, so this is not critical.
 		if rightRepl != nil {
-			// TODO(pav-kv): rightRepl could have been destroyed by the time we get to
-			// lock it here. The HardState read-then-write appears risky in this case.
-			rightRepl.raftMu.Lock()
-			defer rightRepl.raftMu.Unlock()
 			// Assert that the rightRepl is not initialized. We're about to clear out
 			// the data of the RHS of the split; we cannot have already accepted a
 			// snapshot to initialize this newer RHS.
 			if rightRepl.IsInitialized() {
 				log.KvExec.Fatalf(ctx, "unexpectedly found initialized newer RHS of split: %v", rightRepl.Desc())
 			}
-			var err error
-			hs, err = rightRepl.raftMu.stateLoader.LoadHardState(ctx, readWriter)
-			if err != nil {
-				log.KvExec.Fatalf(ctx, "failed to load hard state for removed rhs: %v", err)
-			}
 		}
-		// TODO(#152199): the rightRepl == nil condition is flaky. There can be a
-		// racing replica creation for a higher ReplicaID, and it can subsequently
-		// update its HardState. Here, we can accidentally clear the HardState of
-		// that new replica.
 		if err := kvstorage.RemoveStaleRHSFromSplit(
 			ctx, readWriter, readWriter, split.RightDesc.RangeID, split.RightDesc.RSpan(),
 		); err != nil {
 			log.KvExec.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err)
 		}
-		if rightRepl != nil {
-			// Cleared the HardState and RaftReplicaID, so rewrite them to the current
-			// values. NB: rightRepl.raftMu is still locked since HardState was read,
-			// so it can't have been rewritten in the meantime (fixed in #75918).
-			if err := rightRepl.raftMu.stateLoader.SetHardState(ctx, readWriter, hs); err != nil {
-				log.KvExec.Fatalf(ctx, "failed to set hard state with 0 commit index for removed rhs: %v", err)
-			}
-			if err := rightRepl.raftMu.stateLoader.SetRaftReplicaID(
-				ctx, readWriter, rightRepl.ReplicaID()); err != nil {
-				log.KvExec.Fatalf(ctx, "failed to set RaftReplicaID for removed rhs: %v", err)
-			}
-		}
 		return
 	}
 
 	// The RHS replica exists and is uninitialized. We are initializing it here.
 	// This is the common case.
 	//
-	// Update the raft HardState with the new Commit index (taken from the
-	// applied state in the write batch), and use existing[*] or default Term
-	// and Vote. Also write the initial RaftTruncatedState.
+	// Update the raft HardState with the new Commit index (taken from the applied
+	// state in the write batch), and use existing[*] or default Term and Vote.
+	// Also write the initial RaftTruncatedState.
 	//
 	// [*] Note that uninitialized replicas may cast votes, and if they have, we
 	// can't load the default Term and Vote values.
-	rsl := kvstorage.MakeStateLoader(split.RightDesc.RangeID)
 	if err := rsl.SynthesizeRaftState(ctx, readWriter, kvstorage.TODORaft(readWriter)); err != nil {
 		log.KvExec.Fatalf(ctx, "%v", err)
 	}
