Skip to content

Commit 94060ac

Browse files
craig[bot]pav-kv
andcommitted
Merge #155539
155539: kvstorage: encapsulate variants of replica destruction r=arulajmani a=pav-kv This PR moves all variants of replica destruction writes to `kvstorage` package. It also decouples the `DestroyReplica` helper into two: one that fully removes the replica, and one that doesn't touch the user keys (`SubsumeReplica`). Finally, `ClearRangeData` and its various options are no longer exposed outside the package, and become its implementation detail. `ClearRangeData` will likely be further broken down in a follow-up since it is not flexible enough for supporting separated raft storage. Related to #152845 Related to #152845 Co-authored-by: Pavel Kalinnikov <pavel@cockroachlabs.com>
2 parents 8abd590 + 19c1452 commit 94060ac

File tree

8 files changed

+132
-120
lines changed

8 files changed

+132
-120
lines changed

pkg/kv/kvserver/kvstorage/destroy.go

Lines changed: 115 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package kvstorage
77

88
import (
99
"context"
10+
"math"
1011

1112
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
1213
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
@@ -29,55 +30,63 @@ const (
2930
// perhaps we should fix Pebble to handle large numbers of range tombstones in
3031
// an sstable better.
3132
ClearRangeThresholdPointKeys = 64
33+
34+
// MergedTombstoneReplicaID is the replica ID written into the RangeTombstone
35+
// for replicas of a range which is known to have been merged. This value
36+
// should prevent any messages from stale replicas of that range from ever
37+
// resurrecting merged replicas. Whenever merging or subsuming a replica we
38+
// know new replicas can never be created so this value is used even if we
39+
// don't know the current replica ID.
40+
MergedTombstoneReplicaID roachpb.ReplicaID = math.MaxInt32
3241
)
3342

34-
// ClearRangeDataOptions specify which parts of a Replica are to be destroyed.
35-
type ClearRangeDataOptions struct {
36-
// ClearReplicatedByRangeID indicates that replicated RangeID-based keys
43+
// clearRangeDataOptions specify which parts of a Replica are to be destroyed.
44+
type clearRangeDataOptions struct {
45+
// clearReplicatedByRangeID indicates that replicated RangeID-based keys
3746
// (abort span, etc) should be removed.
38-
ClearReplicatedByRangeID bool
39-
// ClearUnreplicatedByRangeID indicates that unreplicated RangeID-based keys
47+
clearReplicatedByRangeID bool
48+
// clearUnreplicatedByRangeID indicates that unreplicated RangeID-based keys
4049
// (logstore state incl. HardState, etc) should be removed.
41-
ClearUnreplicatedByRangeID bool
42-
// ClearReplicatedBySpan causes the state machine data (i.e. the replicated state
50+
clearUnreplicatedByRangeID bool
51+
// clearReplicatedBySpan causes the state machine data (i.e. the replicated state
4352
// for the given RSpan) that is key-addressable (i.e. range descriptor, user keys,
4453
// locks) to be removed. No data is removed if this is the zero span.
45-
ClearReplicatedBySpan roachpb.RSpan
54+
clearReplicatedBySpan roachpb.RSpan
4655

47-
// If MustUseClearRange is true, a Pebble range tombstone will always be used
56+
// If mustUseClearRange is true, a Pebble range tombstone will always be used
4857
// to clear the key spans (unless empty). This is typically used when we need
4958
// to write additional keys to an SST after this clear, e.g. a replica
5059
// tombstone, since keys must be written in order. When this is false, a
5160
// heuristic will be used instead.
52-
MustUseClearRange bool
61+
mustUseClearRange bool
5362
}
5463

55-
// ClearRangeData clears the data associated with a range descriptor selected
64+
// clearRangeData clears the data associated with a range descriptor selected
5665
// by the provided options.
5766
//
5867
// TODO(tbg): could rename this to XReplica. The use of "Range" in both the
5968
// "CRDB Range" and "storage.ClearRange" context in the setting of this method could
6069
// be confusing.
61-
func ClearRangeData(
70+
func clearRangeData(
6271
ctx context.Context,
6372
rangeID roachpb.RangeID,
6473
reader storage.Reader,
6574
writer storage.Writer,
66-
opts ClearRangeDataOptions,
75+
opts clearRangeDataOptions,
6776
) error {
6877
keySpans := rditer.Select(rangeID, rditer.SelectOpts{
6978
Ranged: rditer.SelectRangedOptions{
70-
RSpan: opts.ClearReplicatedBySpan,
79+
RSpan: opts.clearReplicatedBySpan,
7180
SystemKeys: true,
7281
LockTable: true,
7382
UserKeys: true,
7483
},
75-
ReplicatedByRangeID: opts.ClearReplicatedByRangeID,
76-
UnreplicatedByRangeID: opts.ClearUnreplicatedByRangeID,
84+
ReplicatedByRangeID: opts.clearReplicatedByRangeID,
85+
UnreplicatedByRangeID: opts.clearUnreplicatedByRangeID,
7786
})
7887

7988
pointKeyThreshold := ClearRangeThresholdPointKeys
80-
if opts.MustUseClearRange {
89+
if opts.mustUseClearRange {
8190
pointKeyThreshold = 1
8291
}
8392

@@ -120,21 +129,30 @@ type DestroyReplicaInfo struct {
120129
Keys roachpb.RSpan
121130
}
122131

123-
// DestroyReplica destroys all or a part of the Replica's state, installing a
124-
// RangeTombstone in its place. Due to merges, splits, etc, there is a need to
125-
// control which part of the state this method actually gets to remove, which is
126-
// done via the provided options[^1]; the caller is always responsible for
127-
// managing the remaining disk state accordingly.
128-
//
129-
// [^1] e.g., on a merge, the user data moves to the subsuming replica and must
130-
// not be cleared.
132+
// DestroyReplica destroys the entirety of the replica's state in storage, and
133+
// installs a RangeTombstone in its place. It handles both uninitialized and
134+
// initialized replicas uniformly.
131135
func DestroyReplica(
132136
ctx context.Context,
133137
reader storage.Reader,
134138
writer storage.Writer,
135139
info DestroyReplicaInfo,
136140
next roachpb.ReplicaID,
137-
opts ClearRangeDataOptions,
141+
) error {
142+
return destroyReplicaImpl(ctx, reader, writer, info, next, clearRangeDataOptions{
143+
clearReplicatedByRangeID: true,
144+
clearUnreplicatedByRangeID: true,
145+
clearReplicatedBySpan: info.Keys,
146+
})
147+
}
148+
149+
func destroyReplicaImpl(
150+
ctx context.Context,
151+
reader storage.Reader,
152+
writer storage.Writer,
153+
info DestroyReplicaInfo,
154+
next roachpb.ReplicaID,
155+
opts clearRangeDataOptions,
138156
) error {
139157
if next <= info.ReplicaID {
140158
return errors.AssertionFailedf("%v must not survive its own tombstone", info.FullReplicaID)
@@ -156,7 +174,7 @@ func DestroyReplica(
156174
}
157175

158176
_ = DestroyReplicaTODO // 2.1 + 2.2 + 3.1
159-
if err := ClearRangeData(ctx, info.RangeID, reader, writer, opts); err != nil {
177+
if err := clearRangeData(ctx, info.RangeID, reader, writer, opts); err != nil {
160178
return err
161179
}
162180
// Save a tombstone to ensure that replica IDs never get reused.
@@ -165,3 +183,73 @@ func DestroyReplica(
165183
NextReplicaID: next, // NB: NextReplicaID > 0
166184
})
167185
}
186+
187+
// SubsumeReplica is like DestroyReplica, but it does not delete the user keys
188+
// (and the corresponding system and lock table keys). The latter are inherited
189+
// by the subsuming range.
190+
//
191+
// The forceSortedKeys flag, if true, forces the writes to be generated in the
192+
// sorted order of keys. This is to support feeding the writes from this
193+
// function to SSTables, in the snapshot ingestion path.
194+
//
195+
// Returns SelectOpts which can be used to reflect on the key spans that this
196+
// function clears.
197+
// TODO(pav-kv): get rid of SelectOpts.
198+
func SubsumeReplica(
199+
ctx context.Context,
200+
reader storage.Reader,
201+
writer storage.Writer,
202+
info DestroyReplicaInfo,
203+
forceSortedKeys bool,
204+
) (rditer.SelectOpts, error) {
205+
// NB: if required, set MustUseClearRange to true. This call can be used for
206+
// generating SSTables when ingesting a snapshot, which requires Clears and
207+
// Puts to be written in key order. DestroyReplica sets RangeTombstoneKey
208+
// after clearing the unreplicated span which may contain higher keys.
209+
opts := clearRangeDataOptions{
210+
clearReplicatedByRangeID: true,
211+
clearUnreplicatedByRangeID: true,
212+
mustUseClearRange: forceSortedKeys,
213+
}
214+
return rditer.SelectOpts{
215+
ReplicatedByRangeID: opts.clearReplicatedByRangeID,
216+
UnreplicatedByRangeID: opts.clearUnreplicatedByRangeID,
217+
}, destroyReplicaImpl(ctx, reader, writer, info, MergedTombstoneReplicaID, opts)
218+
}
219+
220+
// RemoveStaleRHSFromSplit removes all data for the RHS replica of a split. This
221+
// is used in a situation when the RHS replica is already known to have been
222+
// removed from our store, so any pending writes that were supposed to
223+
// initialize the RHS replica should be dropped from the write batch.
224+
//
225+
// TODO(#152199): do not remove the unreplicated state which can belong to a
226+
// newer (uninitialized) replica.
227+
func RemoveStaleRHSFromSplit(
228+
ctx context.Context,
229+
reader storage.Reader,
230+
writer storage.Writer,
231+
rangeID roachpb.RangeID,
232+
keys roachpb.RSpan,
233+
) error {
234+
return clearRangeData(ctx, rangeID, reader, writer, clearRangeDataOptions{
235+
// Since the RHS replica is uninitalized, we know there isn't anything in
236+
// the two replicated spans below, before the current batch. Setting these
237+
// options will in effect only clear the writes to the RHS replicated state
238+
// staged in the batch.
239+
clearReplicatedBySpan: keys,
240+
clearReplicatedByRangeID: true,
241+
// TODO(tbg): we don't actually want to touch the raft state of the RHS
242+
// replica since it's absent or a more recent one than in the split. Now
243+
// that we have a bool targeting unreplicated RangeID-local keys, we can set
244+
// it to false and remove the HardState+ReplicaID write-back in the caller.
245+
// However, there can be historical split proposals with the
246+
// RaftTruncatedState key set in splitTriggerHelper[^1]. We must first make
247+
// sure that such proposals no longer exist, e.g. with a below-raft
248+
// migration.
249+
//
250+
// [^1]: https://github.com/cockroachdb/cockroach/blob/f263a765d750e41f2701da0a923a6e92d09159fa/pkg/kv/kvserver/batcheval/cmd_end_transaction.go#L1109-L1149
251+
//
252+
// See also: https://github.com/cockroachdb/cockroach/issues/94933
253+
clearUnreplicatedByRangeID: true,
254+
})
255+
}

pkg/kv/kvserver/kvstorage/destroy_test.go

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -69,13 +69,7 @@ func TestDestroyReplica(t *testing.T) {
6969
})
7070
mutate("destroy", func(rw storage.ReadWriter) {
7171
require.NoError(t, DestroyReplica(
72-
ctx, rw, rw,
73-
DestroyReplicaInfo{FullReplicaID: r.id, Keys: r.keys}, r.id.ReplicaID+1,
74-
ClearRangeDataOptions{
75-
ClearUnreplicatedByRangeID: true,
76-
ClearReplicatedByRangeID: true,
77-
ClearReplicatedBySpan: r.keys,
78-
},
72+
ctx, rw, rw, DestroyReplicaInfo{FullReplicaID: r.id, Keys: r.keys}, r.id.ReplicaID+1,
7973
))
8074
})
8175

pkg/kv/kvserver/replica_app_batch.go

Lines changed: 4 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -359,21 +359,10 @@ func (b *replicaAppBatch) runPostAddTriggersReplicaOnly(
359359
rhsRepl.mu.Unlock()
360360
rhsRepl.readOnlyCmdMu.Unlock()
361361

362-
// Use math.MaxInt32 (mergedTombstoneReplicaID) as the nextReplicaID as an
363-
// extra safeguard against creating new replicas of the RHS. This isn't
364-
// required for correctness, since the merge protocol should guarantee that
365-
// no new replicas of the RHS can ever be created, but it doesn't hurt to
366-
// be careful.
367-
if err := kvstorage.DestroyReplica(
368-
ctx, b.batch, b.batch,
369-
rhsRepl.destroyInfoRaftMuLocked(),
370-
mergedTombstoneReplicaID,
371-
kvstorage.ClearRangeDataOptions{
372-
ClearReplicatedByRangeID: true,
373-
ClearUnreplicatedByRangeID: true,
374-
},
362+
if _, err := kvstorage.SubsumeReplica(
363+
ctx, b.batch, b.batch, rhsRepl.destroyInfoRaftMuLocked(), false, /* forceSortedKeys */
375364
); err != nil {
376-
return errors.Wrapf(err, "unable to destroy replica before merge")
365+
return errors.Wrapf(err, "unable to subsume replica before merge")
377366
}
378367

379368
// Shut down rangefeed processors on either side of the merge.
@@ -451,7 +440,6 @@ func (b *replicaAppBatch) runPostAddTriggersReplicaOnly(
451440
b.r.shMu.destroyStatus.Set(
452441
kvpb.NewRangeNotFoundError(b.r.RangeID, b.r.store.StoreID()),
453442
destroyReasonRemoved)
454-
span := b.r.descRLocked().RSpan()
455443
b.r.mu.Unlock()
456444
b.r.readOnlyCmdMu.Unlock()
457445
b.changeRemovesReplica = true
@@ -461,14 +449,7 @@ func (b *replicaAppBatch) runPostAddTriggersReplicaOnly(
461449
// above, and DestroyReplica will also add a range tombstone to the
462450
// batch, so that when we commit it, the removal is finalized.
463451
if err := kvstorage.DestroyReplica(
464-
ctx, b.batch, b.batch,
465-
b.r.destroyInfoRaftMuLocked(),
466-
change.NextReplicaID(),
467-
kvstorage.ClearRangeDataOptions{
468-
ClearReplicatedBySpan: span,
469-
ClearReplicatedByRangeID: true,
470-
ClearUnreplicatedByRangeID: true,
471-
},
452+
ctx, b.batch, b.batch, b.r.destroyInfoRaftMuLocked(), change.NextReplicaID(),
472453
); err != nil {
473454
return errors.Wrapf(err, "unable to destroy replica before removal")
474455
}

pkg/kv/kvserver/replica_destroy.go

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ package kvserver
88
import (
99
"context"
1010
"fmt"
11-
"math"
1211

1312
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
1413
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/apply"
@@ -56,14 +55,6 @@ func (s destroyStatus) Removed() bool {
5655
return s.reason == destroyReasonRemoved
5756
}
5857

59-
// mergedTombstoneReplicaID is the replica ID written into the tombstone
60-
// for replicas which are part of a range which is known to have been merged.
61-
// This value should prevent any messages from stale replicas of that range from
62-
// ever resurrecting merged replicas. Whenever merging or subsuming a replica we
63-
// know new replicas can never be created so this value is used even if we
64-
// don't know the current replica ID.
65-
const mergedTombstoneReplicaID roachpb.ReplicaID = math.MaxInt32
66-
6758
// postDestroyRaftMuLocked is called after the replica destruction is durably
6859
// written to Pebble.
6960
func (r *Replica) postDestroyRaftMuLocked(ctx context.Context) error {
@@ -97,23 +88,10 @@ func (r *Replica) destroyRaftMuLocked(ctx context.Context, nextReplicaID roachpb
9788
ms := r.GetMVCCStats()
9889
batch := r.store.TODOEngine().NewWriteBatch()
9990
defer batch.Close()
100-
desc := r.Desc()
101-
inited := desc.IsInitialized()
102-
103-
opts := kvstorage.ClearRangeDataOptions{
104-
ClearReplicatedBySpan: desc.RSpan(), // zero if !inited
105-
// TODO(tbg): if it's uninitialized, we might as well clear
106-
// the replicated state because there isn't any. This seems
107-
// like it would be simpler, but needs a code audit to ensure
108-
// callers don't call this in in-between states where the above
109-
// assumption doesn't hold.
110-
ClearReplicatedByRangeID: inited,
111-
ClearUnreplicatedByRangeID: true,
112-
}
91+
11392
// TODO(sep-raft-log): need both engines separately here.
11493
if err := kvstorage.DestroyReplica(
115-
ctx, r.store.TODOEngine(), batch,
116-
r.destroyInfoRaftMuLocked(), nextReplicaID, opts,
94+
ctx, r.store.TODOEngine(), batch, r.destroyInfoRaftMuLocked(), nextReplicaID,
11795
); err != nil {
11896
return err
11997
}

pkg/kv/kvserver/replica_gc_queue.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/cockroachdb/cockroach/pkg/kv"
1313
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
1414
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
15+
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage"
1516
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
1617
"github.com/cockroachdb/cockroach/pkg/raft/raftpb"
1718
"github.com/cockroachdb/cockroach/pkg/roachpb"
@@ -348,11 +349,11 @@ func (rgcq *replicaGCQueue) process(
348349
}
349350
}
350351

351-
// A tombstone is written with a value of mergedTombstoneReplicaID because
352+
// A tombstone is written with a value of MergedTombstoneReplicaID because
352353
// we know the range to have been merged. See the Merge case of
353354
// runPreApplyTriggers() for details.
354355
if err := repl.store.RemoveReplica(
355-
ctx, repl, mergedTombstoneReplicaID, "dangling subsume via replica GC queue",
356+
ctx, repl, kvstorage.MergedTombstoneReplicaID, "dangling subsume via replica GC queue",
356357
); err != nil {
357358
return false, err
358359
}

pkg/kv/kvserver/replica_raftstorage.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,7 @@ func (r *Replica) clearSubsumedReplicaInMemoryData(
818818
// allowed in (perhaps not involving any of the RangeIDs known to the merge
819819
// but still touching its keyspace) and causing corruption.
820820
ph, err := r.store.removeInitializedReplicaRaftMuLocked(
821-
ctx, sr, mergedTombstoneReplicaID, "subsumed by snapshot",
821+
ctx, sr, kvstorage.MergedTombstoneReplicaID, "subsumed by snapshot",
822822
RemoveOptions{
823823
// The data was already destroyed by clearSubsumedReplicaDiskData.
824824
DestroyData: false,

pkg/kv/kvserver/snapshot_apply_prepare.go

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -150,20 +150,9 @@ func (s *snapWriteBuilder) clearSubsumedReplicaDiskData(ctx context.Context) err
150150
for _, sub := range s.subsume {
151151
// We have to create an SST for the subsumed replica's range-id local keys.
152152
if err := s.writeSST(ctx, func(ctx context.Context, w storage.Writer) error {
153-
// NOTE: We set mustClearRange to true because we are setting
154-
// RangeTombstoneKey. Since Clears and Puts need to be done in increasing
155-
// order of keys, it is not safe to use ClearRangeIter.
156-
opts := kvstorage.ClearRangeDataOptions{
157-
ClearReplicatedByRangeID: true,
158-
ClearUnreplicatedByRangeID: true,
159-
MustUseClearRange: true,
160-
}
161-
s.cleared = append(s.cleared, rditer.Select(sub.RangeID, rditer.SelectOpts{
162-
ReplicatedByRangeID: opts.ClearReplicatedByRangeID,
163-
UnreplicatedByRangeID: opts.ClearUnreplicatedByRangeID,
164-
})...)
165-
// NB: Actually clear RangeID local key spans.
166-
return kvstorage.DestroyReplica(ctx, reader, w, sub, mergedTombstoneReplicaID, opts)
153+
opts, err := kvstorage.SubsumeReplica(ctx, reader, w, sub, true /* forceSortedKeys */)
154+
s.cleared = append(s.cleared, rditer.Select(sub.RangeID, opts)...)
155+
return err
167156
}); err != nil {
168157
return err
169158
}

0 commit comments

Comments
 (0)