@@ -7,6 +7,7 @@ package kvstorage
 
 import (
 	"context"
+	"math"
 
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
@@ -29,55 +30,63 @@ const (
 	// perhaps we should fix Pebble to handle large numbers of range tombstones in
 	// an sstable better.
 	ClearRangeThresholdPointKeys = 64
+
+	// MergedTombstoneReplicaID is the replica ID written into the RangeTombstone
+	// for replicas of a range which is known to have been merged. This value
+	// should prevent any messages from stale replicas of that range from ever
+	// resurrecting merged replicas. Whenever merging or subsuming a replica we
+	// know new replicas can never be created so this value is used even if we
+	// don't know the current replica ID.
+	MergedTombstoneReplicaID roachpb.ReplicaID = math.MaxInt32
 )
 
-// ClearRangeDataOptions specify which parts of a Replica are to be destroyed.
-type ClearRangeDataOptions struct {
-	// ClearReplicatedByRangeID indicates that replicated RangeID-based keys
+// clearRangeDataOptions specify which parts of a Replica are to be destroyed.
+type clearRangeDataOptions struct {
+	// clearReplicatedByRangeID indicates that replicated RangeID-based keys
 	// (abort span, etc) should be removed.
-	ClearReplicatedByRangeID bool
-	// ClearUnreplicatedByRangeID indicates that unreplicated RangeID-based keys
+	clearReplicatedByRangeID bool
+	// clearUnreplicatedByRangeID indicates that unreplicated RangeID-based keys
 	// (logstore state incl. HardState, etc) should be removed.
-	ClearUnreplicatedByRangeID bool
-	// ClearReplicatedBySpan causes the state machine data (i.e. the replicated state
+	clearUnreplicatedByRangeID bool
+	// clearReplicatedBySpan causes the state machine data (i.e. the replicated state
 	// for the given RSpan) that is key-addressable (i.e. range descriptor, user keys,
 	// locks) to be removed. No data is removed if this is the zero span.
-	ClearReplicatedBySpan roachpb.RSpan
+	clearReplicatedBySpan roachpb.RSpan
 
-	// If MustUseClearRange is true, a Pebble range tombstone will always be used
+	// If mustUseClearRange is true, a Pebble range tombstone will always be used
 	// to clear the key spans (unless empty). This is typically used when we need
 	// to write additional keys to an SST after this clear, e.g. a replica
 	// tombstone, since keys must be written in order. When this is false, a
 	// heuristic will be used instead.
-	MustUseClearRange bool
+	mustUseClearRange bool
 }
 
-// ClearRangeData clears the data associated with a range descriptor selected
+// clearRangeData clears the data associated with a range descriptor selected
 // by the provided options.
 //
 // TODO(tbg): could rename this to XReplica. The use of "Range" in both the
 // "CRDB Range" and "storage.ClearRange" context in the setting of this method could
 // be confusing.
-func ClearRangeData(
+func clearRangeData(
 	ctx context.Context,
 	rangeID roachpb.RangeID,
 	reader storage.Reader,
 	writer storage.Writer,
-	opts ClearRangeDataOptions,
+	opts clearRangeDataOptions,
 ) error {
 	keySpans := rditer.Select(rangeID, rditer.SelectOpts{
 		Ranged: rditer.SelectRangedOptions{
-			RSpan:      opts.ClearReplicatedBySpan,
+			RSpan:      opts.clearReplicatedBySpan,
 			SystemKeys: true,
 			LockTable:  true,
 			UserKeys:   true,
 		},
-		ReplicatedByRangeID:   opts.ClearReplicatedByRangeID,
-		UnreplicatedByRangeID: opts.ClearUnreplicatedByRangeID,
+		ReplicatedByRangeID:   opts.clearReplicatedByRangeID,
+		UnreplicatedByRangeID: opts.clearUnreplicatedByRangeID,
 	})
 
 	pointKeyThreshold := ClearRangeThresholdPointKeys
-	if opts.MustUseClearRange {
+	if opts.mustUseClearRange {
 		pointKeyThreshold = 1
 	}
 
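Aside on the pointKeyThreshold logic above: with mustUseClearRange set, the threshold drops to 1, so even a single point key is cleared via a range tombstone, which keeps the clears compatible with SST key ordering. Below is a minimal, self-contained sketch of that heuristic; the clearSpan helper and string-based keys are illustrative inventions, not the storage API.

package main

import "fmt"

// clearSpan illustrates the heuristic: at or below the threshold, issue one
// point delete per key; above it, lay down a single range tombstone.
func clearSpan(keys []string, threshold int) []string {
	if len(keys) > threshold {
		return []string{fmt.Sprintf("ClearRange(%s, %s)", keys[0], keys[len(keys)-1])}
	}
	writes := make([]string, 0, len(keys))
	for _, k := range keys {
		writes = append(writes, "ClearPoint("+k+")")
	}
	return writes
}

func main() {
	keys := []string{"a", "b"}
	fmt.Println(clearSpan(keys, 64)) // [ClearPoint(a) ClearPoint(b)]
	fmt.Println(clearSpan(keys, 1))  // [ClearRange(a, b)] -- the mustUseClearRange case
}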
@@ -120,21 +129,30 @@ type DestroyReplicaInfo struct {
 	Keys roachpb.RSpan
 }
 
-// DestroyReplica destroys all or a part of the Replica's state, installing a
-// RangeTombstone in its place. Due to merges, splits, etc, there is a need to
-// control which part of the state this method actually gets to remove, which is
-// done via the provided options[^1]; the caller is always responsible for
-// managing the remaining disk state accordingly.
-//
-// [^1] e.g., on a merge, the user data moves to the subsuming replica and must
-// not be cleared.
+// DestroyReplica destroys the entirety of the replica's state in storage, and
+// installs a RangeTombstone in its place. It handles both uninitialized and
+// initialized replicas uniformly.
 func DestroyReplica(
 	ctx context.Context,
 	reader storage.Reader,
 	writer storage.Writer,
 	info DestroyReplicaInfo,
 	next roachpb.ReplicaID,
-	opts ClearRangeDataOptions,
+) error {
+	return destroyReplicaImpl(ctx, reader, writer, info, next, clearRangeDataOptions{
+		clearReplicatedByRangeID:   true,
+		clearUnreplicatedByRangeID: true,
+		clearReplicatedBySpan:      info.Keys,
+	})
+}
+
+func destroyReplicaImpl(
+	ctx context.Context,
+	reader storage.Reader,
+	writer storage.Writer,
+	info DestroyReplicaInfo,
+	next roachpb.ReplicaID,
+	opts clearRangeDataOptions,
 ) error {
 	if next <= info.ReplicaID {
 		return errors.AssertionFailedf("%v must not survive its own tombstone", info.FullReplicaID)
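For orientation, a hedged sketch of how a caller might drive the new DestroyReplica signature. The test-file placement, the destroyExample name, and passing the batch as both reader and writer are assumptions for illustration, not part of this diff.

package kvstorage_test // illustrative placement only

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage"
)

// destroyExample stages the destruction in a write batch so that the clears
// and the tombstone commit atomically.
func destroyExample(
	ctx context.Context,
	eng storage.Engine,
	info kvstorage.DestroyReplicaInfo,
	next roachpb.ReplicaID,
) error {
	batch := eng.NewBatch()
	defer batch.Close()
	if err := kvstorage.DestroyReplica(ctx, batch, batch, info, next); err != nil {
		return err
	}
	return batch.Commit(true /* sync */)
}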
@@ -156,7 +174,7 @@ func DestroyReplica(
 	}
 
 	_ = DestroyReplicaTODO // 2.1 + 2.2 + 3.1
-	if err := ClearRangeData(ctx, info.RangeID, reader, writer, opts); err != nil {
+	if err := clearRangeData(ctx, info.RangeID, reader, writer, opts); err != nil {
 		return err
 	}
 	// Save a tombstone to ensure that replica IDs never get reused.
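The tombstone saved here (the write itself continues in the next hunk) is what makes replica ID reuse impossible: any later message from a replica ID below NextReplicaID is dropped. A toy version of that check; shouldDropMsg is a made-up name, the real gate lives in the store's raft message handling.

package kvstorage // toy illustration only

import (
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
)

// shouldDropMsg: a message from a stale replica must be ignored, or the
// destroyed replica could be resurrected. With NextReplicaID set to
// MergedTombstoneReplicaID (MaxInt32), every possible sender is stale.
func shouldDropMsg(from roachpb.ReplicaID, tombstone kvserverpb.RangeTombstone) bool {
	return from < tombstone.NextReplicaID
}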
@@ -165,3 +183,73 @@ func DestroyReplica(
 		NextReplicaID: next, // NB: NextReplicaID > 0
 	})
 }
+
+// SubsumeReplica is like DestroyReplica, but it does not delete the user keys
+// (and the corresponding system and lock table keys). The latter are inherited
+// by the subsuming range.
+//
+// The forceSortedKeys flag, if true, forces the writes to be generated in
+// sorted key order. This supports feeding the writes from this function into
+// SSTables, in the snapshot ingestion path.
+//
+// Returns the SelectOpts which can be used to reflect on the key spans that
+// this function clears.
+// TODO(pav-kv): get rid of SelectOpts.
+func SubsumeReplica(
+	ctx context.Context,
+	reader storage.Reader,
+	writer storage.Writer,
+	info DestroyReplicaInfo,
+	forceSortedKeys bool,
+) (rditer.SelectOpts, error) {
+	// NB: if required, set mustUseClearRange to true. This call can be used for
+	// generating SSTables when ingesting a snapshot, which requires Clears and
+	// Puts to be written in key order. destroyReplicaImpl sets the
+	// RangeTombstoneKey after clearing the unreplicated span which may contain
+	// higher keys.
+	opts := clearRangeDataOptions{
+		clearReplicatedByRangeID:   true,
+		clearUnreplicatedByRangeID: true,
+		mustUseClearRange:          forceSortedKeys,
+	}
+	return rditer.SelectOpts{
+		ReplicatedByRangeID:   opts.clearReplicatedByRangeID,
+		UnreplicatedByRangeID: opts.clearUnreplicatedByRangeID,
+	}, destroyReplicaImpl(ctx, reader, writer, info, MergedTombstoneReplicaID, opts)
+}
+
+// RemoveStaleRHSFromSplit removes all data for the RHS replica of a split. This
+// is used in a situation when the RHS replica is already known to have been
+// removed from our store, so any pending writes that were supposed to
+// initialize the RHS replica should be dropped from the write batch.
+//
+// TODO(#152199): do not remove the unreplicated state which can belong to a
+// newer (uninitialized) replica.
+func RemoveStaleRHSFromSplit(
+	ctx context.Context,
+	reader storage.Reader,
+	writer storage.Writer,
+	rangeID roachpb.RangeID,
+	keys roachpb.RSpan,
+) error {
+	return clearRangeData(ctx, rangeID, reader, writer, clearRangeDataOptions{
+		// Since the RHS replica is uninitialized, we know there isn't anything in
+		// the two replicated spans below, before the current batch. Setting these
+		// options will in effect only clear the writes to the RHS replicated state
+		// staged in the batch.
+		clearReplicatedBySpan:    keys,
+		clearReplicatedByRangeID: true,
+		// TODO(tbg): we don't actually want to touch the raft state of the RHS
+		// replica since it's absent or a more recent one than in the split. Now
+		// that we have a bool targeting unreplicated RangeID-local keys, we can set
+		// it to false and remove the HardState+ReplicaID write-back in the caller.
+		// However, there can be historical split proposals with the
+		// RaftTruncatedState key set in splitTriggerHelper[^1]. We must first make
+		// sure that such proposals no longer exist, e.g. with a below-raft
+		// migration.
+		//
+		// [^1]: https://github.com/cockroachdb/cockroach/blob/f263a765d750e41f2701da0a923a6e92d09159fa/pkg/kv/kvserver/batcheval/cmd_end_transaction.go#L1109-L1149
+		//
+		// See also: https://github.com/cockroachdb/cockroach/issues/94933
+		clearUnreplicatedByRangeID: true,
+	})
+}
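To see how the new entry points divide the work, here is a hedged sketch of a merge-application call site; the applyMergeExample name and the batch handling are invented for illustration and do not appear in this PR.

package kvstorage_test // illustrative placement only

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvstorage"
	"github.com/cockroachdb/cockroach/pkg/storage"
)

// applyMergeExample destroys the RHS's RangeID-local state while leaving its
// user keys in place for the subsuming LHS. forceSortedKeys is false because a
// regular write batch, unlike an SST writer, does not require sorted writes.
func applyMergeExample(
	ctx context.Context, batch storage.Batch, rhs kvstorage.DestroyReplicaInfo,
) error {
	_, err := kvstorage.SubsumeReplica(ctx, batch, batch, rhs, false /* forceSortedKeys */)
	return err
}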