Skip to content

Commit 934a86d

Browse files
committed
kvstorage: make replica destruction always sorted
Epic: none Release note: none
1 parent 843478c commit 934a86d

File tree

6 files changed

+73
-52
lines changed

6 files changed

+73
-52
lines changed

pkg/kv/kvserver/client_merge_test.go

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3957,18 +3957,17 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
39573957
sstFile := &storage.MemObject{}
39583958
sst := storage.MakeIngestionSSTWriter(ctx, st, sstFile)
39593959
defer sst.Close()
3960-
{
3961-
// The snapshot code uses ClearRangeWithHeuristic with a threshold of
3962-
// 1 to clear the range, but it will truncate the range tombstone to
3963-
// the first key. In this case, the first key is RangeGCThresholdKey,
3964-
// which doesn't yet exist in the engine, so we set it manually.
3965-
sl := rditer.Select(rangeID, rditer.SelectOpts{
3966-
ReplicatedByRangeID: true,
3967-
})
3968-
require.Len(t, sl, 1)
3969-
s := sl[0]
3970-
require.NoError(t, sst.ClearRawRange(keys.RangeGCThresholdKey(rangeID), s.EndKey, true, false))
3971-
}
3960+
// The snapshot code uses ClearRangeWithHeuristic with a threshold of 1
3961+
// to clear the range, but it will truncate the range tombstone to the
3962+
// first key. In this case, the first key is RangeGCThresholdKey, which
3963+
// doesn't yet exist in the engine, so we set it manually.
3964+
//
3965+
// The deletion also extends to the RangeTombstoneKey.
3966+
require.NoError(t, sst.ClearRawRange(
3967+
keys.RangeGCThresholdKey(rangeID),
3968+
keys.RangeTombstoneKey(rangeID),
3969+
true, false,
3970+
))
39723971
require.NoError(t, kvstorage.MakeStateLoader(rangeID).SetRangeTombstone(
39733972
context.Background(), &sst,
39743973
kvserverpb.RangeTombstone{NextReplicaID: math.MaxInt32},

pkg/kv/kvserver/kvstorage/destroy.go

Lines changed: 57 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"context"
1010
"math"
1111

12+
"github.com/cockroachdb/cockroach/pkg/keys"
1213
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
1314
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
1415
"github.com/cockroachdb/cockroach/pkg/roachpb"
@@ -77,14 +78,17 @@ type DestroyReplicaInfo struct {
7778
// DestroyReplica destroys the entirety of the replica's state in storage, and
7879
// installs a RangeTombstone in its place. It handles both uninitialized and
7980
// initialized replicas uniformly.
81+
//
82+
// This call issues all writes ordered by key. This is to support a large
83+
// variety of uses, including SSTable generation for snapshot application.
8084
func DestroyReplica(
8185
ctx context.Context,
8286
reader storage.Reader,
8387
writer storage.Writer,
8488
info DestroyReplicaInfo,
8589
next roachpb.ReplicaID,
8690
) error {
87-
return destroyReplicaImpl(ctx, reader, writer, info, next, false /* forceSortedKeys */)
91+
return destroyReplicaImpl(ctx, reader, writer, info, next)
8892
}
8993

9094
func destroyReplicaImpl(
@@ -93,7 +97,6 @@ func destroyReplicaImpl(
9397
writer storage.Writer,
9498
info DestroyReplicaInfo,
9599
next roachpb.ReplicaID,
96-
forceSortedKeys bool,
97100
) error {
98101
if next <= info.ReplicaID {
99102
return errors.AssertionFailedf("%v must not survive its own tombstone", info.FullReplicaID)
@@ -114,62 +117,82 @@ func destroyReplicaImpl(
114117
info.FullReplicaID, ts.NextReplicaID, next)
115118
}
116119

117-
_ = DestroyReplicaTODO // 2.1 + 2.2 + 3.1
118-
// NB: if forceSortedKeys is true, disable the heuristic and force deletion by
119-
// range keys. This call can be used for generating SSTables when ingesting a
120-
// snapshot, which requires Clears and Puts to be written in key order. Since
121-
// we write RangeTombstoneKey further down, ensure that this is the only point
122-
// key here.
123-
pointKeyThreshold := ClearRangeThresholdPointKeys()
124-
if forceSortedKeys {
125-
pointKeyThreshold = 1
120+
_ = DestroyReplicaTODO
121+
// Clear all range data in sorted order of engine keys. This call can be used
122+
// for generating SSTables when ingesting a snapshot, which requires Clears
123+
// and Puts to be written in key order.
124+
//
125+
// All the code below is equivalent to clearing all the spans that the
126+
// following SelectOpts represents, and leaving only the RangeTombstoneKey
127+
// populated with the specified NextReplicaID.
128+
if buildutil.CrdbTestBuild {
129+
_ = rditer.SelectOpts{
130+
ReplicatedByRangeID: true,
131+
UnreplicatedByRangeID: true,
132+
Ranged: rditer.SelectAllRanged(info.Keys),
133+
}
134+
}
135+
136+
// First, clear all RangeID-local replicated keys. Also, include all
137+
// RangeID-local unreplicated keys < RangeTombstoneKey as a drive-by, since we
138+
// decided that these (currently none) belong to the state machine engine.
139+
tombstoneKey := keys.RangeTombstoneKey(info.RangeID)
140+
if err := storage.ClearRangeWithHeuristic(
141+
ctx, reader, writer,
142+
keys.MakeRangeIDReplicatedPrefix(info.RangeID),
143+
tombstoneKey,
144+
ClearRangeThresholdPointKeys(),
145+
); err != nil {
146+
return err
147+
}
148+
// Save a tombstone to ensure that replica IDs never get reused.
149+
if err := sl.SetRangeTombstone(ctx, writer, kvserverpb.RangeTombstone{
150+
NextReplicaID: next, // NB: NextReplicaID > 0
151+
}); err != nil {
152+
return err
126153
}
127-
// TODO(pav-kv): decompose this loop to delete raft and state machine keys
128-
// separately. The main challenge is the unreplicated span because it is
129-
// scattered across 2 engines.
154+
// Clear the rest of the RangeID-local unreplicated keys.
155+
// TODO(pav-kv): decompose this further to delete raft and state machine keys
156+
// separately. Currently, all the remaining keys in this span belong to the
157+
// raft engine except the RaftReplicaID.
158+
if err := storage.ClearRangeWithHeuristic(
159+
ctx, reader, writer,
160+
tombstoneKey.Next(),
161+
keys.MakeRangeIDUnreplicatedPrefix(info.RangeID).PrefixEnd(),
162+
ClearRangeThresholdPointKeys(),
163+
); err != nil {
164+
return err
165+
}
166+
// Finally, clear all the user keys (MVCC keyspace and the corresponding
167+
// system and lock table keys), if info.Keys is not empty.
130168
for _, span := range rditer.Select(info.RangeID, rditer.SelectOpts{
131-
UnreplicatedByRangeID: true,
132-
ReplicatedByRangeID: true,
133-
Ranged: rditer.SelectAllRanged(info.Keys),
169+
Ranged: rditer.SelectAllRanged(info.Keys),
134170
}) {
135171
if err := storage.ClearRangeWithHeuristic(
136-
ctx, reader, writer, span.Key, span.EndKey, pointKeyThreshold,
172+
ctx, reader, writer, span.Key, span.EndKey, ClearRangeThresholdPointKeys(),
137173
); err != nil {
138174
return err
139175
}
140176
}
141-
142-
// Save a tombstone to ensure that replica IDs never get reused.
143-
_ = DestroyReplicaTODO // 2.3
144-
return sl.SetRangeTombstone(ctx, writer, kvserverpb.RangeTombstone{
145-
NextReplicaID: next, // NB: NextReplicaID > 0
146-
})
177+
return nil
147178
}
148179

149180
// SubsumeReplica is like DestroyReplica, but it does not delete the user keys
150181
// (and the corresponding system and lock table keys). The latter are inherited
151182
// by the subsuming range.
152183
//
153-
// The forceSortedKeys flag, if true, forces the writes to be generated in the
154-
// sorted order of keys. This is to support feeding the writes from this
155-
// function to SSTables, in the snapshot ingestion path.
156-
//
157184
// Returns SelectOpts which can be used to reflect on the key spans that this
158185
// function clears.
159186
// TODO(pav-kv): get rid of SelectOpts.
160187
func SubsumeReplica(
161-
ctx context.Context,
162-
reader storage.Reader,
163-
writer storage.Writer,
164-
info DestroyReplicaInfo,
165-
forceSortedKeys bool,
188+
ctx context.Context, reader storage.Reader, writer storage.Writer, info DestroyReplicaInfo,
166189
) (rditer.SelectOpts, error) {
167190
// Forget about the user keys.
168191
info.Keys = roachpb.RSpan{}
169192
return rditer.SelectOpts{
170193
ReplicatedByRangeID: true,
171194
UnreplicatedByRangeID: true,
172-
}, destroyReplicaImpl(ctx, reader, writer, info, MergedTombstoneReplicaID, forceSortedKeys)
195+
}, destroyReplicaImpl(ctx, reader, writer, info, MergedTombstoneReplicaID)
173196
}
174197

175198
// RemoveStaleRHSFromSplit removes all data for the RHS replica of a split. This

pkg/kv/kvserver/kvstorage/testdata/TestDestroyReplica.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Put: "a"/Intent/00000000-0000-0000-0000-000000000000:
1919
Put: 0.000000001,0 "y\xff" (0x79ff00000000000000000109): ""
2020
Put: "y\xff"/Intent/00000000-0000-0000-0000-000000000000:
2121
>> destroy:
22-
Delete (Sized at 31): 0,0 /Local/RangeID/123/u/RangeTombstone (0x0169f67b757266746200):
22+
Put: 0,0 /Local/RangeID/123/u/RangeTombstone (0x0169f67b757266746200): next_replica_id:4
2323
Delete (Sized at 39): 0,0 /Local/RangeID/123/u/RaftHardState (0x0169f67b757266746800):
2424
Delete (Sized at 43): 0,0 /Local/RangeID/123/u/RaftLog/logIndex:11 (0x0169f67b757266746c000000000000000b00):
2525
Delete (Sized at 43): 0,0 /Local/RangeID/123/u/RaftLog/logIndex:12 (0x0169f67b757266746c000000000000000c00):
@@ -35,4 +35,3 @@ Delete (Sized at 26): /Local/Lock"a" 0300000000000000000000000000000000 (0x017a6
3535
Delete (Sized at 27): /Local/Lock"y\xff" 0300000000000000000000000000000000 (0x017a6b1279ff000100030000000000000000000000000000000012):
3636
Delete (Sized at 11): 0.000000001,0 "a" (0x6100000000000000000109):
3737
Delete (Sized at 12): 0.000000001,0 "y\xff" (0x79ff00000000000000000109):
38-
Put: 0,0 /Local/RangeID/123/u/RangeTombstone (0x0169f67b757266746200): next_replica_id:4

pkg/kv/kvserver/replica_app_batch.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,7 @@ func (b *replicaAppBatch) runPostAddTriggersReplicaOnly(
360360
rhsRepl.readOnlyCmdMu.Unlock()
361361

362362
if _, err := kvstorage.SubsumeReplica(
363-
ctx, b.batch, b.batch, rhsRepl.destroyInfoRaftMuLocked(), false, /* forceSortedKeys */
363+
ctx, b.batch, b.batch, rhsRepl.destroyInfoRaftMuLocked(),
364364
); err != nil {
365365
return errors.Wrapf(err, "unable to subsume replica before merge")
366366
}

pkg/kv/kvserver/snapshot_apply_prepare.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ func (s *snapWriteBuilder) clearSubsumedReplicaDiskData(ctx context.Context) err
149149
for _, sub := range s.subsume {
150150
// We have to create an SST for the subsumed replica's range-id local keys.
151151
if err := s.writeSST(ctx, func(ctx context.Context, w storage.Writer) error {
152-
opts, err := kvstorage.SubsumeReplica(ctx, reader, w, sub, true /* forceSortedKeys */)
152+
opts, err := kvstorage.SubsumeReplica(ctx, reader, w, sub)
153153
s.cleared = append(s.cleared, rditer.Select(sub.RangeID, opts)...)
154154
return err
155155
}); err != nil {

pkg/kv/kvserver/testdata/TestPrepareSnapApply.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ Put: 0,0 /Local/RangeID/123/u/RaftHardState (0x0169f67b757266746800): term:20 vo
66
Put: 0,0 /Local/RangeID/123/u/RaftReplicaID (0x0169f67b757266747200): replica_id:4
77
Put: 0,0 /Local/RangeID/123/u/RaftTruncatedState (0x0169f67b757266747400): index:100 term:20
88
>> sst:
9-
Delete Range: [0,0 /Local/RangeID/101/u/RaftReplicaID (0x0169ed757266747200): , 0,0 /Local/RangeID/101/v"" (0x0169ed7600): )
109
Put: 0,0 /Local/RangeID/101/u/RangeTombstone (0x0169ed757266746200): next_replica_id:2147483647
10+
Delete (Sized at 30): 0,0 /Local/RangeID/101/u/RaftReplicaID (0x0169ed757266747200):
1111
>> sst:
12-
Delete Range: [0,0 /Local/RangeID/102/u/RaftReplicaID (0x0169ee757266747200): , 0,0 /Local/RangeID/102/v"" (0x0169ee7600): )
1312
Put: 0,0 /Local/RangeID/102/u/RangeTombstone (0x0169ee757266746200): next_replica_id:2147483647
13+
Delete (Sized at 30): 0,0 /Local/RangeID/102/u/RaftReplicaID (0x0169ee757266747200):
1414
>> sst:
1515
Delete (Sized at 27): /Local/Lock"y\xff" 0300000000000000000000000000000000 (0x017a6b1279ff000100030000000000000000000000000000000012):
1616
>> sst:

0 commit comments

Comments
 (0)