Skip to content

Commit 792292c

Browse files
craig[bot]pav-kvandy-kimball
committed
155549: kvstorage: make replica destruction always sorted r=arulajmani a=pav-kv This PR refactors replica destruction helpers in `kvstorage` (`DestroyReplica` and `SubsumeReplica`) so that they always generate storage writes ordered by engine key. This is to support the most restrictive user of `SubsumeReplica`: `prepareSnapApply` feeds these writes into SSTables (which assert on the keys order), part of snapshot ingestion path. Other byproducts of this PR: - `ClearRangeData` and its options are removed due to being redundant and not flexible. - A decision is made to split the unreplicated RangeID-local keyspace into "state machine" and "raft", separated by the `RangeTombstone` key. This leaves us with only one exception: `RaftReplicaID` key in the middle of the "raft" keyspace is said to be a "state machine" key. We can migrate it to the correct half later, together with other storage migrations. - The loop unwinding in `destroyReplicaImpl` brings us very close to logical separation of engines in this function. We now only need to handle the `RaftReplicaID` key correctly. - The heuristic from `ClearRangeWithHeuristic` is now always used with the same threshold. There are no forced `ClearRawRange`s in the snapshot path anymore, unless opted in by tests. Related to #152845 156334: sql: add read/write metrics r=yuzefovich a=andy-kimball ### sql: add index_bytes_written metric Add new `sql.statements.index_bytes_written.count` metric that counts the number of primary and secondary index bytes modified by SQL statements. ### sql: add index_rows_written metric Add new `sql.statements.index_rows_written.count` metric that counts the number of primary and secondary index rows modified by SQL statements. ### sql: add bytes_read metric Add new `sql.statements.bytes_read.count` metric that counts the number of bytes scanned by SQL statements. This is the same value that's collected by SQL stats for each statement and transaction, except in aggregated metric form. Co-authored-by: Pavel Kalinnikov <pavel@cockroachlabs.com> Co-authored-by: Andrew Kimball <andyk@cockroachlabs.com>
3 parents cb48491 + 886c4d6 + 4beb167 commit 792292c

29 files changed

+564
-270
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8747,17 +8747,65 @@ layers:
87478747
unit: COUNT
87488748
aggregation: AVG
87498749
derivative: NON_NEGATIVE_DERIVATIVE
8750+
- name: sql.statements.bytes_read.count
8751+
exported_name: sql_statements_bytes_read_count
8752+
description: Number of bytes read by SQL statements
8753+
y_axis_label: SQL Statements
8754+
type: COUNTER
8755+
unit: BYTES
8756+
aggregation: AVG
8757+
derivative: NON_NEGATIVE_DERIVATIVE
8758+
- name: sql.statements.bytes_read.count.internal
8759+
exported_name: sql_statements_bytes_read_count_internal
8760+
description: Number of bytes read by SQL statements (internal queries)
8761+
y_axis_label: SQL Internal Statements
8762+
type: COUNTER
8763+
unit: BYTES
8764+
aggregation: AVG
8765+
derivative: NON_NEGATIVE_DERIVATIVE
8766+
- name: sql.statements.index_bytes_written.count
8767+
exported_name: sql_statements_index_bytes_written_count
8768+
description: Number of primary and secondary index bytes modified by SQL statements
8769+
y_axis_label: SQL Statements
8770+
type: COUNTER
8771+
unit: BYTES
8772+
aggregation: AVG
8773+
derivative: NON_NEGATIVE_DERIVATIVE
8774+
- name: sql.statements.index_bytes_written.count.internal
8775+
exported_name: sql_statements_index_bytes_written_count_internal
8776+
description: Number of primary and secondary index bytes modified by SQL statements (internal queries)
8777+
y_axis_label: SQL Internal Statements
8778+
type: COUNTER
8779+
unit: BYTES
8780+
aggregation: AVG
8781+
derivative: NON_NEGATIVE_DERIVATIVE
8782+
- name: sql.statements.index_rows_written.count
8783+
exported_name: sql_statements_index_rows_written_count
8784+
description: Number of primary and secondary index rows modified by SQL statements
8785+
y_axis_label: SQL Statements
8786+
type: COUNTER
8787+
unit: COUNT
8788+
aggregation: AVG
8789+
derivative: NON_NEGATIVE_DERIVATIVE
8790+
- name: sql.statements.index_rows_written.count.internal
8791+
exported_name: sql_statements_index_rows_written_count_internal
8792+
description: Number of primary and secondary index rows modified by SQL statements (internal queries)
8793+
y_axis_label: SQL Internal Statements
8794+
type: COUNTER
8795+
unit: COUNT
8796+
aggregation: AVG
8797+
derivative: NON_NEGATIVE_DERIVATIVE
87508798
- name: sql.statements.rows_read.count
87518799
exported_name: sql_statements_rows_read_count
8752-
description: Number of rows read by SQL statements from primary and secondary indexes
8800+
description: Number of rows read by SQL statements
87538801
y_axis_label: SQL Statements
87548802
type: COUNTER
87558803
unit: COUNT
87568804
aggregation: AVG
87578805
derivative: NON_NEGATIVE_DERIVATIVE
87588806
- name: sql.statements.rows_read.count.internal
87598807
exported_name: sql_statements_rows_read_count_internal
8760-
description: Number of rows read by SQL statements from primary and secondary indexes (internal queries)
8808+
description: Number of rows read by SQL statements (internal queries)
87618809
y_axis_label: SQL Internal Statements
87628810
type: COUNTER
87638811
unit: COUNT

pkg/keys/doc.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,16 @@ var schema = [...]interface{}{
192192
// pertain to just one replica of a range. They are unreplicated and
193193
// unaddressable. The typical example is the Raft log. They all share
194194
// `LocalRangeIDPrefix` and `localRangeIDUnreplicatedInfix`.
195+
//
196+
// WARNING: when adding a new key in this section, decide whether it should be
197+
// classified as "raft" or "state machine" key, correspondingly to which
198+
// engine it resides in:
199+
//
200+
// - keys <= RangeTombstoneKey in this prefix are "state machine" engine keys
201+
// - keys > RangeTombstoneKey in this prefix are "raft" engine keys
202+
// - historical exception: RaftReplicaIDKey belongs to the state machine
203+
//
204+
// Failure to classify may result in replica state corruption in storage.
195205
localRangeIDUnreplicatedInfix, // "u"
196206
RangeTombstoneKey, // "rftb"
197207
RaftHardStateKey, // "rfth"

pkg/kv/kvserver/client_merge_test.go

Lines changed: 46 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -3815,6 +3815,10 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
38153815
defer leaktest.AfterTest(t)()
38163816
defer log.Scope(t).Close(t)
38173817

3818+
// The test expects replica destruction paths to use ranged clears rather than
3819+
// point clears.
3820+
defer kvstorage.TestingForceClearRange()()
3821+
38183822
testutils.RunTrueAndFalse(t, "rebalanceRHSAway", func(t *testing.T, rebalanceRHSAway bool) {
38193823
// We will be testing the SSTs written on store2's engine.
38203824
var receivingEng, sendingEng storage.Engine
@@ -3852,11 +3856,7 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
38523856
// NOTE: There are no range-local keys or lock table keys, in [d,
38533857
// /Max) in the store we're sending a snapshot to, so we aren't
38543858
// expecting SSTs to clear those keys.
3855-
expectedSSTCount := 9
3856-
if len(sstNames) != expectedSSTCount {
3857-
return errors.Errorf("expected to ingest %d SSTs, got %d SSTs",
3858-
expectedSSTCount, len(sstNames))
3859-
}
3859+
require.Len(t, sstNames, 9)
38603860

38613861
// Only try to predict SSTs for:
38623862
// - The user keys in the snapshot
@@ -3900,11 +3900,11 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
39003900
file := &storage.MemObject{}
39013901
writer := storage.MakeIngestionSSTWriter(ctx, st, file)
39023902
if i < len(keySpans)-1 {
3903-
// The last span is the MVCC span, and is always cleared via
3904-
// Excise. See MultiSSTWriter.
3905-
if err := writer.ClearRawRange(span.Key, span.EndKey, true /* pointKeys */, true /* rangeKeys */); err != nil {
3906-
return err
3907-
}
3903+
// The last span is the MVCC span, and is always cleared via Excise.
3904+
// See MultiSSTWriter.
3905+
require.NoError(t, writer.ClearRawRange(
3906+
span.Key, span.EndKey, true /* pointKeys */, true, /* rangeKeys */
3907+
))
39083908
}
39093909
sstFileWriters[string(span.Key)] = sstFileWriter{
39103910
span: span,
@@ -3913,7 +3913,7 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
39133913
}
39143914
}
39153915

3916-
if err := rditer.IterateReplicaKeySpans(ctx, inSnap.Desc, snapReader, rditer.SelectOpts{
3916+
require.NoError(t, rditer.IterateReplicaKeySpans(ctx, inSnap.Desc, snapReader, rditer.SelectOpts{
39173917
Ranged: rditer.SelectRangedOptions{
39183918
SystemKeys: true,
39193919
LockTable: true,
@@ -3923,43 +3923,27 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
39233923
UnreplicatedByRangeID: false,
39243924
}, func(iter storage.EngineIterator, span roachpb.Span) error {
39253925
fw, ok := sstFileWriters[string(span.Key)]
3926-
if !ok || !fw.span.Equal(span) {
3927-
return errors.Errorf("unexpected span %s", span)
3928-
}
3926+
require.True(t, ok)
3927+
require.Truef(t, fw.span.Equal(span), "unexpected span %s", span)
39293928
var err error
39303929
for ok := true; ok && err == nil; ok, err = iter.NextEngineKey() {
3931-
var key storage.EngineKey
3932-
if key, err = iter.UnsafeEngineKey(); err != nil {
3933-
return err
3934-
}
3930+
key, err := iter.UnsafeEngineKey()
3931+
require.NoError(t, err)
39353932
v, err := iter.UnsafeValue()
3936-
if err != nil {
3937-
return err
3938-
}
3939-
if err := fw.writer.PutEngineKey(key, v); err != nil {
3940-
return err
3941-
}
3942-
}
3943-
if err != nil {
3944-
return err
3933+
require.NoError(t, err)
3934+
require.NoError(t, fw.writer.PutEngineKey(key, v))
39453935
}
3936+
require.NoError(t, err)
39463937
return nil
3947-
}); err != nil {
3948-
return err
3949-
}
3938+
}))
39503939

39513940
for _, span := range keySpans {
39523941
fw := sstFileWriters[string(span.Key)]
3953-
if err := fw.writer.Finish(); err != nil {
3954-
return err
3955-
}
3942+
require.NoError(t, fw.writer.Finish())
39563943
expectedSSTs = append(expectedSSTs, fw.file.Data())
39573944
}
39583945

3959-
if len(expectedSSTs) != 5 {
3960-
return errors.Errorf("len of expectedSSTs should expected to be %d, but got %d",
3961-
5, len(expectedSSTs))
3962-
}
3946+
require.Len(t, expectedSSTs, 5)
39633947
// Keep the last one which contains the user keys.
39643948
expectedSSTs = expectedSSTs[len(expectedSSTs)-1:]
39653949

@@ -3973,23 +3957,24 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
39733957
sstFile := &storage.MemObject{}
39743958
sst := storage.MakeIngestionSSTWriter(ctx, st, sstFile)
39753959
defer sst.Close()
3960+
// The snapshot code uses ClearRangeWithHeuristic with a threshold of 1
3961+
// to clear the range, but it will truncate the range tombstone to the
3962+
// first key. In this case, the first key is RangeGCThresholdKey, which
3963+
// doesn't yet exist in the engine, so we set it manually.
3964+
//
3965+
// The deletion also extends to the RangeTombstoneKey.
3966+
require.NoError(t, sst.ClearRawRange(
3967+
keys.RangeGCThresholdKey(rangeID),
3968+
keys.RangeTombstoneKey(rangeID),
3969+
true, false,
3970+
))
3971+
require.NoError(t, kvstorage.MakeStateLoader(rangeID).SetRangeTombstone(
3972+
context.Background(), &sst,
3973+
kvserverpb.RangeTombstone{NextReplicaID: math.MaxInt32},
3974+
))
39763975
{
3977-
// The snapshot code will use ClearRangeWithHeuristic with a
3978-
// threshold of 1 to clear the range, but this will truncate
3979-
// the range tombstone to the first key. In this case, the
3980-
// first key is RangeGCThresholdKey, which doesn't yet exist
3981-
// in the engine, so we write the Pebble range tombstone
3982-
// manually.
3983-
sl := rditer.Select(rangeID, rditer.SelectOpts{
3984-
ReplicatedByRangeID: true,
3985-
})
3986-
require.Len(t, sl, 1)
3987-
s := sl[0]
3988-
require.NoError(t, sst.ClearRawRange(keys.RangeGCThresholdKey(rangeID), s.EndKey, true, false))
3989-
}
3990-
{
3991-
// Ditto for the unreplicated version, where the first key
3992-
// happens to be the HardState.
3976+
// Ditto for the unreplicated version, where the first key happens to
3977+
// be the HardState.
39933978
sl := rditer.Select(rangeID, rditer.SelectOpts{
39943979
UnreplicatedByRangeID: true,
39953980
})
@@ -3998,15 +3983,7 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
39983983
require.NoError(t, sst.ClearRawRange(keys.RaftHardStateKey(rangeID), s.EndKey, true, false))
39993984
}
40003985

4001-
if err := kvstorage.MakeStateLoader(rangeID).SetRangeTombstone(
4002-
context.Background(), &sst,
4003-
kvserverpb.RangeTombstone{NextReplicaID: math.MaxInt32},
4004-
); err != nil {
4005-
return err
4006-
}
4007-
if err := sst.Finish(); err != nil {
4008-
return err
4009-
}
3986+
require.NoError(t, sst.Finish())
40103987
expectedSSTs = append(expectedSSTs, sstFile.Data())
40113988
}
40123989

@@ -4018,23 +3995,20 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
40183995
StartKey: roachpb.RKey(keyD),
40193996
EndKey: roachpb.RKey(keyEnd),
40203997
}
4021-
if err := storage.ClearRangeWithHeuristic(
4022-
ctx, receivingEng, &sst, desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey(), 64,
4023-
); err != nil {
4024-
return err
4025-
} else if err := sst.Finish(); err != nil {
4026-
return err
4027-
}
3998+
require.NoError(t, storage.ClearRangeWithHeuristic(
3999+
ctx, receivingEng, &sst,
4000+
desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey(),
4001+
kvstorage.ClearRangeThresholdPointKeys(),
4002+
))
4003+
require.NoError(t, sst.Finish())
40284004
expectedSSTs = append(expectedSSTs, sstFile.Data())
40294005

40304006
// Iterate over all the tested SSTs and check that they're
40314007
// byte-by-byte equal.
40324008
var dumpDir string
40334009
for i := range sstNamesSubset {
40344010
actualSST, err := fs.ReadFile(receivingEng.Env(), sstNamesSubset[i])
4035-
if err != nil {
4036-
return err
4037-
}
4011+
require.NoError(t, err)
40384012
if !bytes.Equal(expectedSSTs[i], actualSST) { // intentionally not printing
40394013
t.Logf("%d=%s", i, sstNamesSubset[i])
40404014
if dumpDir == "" {

0 commit comments

Comments
 (0)