
Commit cb4e6f5

Merge pull request #156619 from tbg/blathers/backport-release-25.4-156540
release-25.4: kvserver: deflake and improve obs in TestPromoteNonVoterInAddVoter
2 parents: 19e2a6e + d52e7e7

3 files changed (+67, −36 lines)

pkg/kv/kvserver/lease_queue.go

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ func (lq *leaseQueue) process(
   if transferOp, ok := change.Op.(plan.AllocationTransferLeaseOp); ok {
     lease, _ := repl.GetLease()
-    log.KvDistribution.Infof(ctx, "transferring lease to s%d usage=%v, lease=[%v type=%v]", transferOp.Target, transferOp.Usage, lease, lease.Type())
+    log.KvDistribution.Infof(ctx, "transferring lease to %d usage=%v, lease=[%v type=%v]", transferOp.Target, transferOp.Usage, lease, lease.Type())
     lq.lastLeaseTransfer.Store(timeutil.Now())
     changeID := lq.as.NonMMAPreTransferLease(
       desc,
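
An aside on the format-verb fix above: whether a literal "s" prefix belongs in the format string depends on how the target type renders under each verb. A minimal, self-contained sketch — with a hypothetical ID type, not the actual CockroachDB definitions — of how a Stringer-backed ID interacts with %d and %v:

package main

import "fmt"

// storeID is a hypothetical ID type whose String() carries its own "s" prefix.
type storeID int

func (s storeID) String() string { return fmt.Sprintf("s%d", int(s)) }

func main() {
	target := storeID(5)
	fmt.Printf("transferring lease to s%d\n", target) // %d ignores String(): prints "s5"
	fmt.Printf("transferring lease to %v\n", target)  // %v uses String(): prints "s5"
	fmt.Printf("transferring lease to s%v\n", target) // double prefix: prints "ss5"
}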

pkg/kv/kvserver/replicate_queue_test.go

Lines changed: 63 additions & 32 deletions
@@ -41,7 +41,6 @@ import (
   "github.com/cockroachdb/cockroach/pkg/server/serverpb"
   "github.com/cockroachdb/cockroach/pkg/settings/cluster"
   "github.com/cockroachdb/cockroach/pkg/spanconfig"
-  "github.com/cockroachdb/cockroach/pkg/spanconfig/spanconfigstore"
   "github.com/cockroachdb/cockroach/pkg/sql/catalog/bootstrap"
   "github.com/cockroachdb/cockroach/pkg/testutils"
   "github.com/cockroachdb/cockroach/pkg/testutils/skip"
@@ -618,7 +617,7 @@ func checkReplicaCount(
   rangeDesc *roachpb.RangeDescriptor,
   voterCount, nonVoterCount int,
 ) (bool, error) {
-  err := forceScanOnAllReplicationQueues(tc)
+  err := forceScanOnAllReplicationAndSplitQueues(tc)
   if err != nil {
     log.KvDistribution.Infof(ctx, "store.ForceReplicationScanAndProcess() failed with: %s", err)
     return false, err
@@ -1544,7 +1543,7 @@ func TestReplicateQueueSwapVotersWithNonVoters(t *testing.T) {
     " num_replicas=5, num_voters=1")
   require.NoError(t, err)
   testutils.SucceedsSoon(t, func() error {
-    if err := forceScanOnAllReplicationQueues(tc); err != nil {
+    if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
       return err
     }
     scratchRange := tc.LookupRangeOrFatal(t, scratchKey)
@@ -1559,7 +1558,7 @@ func TestReplicateQueueSwapVotersWithNonVoters(t *testing.T) {

   checkRelocated := func(t *testing.T, voterStores, nonVoterStores []roachpb.StoreID) {
     testutils.SucceedsSoon(t, func() error {
-      if err := forceScanOnAllReplicationQueues(tc); err != nil {
+      if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
        return err
       }
       scratchRange := tc.LookupRangeOrFatal(t, scratchKey)
@@ -1665,7 +1664,7 @@ func TestReplicateQueueShouldQueueNonVoter(t *testing.T) {
   // Make sure that the range has conformed to the constraints we just set
   // above.
   require.Eventually(t, func() bool {
-    if err := forceScanOnAllReplicationQueues(tc); err != nil {
+    if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
       log.KvDistribution.Warningf(ctx, "received error while forcing a replicateQueue scan: %s", err)
       return false
     }
@@ -1785,13 +1784,24 @@ func toggleReplicationQueues(tc *testcluster.TestCluster, active bool) {
   }
 }

-func forceScanOnAllReplicationQueues(tc *testcluster.TestCluster) (err error) {
+func forceScanOnAllReplicationAndSplitQueues(tc *testcluster.TestCluster) (err error) {
+  // We force scans on the replication and split queues because sometimes splits
+  // are necessary to apply zone config changes, which then lead to further
+  // replication decisions. See #156530 for an example of this.
   for _, s := range tc.Servers {
-    err = s.GetStores().(*kvserver.Stores).VisitStores(func(store *kvserver.Store) error {
-      return store.ForceReplicationScanAndProcess()
-    })
+    if err := s.GetStores().(*kvserver.Stores).VisitStores(func(store *kvserver.Store) error {
+      if err := store.ForceReplicationScanAndProcess(); err != nil {
+        return err
+      }
+      if err := store.ForceSplitScanAndProcess(); err != nil {
+        return err
+      }
+      return nil
+    }); err != nil {
+      return err
+    }
   }
-  return err
+  return nil
 }

 func toggleSplitQueues(tc *testcluster.TestCluster, active bool) {
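
For context, a typical call site for the renamed helper mirrors the SucceedsSoon loops elsewhere in this diff: force both queues, then re-check the range's shape until it converges. A condensed sketch, assuming the test file's existing imports; the scratch-range assertion is illustrative, borrowed from the swap-voters test above:

// Poll until both queues have run and the range reaches the expected shape.
testutils.SucceedsSoon(t, func() error {
	if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
		return err
	}
	desc := tc.LookupRangeOrFatal(t, scratchKey)
	if voters := len(desc.Replicas().VoterDescriptors()); voters != 3 {
		return errors.Newf("expected 3 voters; got %d", voters)
	}
	return nil
})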
@@ -2189,20 +2199,13 @@ func TestPromoteNonVoterInAddVoter(t *testing.T) {
   defer testutils.StartExecTrace(t, scope.GetDirectory()).Finish(t)

   ctx := context.Background()
-  st := cluster.MakeTestingClusterSettings()
-  // NB: Ensure that tables created by the test start off on their own range.
-  // This ensures that any span config changes made by the test don't have to
-  // first induce a split, which is a known source of flakiness.
-  spanconfigstore.StorageCoalesceAdjacentSetting.Override(ctx, &st.SV, false)
-  spanconfigstore.TenantCoalesceAdjacentSetting.Override(ctx, &st.SV, false)

   // Create 7 stores: 3 in Region 1, 2 in Region 2, and 2 in Region 3.
   const numNodes = 7
   serverArgs := make(map[int]base.TestServerArgs)
   regions := [numNodes]int{1, 1, 1, 2, 2, 3, 3}
   for i := 0; i < numNodes; i++ {
     serverArgs[i] = base.TestServerArgs{
-      Settings: st,
       Locality: roachpb.Locality{
         Tiers: []roachpb.Tier{
           {
@@ -2237,7 +2240,7 @@ func TestPromoteNonVoterInAddVoter(t *testing.T) {
   setConstraintFn("RANGE meta", 7, 7, "")
   setConstraintFn("RANGE default", 7, 7, "")
   testutils.SucceedsSoon(t, func() error {
-    if err := forceScanOnAllReplicationQueues(tc); err != nil {
+    if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
       return err
     }
     s, err := sqlutils.RowsToDataDrivenOutput(sqlutils.MakeSQLRunner(tc.Conns[0]).Query(t, `
@@ -2271,36 +2274,64 @@ SELECT * FROM (
     t *testing.T,
     tc *testcluster.TestCluster,
     db *gosql.DB,
-  ) (numVoters, numNonVoters int, err error) {
-    if err := forceScanOnAllReplicationQueues(tc); err != nil {
-      return 0, 0, err
+  ) (numVoters, numNonVoters int, _ roachpb.RangeDescriptor, err error) {
+    if err := forceScanOnAllReplicationAndSplitQueues(tc); err != nil {
+      return 0, 0, roachpb.RangeDescriptor{}, err
     }

-    var rangeID roachpb.RangeID
-    if err := db.QueryRow("SELECT range_id FROM [SHOW RANGES FROM TABLE t] LIMIT 1").Scan(&rangeID); err != nil {
-      return 0, 0, err
+    var key roachpb.Key
+    require.NoError(t,
+      db.QueryRow(`
+SELECT start_key from crdb_internal.ranges_no_leases WHERE range_id IN
+(SELECT range_id FROM [SHOW RANGES FROM TABLE t] LIMIT 1);
+`).Scan(&key))
+    desc, err := tc.LookupRange(key)
+    if err != nil {
+      return 0, 0, roachpb.RangeDescriptor{}, err
     }
+
     iterateOverAllStores(t, tc, func(s *kvserver.Store) error {
-      if replica, err := s.GetReplica(rangeID); err == nil && replica.OwnsValidLease(ctx, replica.Clock().NowAsClockTimestamp()) {
+      if replica, err := s.GetReplica(desc.RangeID); err == nil && replica.OwnsValidLease(ctx,
+        replica.Clock().NowAsClockTimestamp()) {
         desc := replica.Desc()
         numVoters = len(desc.Replicas().VoterDescriptors())
         numNonVoters = len(desc.Replicas().NonVoterDescriptors())
       }
       return nil
     })
-    return numVoters, numNonVoters, nil
+    return numVoters, numNonVoters, desc, nil
   }

   // Ensure we are meeting our ZONE survival configuration.
+  logMore := time.After(15 * time.Second)
   testutils.SucceedsSoon(t, func() error {
-    numVoters, numNonVoters, err := computeNumberOfReplicas(t, tc, db)
+    numVoters, numNonVoters, desc, err := computeNumberOfReplicas(t, tc, db)
+
     require.NoError(t, err)
+    select {
+    default:
+    case <-logMore:
+      // If the retry loop has been stuck for a while, log the span config
+      // as seen by each store. For it to apply, the range's span needs to
+      // be contained in the start key's span config. If this is not the
+      // case, it can explain why the replication changes are not being made.
+      iterateOverAllStores(t, tc, func(s *kvserver.Store) error {
+        cfg, sp, err := s.GetStoreConfig().SpanConfigSubscriber.GetSpanConfigForKey(ctx, desc.StartKey)
+        if err != nil {
+          return err
+        }
+        t.Logf("s%d: r%d %s -> span config %s %s", s.StoreID(), desc.RangeID, desc.RSpan(), sp, &cfg)
+        return nil
+      })
+    }
+
     if numVoters != 3 {
-      return errors.Newf("expected 3 voters; got %d", numVoters)
+      return errors.Newf("expected 3 voters for r%d; got %d", desc.RangeID, numVoters)
     }
     if numNonVoters != 2 {
-      return errors.Newf("expected 2 non-voters; got %v", numNonVoters)
+      return errors.Newf("expected 2 non-voters for r%d; got %v", desc.RangeID, numNonVoters)
     }
+    t.Logf("success: %d has %d voters and %d non-voters", desc.RangeID, numVoters, numNonVoters)
     return nil
   })
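
The logMore construct above is a reusable deflaking aid: a one-shot timer plus a non-blocking select, so a retry loop that has been stuck past the deadline dumps extra diagnostics exactly once instead of timing out silently. Distilled to its skeleton (checkCondition is a hypothetical stand-in for the test's real voter/non-voter assertions):

logMore := time.After(15 * time.Second)
testutils.SucceedsSoon(t, func() error {
	select {
	default: // deadline not reached yet, or diagnostics already emitted; stay quiet
	case <-logMore:
		// time.After's channel delivers a single value, so this branch runs on
		// the first retry after the deadline and never again.
		t.Logf("retry loop still unsatisfied; dumping diagnostic state")
	}
	return checkCondition()
})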

@@ -2316,13 +2347,13 @@ SELECT * FROM (

   // Ensure we are meeting our REGION survival configuration.
   testutils.SucceedsSoon(t, func() error {
-    numVoters, numNonVoters, err := computeNumberOfReplicas(t, tc, db)
+    numVoters, numNonVoters, desc, err := computeNumberOfReplicas(t, tc, db)
     require.NoError(t, err)
     if numVoters != 5 {
-      return errors.Newf("expected 5 voters; got %d", numVoters)
+      return errors.Newf("expected 5 voters for r%d; got %d", desc.RangeID, numVoters)
     }
     if numNonVoters != 0 {
-      return errors.Newf("expected 0 non-voters; got %v", numNonVoters)
+      return errors.Newf("expected 0 non-voters for r%d; got %v", desc.RangeID, numNonVoters)
     }
     return nil
   })

pkg/spanconfig/spanconfigstore/span_store.go

Lines changed: 3 additions & 3 deletions
@@ -19,10 +19,10 @@ import (
   "github.com/cockroachdb/errors"
 )

-// TenantCoalesceAdjacentSetting is a public cluster setting that
+// tenantCoalesceAdjacentSetting is a public cluster setting that
 // controls whether we coalesce adjacent ranges across all secondary
 // tenant keyspaces if they have the same span config.
-var TenantCoalesceAdjacentSetting = settings.RegisterBoolSetting(
+var tenantCoalesceAdjacentSetting = settings.RegisterBoolSetting(
   settings.SystemOnly,
   "spanconfig.tenant_coalesce_adjacent.enabled",
   `collapse adjacent ranges with the same span configs across all secondary tenant keyspaces`,
@@ -157,7 +157,7 @@ func (s *spanConfigStore) computeSplitKey(
       return roachpb.RKey(match.span.Key), nil
     }
   } else {
-    if !TenantCoalesceAdjacentSetting.Get(&s.settings.SV) {
+    if !tenantCoalesceAdjacentSetting.Get(&s.settings.SV) {
       return roachpb.RKey(match.span.Key), nil
     }
   }
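
This rename complements the test change above: with the Go variable unexported, code outside the spanconfigstore package can no longer override the setting, as the test previously did. The lines removed from TestPromoteNonVoterInAddVoter would now fail to build:

// From the removed portion of the test; after this commit the identifier is
// unexported, so this no longer compiles outside spanconfigstore:
st := cluster.MakeTestingClusterSettings()
spanconfigstore.TenantCoalesceAdjacentSetting.Override(ctx, &st.SV, false)
// compile error: undefined: spanconfigstore.TenantCoalesceAdjacentSetting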
