Skip to content

Commit 838ad72

Browse files
committed
roachtest: deflake multi-store-remove
The test was checking on replication by asserting that the number of ranges times the replication factor is equal to the number of replicas. Replicas can persist after no longer being part of a range (waiting for replicaGC), which is why this test was flaky. I first tried to fix it by lowering the replicaGC queue's interval at which it checks ranges (defaults to 12h), but looking more closely at what the test was doing, I decided to change that instead by no longer caring about what replicas each store reports. What matters is what the meta ranges report, and now the test checks replication similar to how many other tests do, by querying ranges_no_leases. Fixes #146435. Fixes #147763. Fixes #156447. Epic: none
1 parent 002acf9 commit 838ad72

File tree

1 file changed

+28
-23
lines changed

1 file changed

+28
-23
lines changed

pkg/cmd/roachtest/tests/multi_store_remove.go

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,15 @@ import (
1212
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
1313
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
1414
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
15+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
1516
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
1617
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
1718
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
19+
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
1820
"github.com/cockroachdb/cockroach/pkg/util/retry"
1921
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
2022
"github.com/cockroachdb/errors"
23+
"github.com/stretchr/testify/require"
2124
)
2225

2326
const (
@@ -67,6 +70,7 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
6770
)
6871

6972
c.Start(ctx, t.L(), startOpts, startSettings, c.Range(1, 3))
73+
require.NoError(t, roachtestutil.WaitFor3XReplication(ctx, t.L(), c.Conn(ctx, t.L(), 1)))
7074

7175
// Confirm that there are 6 stores live.
7276
t.Status("store setup")
@@ -118,6 +122,9 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
118122
if err := c.StartE(ctx, t.L(), startOpts, startSettings, node); err != nil {
119123
t.Fatalf("restarting node: %s", err)
120124
}
125+
// Example: with 12 stores per node, since we removed n1's last one, we did remove
126+
// s12.
127+
removedStoreID := multiStoreStoresPerNode
121128

122129
// Wait for the store to be marked as dead.
123130
t.Status("awaiting store death")
@@ -135,32 +142,30 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
135142
t.Fatalf("awaiting store death: %s", err)
136143
}
137144

138-
// Wait for up-replication.
139-
// NOTE: At the time of writing, under-replicated ranges are computed using
140-
// node liveness, rather than store liveness, so we instead compare the range
141-
// count to the current per-store replica count to compute whether all there
142-
// are still under-replicated ranges from the dead store.
143-
// TODO(travers): Once #123561 is solved, re-work this.
144-
t.Status("awaiting up-replication")
145-
tStart := timeutil.Now()
146-
var oldReplicas int
145+
// Wait until no ranges remain on the removed store.
146+
t.Status("awaiting range relocation")
147+
sr := sqlutils.MakeSQLRunner(conn)
147148
for {
148-
var ranges, replicas int
149-
if err := conn.QueryRowContext(ctx,
150-
`SELECT
151-
(SELECT count(1) FROM crdB_internal.ranges) AS ranges
152-
, (SELECT sum(range_count) FROM crdb_internal.kv_store_status) AS replicas`,
153-
).Scan(&ranges, &replicas); err != nil {
154-
t.Fatalf("replication status: %s", err)
155-
}
156-
if replicas == 3*ranges {
157-
t.L().Printf("up-replication complete")
149+
res := sr.QueryStr(t, `
150+
SELECT
151+
(SELECT count(*) FROM crdb_internal.ranges_no_leases
152+
WHERE $1::INT = ANY(replicas)) AS cnt,
153+
ARRAY(
154+
SELECT range_id
155+
FROM crdb_internal.ranges_no_leases
156+
WHERE $1::INT = ANY(replicas)
157+
ORDER BY range_id
158+
LIMIT 10) AS sample;
159+
`, removedStoreID)
160+
cnt := res[0][0]
161+
sample := res[0][1]
162+
163+
if cnt == "0" {
164+
t.L().Printf("all ranges moved off removed store")
158165
break
159166
}
160-
if timeutil.Since(tStart) > 30*time.Second || oldReplicas != replicas {
161-
t.L().Printf("still waiting for replication (%d / %d)", replicas, 3*ranges)
162-
}
163-
oldReplicas = replicas
167+
168+
t.L().Printf("%s ranges remain on removed store; for example: %s", cnt, sample)
164169
time.Sleep(5 * time.Second)
165170
}
166171
t.Status("done")

0 commit comments

Comments
 (0)