Skip to content

Commit a66397e

Browse files
authored
Merge pull request #157137 from tbg/blathers/backport-release-25.4-156620
2 parents cbc05c2 + c12ac13 commit a66397e

File tree

1 file changed

+28
-24
lines changed

1 file changed

+28
-24
lines changed

pkg/cmd/roachtest/tests/multi_store_remove.go

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,14 @@ import (
1212
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
1313
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
1414
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
15+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
1516
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
1617
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
1718
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
19+
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
1820
"github.com/cockroachdb/cockroach/pkg/util/retry"
19-
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
2021
"github.com/cockroachdb/errors"
22+
"github.com/stretchr/testify/require"
2123
)
2224

2325
const (
@@ -67,6 +69,7 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
6769
)
6870

6971
c.Start(ctx, t.L(), startOpts, startSettings, c.Range(1, 3))
72+
require.NoError(t, roachtestutil.WaitFor3XReplication(ctx, t.L(), c.Conn(ctx, t.L(), 1)))
7073

7174
// Confirm that there are 6 stores live.
7275
t.Status("store setup")
@@ -118,6 +121,9 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
118121
if err := c.StartE(ctx, t.L(), startOpts, startSettings, node); err != nil {
119122
t.Fatalf("restarting node: %s", err)
120123
}
124+
// Example: with 12 stores per node, removing n1's last store means we removed
125+
// s12.
126+
removedStoreID := multiStoreStoresPerNode
121127

122128
// Wait for the store to be marked as dead.
123129
t.Status("awaiting store death")
@@ -135,32 +141,30 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
135141
t.Fatalf("awaiting store death: %s", err)
136142
}
137143

138-
// Wait for up-replication.
139-
// NOTE: At the time of writing, under-replicated ranges are computed using
140-
// node liveness, rather than store liveness, so we instead compare the range
141-
// count to the current per-store replica count to compute whether there
142-
// are still under-replicated ranges from the dead store.
143-
// TODO(travers): Once #123561 is solved, re-work this.
144-
t.Status("awaiting up-replication")
145-
tStart := timeutil.Now()
146-
var oldReplicas int
144+
// Wait until no ranges remain on the removed store.
145+
t.Status("awaiting range relocation")
146+
sr := sqlutils.MakeSQLRunner(conn)
147147
for {
148-
var ranges, replicas int
149-
if err := conn.QueryRowContext(ctx,
150-
`SELECT
151-
(SELECT count(1) FROM crdB_internal.ranges) AS ranges
152-
, (SELECT sum(range_count) FROM crdb_internal.kv_store_status) AS replicas`,
153-
).Scan(&ranges, &replicas); err != nil {
154-
t.Fatalf("replication status: %s", err)
155-
}
156-
if replicas == 3*ranges {
157-
t.L().Printf("up-replication complete")
148+
res := sr.QueryStr(t, `
149+
SELECT
150+
(SELECT count(*) FROM crdb_internal.ranges_no_leases
151+
WHERE $1::INT = ANY(replicas)) AS cnt,
152+
ARRAY(
153+
SELECT range_id
154+
FROM crdb_internal.ranges_no_leases
155+
WHERE $1::INT = ANY(replicas)
156+
ORDER BY range_id
157+
LIMIT 10) AS sample;
158+
`, removedStoreID)
159+
cnt := res[0][0]
160+
sample := res[0][1]
161+
162+
if cnt == "0" {
163+
t.L().Printf("all ranges moved off removed store")
158164
break
159165
}
160-
if timeutil.Since(tStart) > 30*time.Second || oldReplicas != replicas {
161-
t.L().Printf("still waiting for replication (%d / %d)", replicas, 3*ranges)
162-
}
163-
oldReplicas = replicas
166+
167+
t.L().Printf("%s ranges remain on removed store; for example: %s", cnt, sample)
164168
time.Sleep(5 * time.Second)
165169
}
166170
t.Status("done")

0 commit comments

Comments
 (0)