@@ -12,12 +12,14 @@ import (
1212 "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
1313 "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
1414 "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
15+ "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
1516 "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
1617 "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
1718 "github.com/cockroachdb/cockroach/pkg/roachprod/install"
19+ "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
1820 "github.com/cockroachdb/cockroach/pkg/util/retry"
19- "github.com/cockroachdb/cockroach/pkg/util/timeutil"
2021 "github.com/cockroachdb/errors"
22+ "github.com/stretchr/testify/require"
2123)
2224
2325const (
@@ -67,6 +69,7 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
6769 )
6870
6971 c .Start (ctx , t .L (), startOpts , startSettings , c .Range (1 , 3 ))
72+ require .NoError (t , roachtestutil .WaitFor3XReplication (ctx , t .L (), c .Conn (ctx , t .L (), 1 )))
7073
7174 // Confirm that there are 6 stores live.
7275 t .Status ("store setup" )
@@ -118,6 +121,9 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
118121 if err := c .StartE (ctx , t .L (), startOpts , startSettings , node ); err != nil {
119122 t .Fatalf ("restarting node: %s" , err )
120123 }
124+ // Example: with 12 stores per node, removing n1's last store means that we
125+ // removed s12.
126+ removedStoreID := multiStoreStoresPerNode
121127
122128 // Wait for the store to be marked as dead.
123129 t .Status ("awaiting store death" )
@@ -135,32 +141,30 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
135141 t .Fatalf ("awaiting store death: %s" , err )
136142 }
137143
138- // Wait for up-replication.
139- // NOTE: At the time of writing, under-replicated ranges are computed using
140- // node liveness, rather than store liveness, so we instead compare the range
141- // count to the current per-store replica count to compute whether all there
142- // are still under-replicated ranges from the dead store.
143- // TODO(travers): Once #123561 is solved, re-work this.
144- t .Status ("awaiting up-replication" )
145- tStart := timeutil .Now ()
146- var oldReplicas int
144+ // Wait until no ranges remain on the removed store.
145+ t .Status ("awaiting range relocation" )
146+ sr := sqlutils .MakeSQLRunner (conn )
147147 for {
148- var ranges , replicas int
149- if err := conn .QueryRowContext (ctx ,
150- `SELECT
151- (SELECT count(1) FROM crdB_internal.ranges) AS ranges
152- , (SELECT sum(range_count) FROM crdb_internal.kv_store_status) AS replicas` ,
153- ).Scan (& ranges , & replicas ); err != nil {
154- t .Fatalf ("replication status: %s" , err )
155- }
156- if replicas == 3 * ranges {
157- t .L ().Printf ("up-replication complete" )
148+ res := sr .QueryStr (t , `
149+ SELECT
150+ (SELECT count(*) FROM crdb_internal.ranges_no_leases
151+ WHERE $1::INT = ANY(replicas)) AS cnt,
152+ ARRAY(
153+ SELECT range_id
154+ FROM crdb_internal.ranges_no_leases
155+ WHERE $1::INT = ANY(replicas)
156+ ORDER BY range_id
157+ LIMIT 10) AS sample;
158+ ` , removedStoreID )
159+ cnt := res [0 ][0 ]
160+ sample := res [0 ][1 ]
161+
162+ if cnt == "0" {
163+ t .L ().Printf ("all ranges moved off removed store" )
158164 break
159165 }
160- if timeutil .Since (tStart ) > 30 * time .Second || oldReplicas != replicas {
161- t .L ().Printf ("still waiting for replication (%d / %d)" , replicas , 3 * ranges )
162- }
163- oldReplicas = replicas
166+
167+ t .L ().Printf ("%s ranges remain on removed store; for example: %s" , cnt , sample )
164168 time .Sleep (5 * time .Second )
165169 }
166170 t .Status ("done" )
0 commit comments