@@ -85,18 +85,13 @@ func TestStartupFailure(t *testing.T) {
8585func TestStartupFailureRandomRange (t * testing.T ) {
8686 defer leaktest .AfterTest (t )()
8787 defer log .Scope (t ).Close (t )
88- // This test takes 30s and so we don't want it to run in the "blocking path"
89- // of CI at all, and we also don't want to stress it in nightlies as part of
90- // a big package (where it will take a lot of time that could be spent running
91- // "faster" tests). In this package, it is the only test and so it's fine to
92- // run it under nightly (skipping race builds because with many nodes they are
93- // very resource intensive and tend to collapse).
88+ // This test takes 30s+, so we don't want it to run in the "blocking path" of
89+ // CI. We also skip race builds as the test uses multiple nodes, which can
90+ // cause the test to grind to a halt and flake out.
9491 skip .UnderRace (t , "6 nodes with replication is too slow for race" )
95- skip .WithIssue (t , 9999999999 , "nicktrav will have a fix shortly" )
96- // TODO(nicktrav): re-enable only under nightlies once the fix is out.
97- //if !skip.NightlyStress() {
98- // skip.IgnoreLint(t, "test takes 30s to run due to circuit breakers and timeouts")
99- //}
92+ if ! skip .Stress () {
93+ skip .IgnoreLint (t , "test takes 30s to run due to circuit breakers and timeouts" )
94+ }
10095
10196 rng , seed := randutil .NewTestRand ()
10297 t .Log ("TestStartupFailureRandomRange using seed" , seed )
@@ -148,6 +143,11 @@ func runCircuitBreakerTestForKey(
148143 args := base.TestClusterArgs {
149144 ServerArgsPerNode : make (map [int ]base.TestServerArgs ),
150145 ReusableListenerReg : lReg ,
146+ // TODO(travers): This test has a lingering issue when run in UA mode
147+ // that needs to be addressed before the following can be removed.
148+ ServerArgs : base.TestServerArgs {
149+ DefaultTestTenant : base .TestIsSpecificToStorageLayerAndNeedsASystemTenant ,
150+ },
151151 }
152152 var enableFaults atomic.Bool
153153 for i := 0 ; i < nodes ; i ++ {
@@ -229,6 +229,7 @@ func runCircuitBreakerTestForKey(
229229 return d .StartKey
230230 }
231231
232+ t .Log ("segmenting ranges" )
232233 var rangeSpans []roachpb.Span
233234 r , err := c .QueryContext (ctx , "select range_id, start_key, end_key from crdb_internal.ranges_no_leases order by start_key" )
234235 require .NoError (t , err , "failed to query ranges" )
@@ -243,9 +244,11 @@ func runCircuitBreakerTestForKey(
243244 })
244245 }
245246 good , bad := faultyRangeSelector (rangeSpans )
247+ t .Logf ("prepping %d good ranges" , len (good ))
246248 for _ , span := range good {
247249 prepRange (span .Key , false )
248250 }
251+ t .Logf ("prepping %d faulty ranges" , len (bad ))
249252 var ranges []string
250253 for _ , span := range bad {
251254 prepRange (span .Key , true )
@@ -254,27 +257,33 @@ func runCircuitBreakerTestForKey(
254257 rangesList := fmt .Sprintf ("[%s]" , strings .Join (ranges , ", " ))
255258
256259 // Remove nodes permanently to only leave quorum on planned ranges.
260+ t .Log ("stopping n3 and n4" )
257261 tc .StopServer (3 )
258262 tc .StopServer (4 )
259263
260264 // Stop node with replicas that would leave ranges without quorum.
265+ t .Log ("stopping n5" )
261266 tc .StopServer (5 )
262267
263268 // Probe compromised ranges to trigger circuit breakers on them. If we don't
264269 // do this, then restart queries will wait for quorum to be reestablished with
265270 // restarting node without failing.
271+ t .Logf ("waiting for %d compromised ranges to trigger CBs" , len (bad ))
266272 var wg sync.WaitGroup
267273 wg .Add (len (bad ))
268274 for _ , span := range bad {
269275 go func (key roachpb.Key ) {
270276 defer wg .Done ()
277+ t .Logf ("waiting for compromised range: %s" , key )
271278 _ = db .Put (context .Background (), keys .RangeProbeKey (roachpb .RKey (key )), "" )
279+ t .Logf ("done waiting for compromised range: %s" , key )
272280 }(span .Key )
273281 }
274282 wg .Wait ()
275283
276284 // Restart node and check that it succeeds in reestablishing range quorum
277285 // necessary for startup actions.
286+ t .Log ("starting n5" )
278287 require .NoError (t , lReg .MustGet (t , 5 ).Reopen ())
279288 err = tc .RestartServer (5 )
280289 require .NoError (t , err , "restarting server with range(s) %s tripping circuit breaker" , rangesList )
0 commit comments