Skip to content

Commit 40ab8f6

Browse files
committed
roachtest: use decommission nudger for decommissionbench.go
This commit adds the decommissioning nudger as a new flavor of the benchmarks in `decommissionbench.go` and moves those benchmarks from the nightly suite to the weekly suite. It also removes the metamorphically enabled decommissioning nudger from `runDecommissionSlow`, since those roachtests typically complete in under 10 minutes and usually pass. The `decommissionBench` tests, on the other hand, run longer and are a better fit for this configuration.
1 parent 63cc608 commit 40ab8f6

File tree

2 files changed

+40
-14
lines changed

2 files changed

+40
-14
lines changed

pkg/cmd/roachtest/tests/decommission.go

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,19 +1135,6 @@ func runDecommissionSlow(ctx context.Context, t test.Test, c cluster.Cluster) {
11351135
// Increase the speed of decommissioning.
11361136
run(db, `SET CLUSTER SETTING kv.snapshot_rebalance.max_rate='2GiB'`)
11371137

1138-
// Metamorphically enable the decommissioning nudger to get more test
1139-
// coverage on decommissioning nudger.
1140-
{
1141-
seed := timeutil.Now().UnixNano()
1142-
t.L().Printf("seed: %d", seed)
1143-
rng := rand.New(rand.NewSource(seed))
1144-
1145-
if rng.Intn(2) == 0 {
1146-
run(db, `SET CLUSTER SETTING kv.enqueue_in_replicate_queue_on_problem.interval = '10m'`)
1147-
t.L().Printf("metamorphically enabled decommissioning nudger")
1148-
}
1149-
}
1150-
11511138
// Wait for initial up-replication.
11521139
err := roachtestutil.WaitForReplication(ctx, t.L(), db, replicationFactor, roachprod.AtLeastReplicationFactor)
11531140
require.NoError(t, err)

pkg/cmd/roachtest/tests/decommissionbench.go

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,12 @@ type decommissionBenchSpec struct {
8989
// instead of a random node.
9090
decommissionNode int
9191

92+
// When true, the decommissioning nudger
93+
// (kv.enqueue_in_replicate_queue_on_problem.interval) will be used to
94+
// periodically enqueue decommissioning ranges at their leaseholders every 5
95+
// minutes.
96+
useDecommissioningNudger bool
97+
9298
skip string
9399
}
94100

@@ -198,6 +204,16 @@ func registerDecommissionBench(r registry.Registry) {
198204
multiregion: true,
199205
decommissionNode: 2,
200206
},
207+
{
208+
// Same as above but using the decommissioning nudger.
209+
nodes: 6,
210+
warehouses: 1000,
211+
whileUpreplicating: true,
212+
drainFirst: true,
213+
multiregion: true,
214+
decommissionNode: 2,
215+
useDecommissioningNudger: true,
216+
},
201217
{
202218
// Multiregion decommission, and add a new node in a different region.
203219
nodes: 6,
@@ -207,6 +223,16 @@ func registerDecommissionBench(r registry.Registry) {
207223
multiregion: true,
208224
decommissionNode: 3,
209225
},
226+
{
227+
// Same as above but using the decommissioning nudger.
228+
nodes: 6,
229+
warehouses: 1000,
230+
whileUpreplicating: true,
231+
drainFirst: true,
232+
multiregion: true,
233+
decommissionNode: 3,
234+
useDecommissioningNudger: true,
235+
},
210236
} {
211237
registerDecommissionBenchSpec(r, benchSpec)
212238
}
@@ -274,6 +300,10 @@ func registerDecommissionBenchSpec(r registry.Registry, benchSpec decommissionBe
274300
extraNameParts = append(extraNameParts, "multi-region")
275301
}
276302

303+
if benchSpec.useDecommissioningNudger {
304+
extraNameParts = append(extraNameParts, "use-nudger")
305+
}
306+
277307
// Save some money and CPU quota by using a smaller workload CPU. Only
278308
// do this for cluster of size 3 or smaller to avoid regressions.
279309
specOptions = append(specOptions, spec.WorkloadNode())
@@ -299,7 +329,7 @@ func registerDecommissionBenchSpec(r registry.Registry, benchSpec decommissionBe
299329
specOptions...,
300330
),
301331
CompatibleClouds: registry.OnlyGCE,
302-
Suites: registry.Suites(registry.Nightly),
332+
Suites: registry.Suites(registry.Weekly),
303333
SkipPostValidations: registry.PostValidationNoDeadNodes,
304334
Timeout: timeout,
305335
NonReleaseBlocker: true,
@@ -427,6 +457,15 @@ func setupDecommissionBench(
427457
db := c.Conn(ctx, t.L(), pinnedNode)
428458
defer db.Close()
429459

460+
// Enable the decommissioning nudger when the benchmark spec requests it, to
461+
// get test coverage of decommissioning with the nudger active.
462+
if benchSpec.useDecommissioningNudger {
463+
if _, err := db.ExecContext(ctx, `SET CLUSTER SETTING kv.enqueue_in_replicate_queue_on_problem.interval = '5m'`); err != nil {
464+
t.Fatal(err)
465+
}
466+
t.L().Printf("enabled decommissioning nudger")
467+
}
468+
430469
// Note that we are waiting for 3 replicas only. We can't assume 5 replicas
431470
// here because 5 only applies to system ranges so we will never reach this
432471
// number globally. We also don't know if all upreplication succeeded, but

0 commit comments

Comments
 (0)