Merge #155413

craig[bot] · sumeerbhola · craig[bot] · commit 294ad262167d · 2025-10-27T15:50:34.000Z
155413: tests: use admission.io.overload in admission-control/elastic-io r=tbg a=sumeerbhola The previously used sub-level metric was flawed, in that the IO overload score could stay low even at higher sub-level counts if L0 had very few bytes (which is a deliberate choice in admission control). So admission control would not throttle elastic work as aggressively as the test expected it to. Running with this change, we don't exceed a score of 0.15, while the previously used sub-level count metric spikes higher. For example: ``` 2025/10/14 22:13:15 admission_control_elastic_io.go:105: admission_io_overload(store=1): 0.100000 2025/10/14 22:13:25 admission_control_elastic_io.go:105: admission_io_overload(store=1): 0.100000 I251014 22:13:27.665296 868 util/admission/io_load_listener.go:780 ⋮ [T1,Vsystem,n1,s1] 2918 IO overload: compaction score 0.150 (131 ssts, 9 sub-levels), L0 growth 551 MiB (write 551 MiB (ignored 0 B) ingest 0 B (ignored 0 B)): requests 15985 (0 bypassed) with 505 MiB acc-write (0 B bypassed) + 0 B acc-ingest (0 B bypassed) + 551 MiB adjusted-LSM-writes + 4.2 GiB adjusted-disk-writes + write-model 1.09x+1 B (smoothed 1.08x+1 B) + l0-ingest-model 0.00x+0 B (smoothed 0.75x+1 B) + ingest-model 0.00x+0 B (smoothed 1.00x+1 B) + write-amp-model 7.87x+1 B (smoothed 8.01x+1 B) + at-admission-tokens 126 B, compacted 550 MiB [≈545 MiB], flushed 799 MiB [≈838 MiB] (mult 0.77); admitting 649 MiB (rate 43 MiB/s) (elastic 519 MiB rate 35 MiB/s) due to memtable flush (multiplier 0.775) (used total: 543 MiB elastic 541 MiB); write stalls 0; diskBandwidthLimiter (unlimited) (tokenUtilization 0.00, tokensUsed (elastic 0 B, snapshot 0 B, regular 0 B) tokens (write 0 B (prev 0 B), read 0 B (prev 0 B)), writeBW 0 B/s, readBW 0 B/s, provisioned 0 B/s) 2025/10/14 22:13:35 admission_control_elastic_io.go:105: admission_io_overload(store=1): 0.050000 I251014 22:13:42.666326 868 util/admission/io_load_listener.go:780 ⋮ [T1,Vsystem,n1,s1] 2926 IO overload: compaction score 
0.050 (70 ssts, 5 sub-levels), L0 growth 498 MiB (write 498 MiB (ignored 0 B) ingest 0 B (ignored 0 B)): requests 15228 (0 bypassed) with 480 MiB acc-write (0 B bypassed) + 0 B acc-ingest (0 B bypassed) + 498 MiB adjusted-LSM-writes + 4.2 GiB adjusted-disk-writes + write-model 1.04x+1 B (smoothed 1.06x+1 B) + l0-ingest-model 0.00x+0 B (smoothed 0.75x+1 B) + ingest-model 0.00x+0 B (smoothed 1.00x+1 B) + write-amp-model 8.57x+1 B (smoothed 8.29x+1 B) + at-admission-tokens 153 B, compacted 498 MiB [≈522 MiB], flushed 883 MiB [≈860 MiB] (mult 0.77); admitting 667 MiB (rate 44 MiB/s) (elastic 533 MiB rate 36 MiB/s) due to memtable flush (multiplier 0.775) (used total: 519 MiB elastic 517 MiB); write stalls 0; diskBandwidthLimiter (unlimited) (tokenUtilization 0.00, tokensUsed (elastic 0 B, snapshot 0 B, regular 0 B) tokens (write 0 B (prev 0 B), read 0 B (prev 0 B)), writeBW 0 B/s, readBW 0 B/s, provisioned 0 B/s) ``` Fixes #148786 Fixes #156168 Fixes #156215 Epic: none Release note: None Co-authored-by: sumeerbhola <sumeer@cockroachlabs.com>
diff --git a/pkg/cmd/roachtest/tests/admission_control_elastic_io.go b/pkg/cmd/roachtest/tests/admission_control_elastic_io.go
@@ -85,7 +85,7 @@ func registerElasticIO(r registry.Registry) {
 				return nil
 			})
 			m.Go(func(ctx context.Context) error {
-				const subLevelMetric = "storage_l0_sublevels"
+				const ioOverloadMetric = "admission_io_overload"
 				getMetricVal := func(metricName string) (float64, error) {
 					point, err := statCollector.CollectPoint(ctx, t.L(), timeutil.Now(), metricName)
 					if err != nil {
@@ -110,16 +110,16 @@ func registerElasticIO(r registry.Registry) {
 				}
 				now := timeutil.Now()
 				endTime := now.Add(duration)
-				// We typically see fluctuations from 1 to 5 sub-levels because the
-				// elastic IO token logic gives 1.25*compaction-bandwidth tokens at 1
-				// sub-level and 0.75*compaction-bandwidth at 5 sub-levels, with 5
-				// sub-levels being very rare. We leave some breathing room and pick a
-				// threshold of greater than 7 to fail the test. If elastic tokens are
-				// not working, the threshold of 7 will be easily breached, since
-				// regular tokens allow sub-levels to exceed 10.
-				const subLevelThreshold = 7
-				const sampleCountForL0Sublevel = 12
-				var l0SublevelCount []float64
+				// We typically see the IO overload score fluctuate between 0.05 and
+				// 0.25, because the elastic IO token logic gives
+				// 1.25*compaction-bandwidth tokens at a score of 0.05 and
+				// 0.75*compaction-bandwidth tokens at a score of 0.25, with a score
+				// of 0.25 being very rare. We leave some breathing room and fail the
+				// test only when the mean score exceeds 0.35. If elastic tokens are
+				// not working, 0.35 is easily breached, since regular tokens allow
+				// the score to exceed 0.5.
+				const ioOverloadThreshold = 0.35
+				const sampleCountForIOOverload = 12
+				var ioOverloadScore []float64
 				// Sleep initially for stability to be achieved, before measuring.
 				time.Sleep(5 * time.Minute)
 				for {
@@ -129,17 +129,19 @@ func registerElasticIO(r registry.Registry) {
 					default:
 					}
 					time.Sleep(10 * time.Second)
-					val, err := getMetricVal(subLevelMetric)
+					val, err := getMetricVal(ioOverloadMetric)
 					if err != nil {
 						continue
 					}
-					l0SublevelCount = append(l0SublevelCount, val)
+					ioOverloadScore = append(ioOverloadScore, val)
 					// We want to use the mean of the last 2m of data to avoid short-lived
 					// spikes causing failures.
-					if len(l0SublevelCount) >= sampleCountForL0Sublevel {
-						latestSampleMeanL0Sublevels := roachtestutil.GetMeanOverLastN(sampleCountForL0Sublevel, l0SublevelCount)
-						if latestSampleMeanL0Sublevels > subLevelThreshold {
-							t.Fatalf("sub-level mean %f over last %d iterations exceeded threshold", latestSampleMeanL0Sublevels, sampleCountForL0Sublevel)
+					if len(ioOverloadScore) >= sampleCountForIOOverload {
+						latestSampleMeanIOOverloadScore :=
+							roachtestutil.GetMeanOverLastN(sampleCountForIOOverload, ioOverloadScore)
+						if latestSampleMeanIOOverloadScore > ioOverloadThreshold {
+							t.Fatalf("io-overload score mean %f over last %d iterations exceeded threshold",
+								latestSampleMeanIOOverloadScore, sampleCountForIOOverload)
 						}
 					}
 					if timeutil.Now().After(endTime) {