@@ -85,7 +85,7 @@ func registerElasticIO(r registry.Registry) {
8585 return nil
8686 })
8787 m .Go (func (ctx context.Context ) error {
88- const subLevelMetric = "storage_l0_sublevels "
88+ const ioOverloadMetric = "admission_io_overload "
8989 getMetricVal := func (metricName string ) (float64 , error ) {
9090 point , err := statCollector .CollectPoint (ctx , t .L (), timeutil .Now (), metricName )
9191 if err != nil {
@@ -110,16 +110,16 @@ func registerElasticIO(r registry.Registry) {
110110 }
111111 now := timeutil .Now ()
112112 endTime := now .Add (duration )
113- // We typically see fluctuations from 1 to 5 sub-levels because the
114- // elastic IO token logic gives 1.25*compaction-bandwidth tokens at 1
115- // sub-level and 0.75*compaction-bandwidth at 5 sub-levels, with 5
116- // sub-levels being very rare. We leave some breathing room and pick a
117- // threshold of greater than 7 to fail the test. If elastic tokens are
118- // not working, the threshold of 7 will be easily breached, since
119- // regular tokens allow sub-levels to exceed 10 .
120- const subLevelThreshold = 7
121- const sampleCountForL0Sublevel = 12
122- var l0SublevelCount []float64
113+ // The IO overload score typically fluctuates between 0.05 and 0.25,
114+ // because the elastic IO token logic gives 1.25*compaction-bandwidth
115+ // tokens at a score of 0.05 and 0.75*compaction-bandwidth at a score of
116+ // 0.25, with a score of 0.25 being very rare. We leave some breathing
117+ // room and fail the test when the score exceeds a threshold of 0.35. If
118+ // elastic tokens are not working, the threshold of 0.35 will be easily
119+ // breached, since regular tokens allow the score to exceed 0.5.
120+ const ioOverloadThreshold = 0.35
121+ const sampleCountForIOOverload = 12
122+ var ioOverloadScore []float64
123123 // Sleep initially for stability to be achieved, before measuring.
124124 time .Sleep (5 * time .Minute )
125125 for {
@@ -129,17 +129,19 @@ func registerElasticIO(r registry.Registry) {
129129 default :
130130 }
131131 time .Sleep (10 * time .Second )
132- val , err := getMetricVal (subLevelMetric )
132+ val , err := getMetricVal (ioOverloadMetric )
133133 if err != nil {
134134 continue
135135 }
136- l0SublevelCount = append (l0SublevelCount , val )
136+ ioOverloadScore = append (ioOverloadScore , val )
137137 // We want to use the mean of the last 2m of data to avoid short-lived
138138 // spikes causing failures.
139- if len (l0SublevelCount ) >= sampleCountForL0Sublevel {
140- latestSampleMeanL0Sublevels := roachtestutil .GetMeanOverLastN (sampleCountForL0Sublevel , l0SublevelCount )
141- if latestSampleMeanL0Sublevels > subLevelThreshold {
142- t .Fatalf ("sub-level mean %f over last %d iterations exceeded threshold" , latestSampleMeanL0Sublevels , sampleCountForL0Sublevel )
139+ if len (ioOverloadScore ) >= sampleCountForIOOverload {
140+ latestSampleMeanIOOverloadScore :=
141+ roachtestutil .GetMeanOverLastN (sampleCountForIOOverload , ioOverloadScore )
142+ if latestSampleMeanIOOverloadScore > ioOverloadThreshold {
143+ t .Fatalf ("io-overload score mean %f over last %d iterations exceeded threshold" ,
144+ latestSampleMeanIOOverloadScore , sampleCountForIOOverload )
143145 }
144146 }
145147 if timeutil .Now ().After (endTime ) {
0 commit comments