@@ -249,6 +249,15 @@ def interpret_ks_test(base, new, pvalue_threshold=PVALUE_THRESHOLD):
249249 return None , None , None
250250
251251
def mann_whitney_pval_significance(mann_pvalue, pvalue_threshold=PVALUE_THRESHOLD):
    """Classify a Mann-Whitney U p-value as significant or not significant.

    Args:
        mann_pvalue: p-value of the Mann-Whitney U test, or None when no
            p-value is available.
        pvalue_threshold: significance cut-off. A p-value exactly equal to
            the threshold counts as significant.

    Returns:
        "significant" when mann_pvalue <= pvalue_threshold,
        "not significant" when mann_pvalue > pvalue_threshold, and None when
        the p-value is None or compares as neither (e.g. NaN).
    """
    # interpret_mann_whitneyu may pass None here; comparing None with a
    # number raises TypeError, so treat it as "no interpretation".
    if mann_pvalue is None:
        return None
    if mann_pvalue <= pvalue_threshold:
        return "significant"
    if mann_pvalue > pvalue_threshold:
        return "not significant"
    # Neither comparison held (NaN): keep the original "no interpretation".
    return None
259+
260+
252261# Mann-Whitney U test
# Tests the null hypothesis that the distributions with and without the patch are identical.
254263# Null hypothesis is a statement that there is no significant difference or effect in population, calculates p-value
@@ -259,11 +268,7 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
259268 mann_stat = float (mann_stat ) if mann_stat else None
260269 mann_pvalue = float (mann_pvalue ) if mann_pvalue else None
261270 # Mann-Whitney U p-value interpretation
262- p_value_interpretation = None
263- if mann_pvalue >= pvalue_threshold :
264- p_value_interpretation = "not significant"
265- if mann_pvalue < pvalue_threshold :
266- p_value_interpretation = "significant"
271+ p_value_interpretation = mann_whitney_pval_significance (mann_pvalue , pvalue_threshold )
267272
268273 mann_whitney = {
269274 "test_name" : "Mann-Whitney U" ,
@@ -274,18 +279,60 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
274279 return mann_whitney , mann_stat , mann_pvalue
275280
276281
# https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/
def interpret_effect_size(delta):
    """Map an effect-size value to a qualitative label.

    Only the magnitude of ``delta`` is considered; its sign is ignored.

    Args:
        delta: effect size (is_new_better passes Cliff's delta), or None.

    Returns:
        "negligible" for |delta| < 0.15, "small" for |delta| < 0.33,
        "moderate" for |delta| < 0.47, "large" for |delta| >= 0.47, and
        "Effect cannot be interpreted" when delta is None or NaN.
    """
    if delta is None:
        return "Effect cannot be interpreted"
    magnitude = abs(delta)
    if magnitude < 0.15:
        return "negligible"
    if magnitude < 0.33:
        return "small"
    if magnitude < 0.47:
        return "moderate"
    # Explicit lower bound instead of a bare else: NaN fails every
    # comparison and must not be reported as a "large" effect.
    if magnitude >= 0.47:
        return "large"
    return "Effect cannot be interpreted"
294+
295+
def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD):
    """Describe which side a CLES value favours.

    Args:
        cles: CLES value, or None when it is unavailable.
        pvalue_threshold: cut-off that ``cles`` is compared against.

    Returns:
        A ``(message, greater_rev)`` tuple. ``greater_rev`` is "base" when
        cles > pvalue_threshold, "new" when cles < pvalue_threshold, and
        None when cles is None or compares as neither above nor below the
        cut-off; in that last case the message is
        "CLES cannot be interpreted".
    """
    # NOTE(review): cles is formatted as a probability but compared against
    # the p-value threshold - confirm this cut-off is intended (vs. 0.5).
    if cles is not None:
        if cles > pvalue_threshold:
            return f"{cles:.0%} chance a base value > a new value", "base"
        if cles < pvalue_threshold:
            return f"{1 - cles:.0%} chance a new value > base value", "new"
    return "CLES cannot be interpreted", None
307+
308+
277309def is_new_better (c_delta , cles , mann_pvalue , lower_is_better , pvalue_threshold = PVALUE_THRESHOLD ):
278310 """This method takes in CLES to measure if meaningful, Mann Whitney p-val for significance as well as Cliff's Delta for change"""
279311 # Possibility Base > than New with a small amount or more significance
280- if cles > pvalue_threshold and abs (c_delta ) > 0.33 and mann_pvalue < pvalue_threshold :
312+ cles_interpretation , greater_rev = interpret_cles_direction (
313+ cles , pvalue_threshold = PVALUE_THRESHOLD
314+ )
315+ effect_size = interpret_effect_size (c_delta )
316+ effect_value_significance = ["small" , "moderate" , "large" ]
317+ p_value_interpretation = mann_whitney_pval_significance (mann_pvalue , pvalue_threshold )
318+
319+ if (
320+ greater_rev == "base"
321+ and any (effect_size in effect_value_significance )
322+ and p_value_interpretation == "significant"
323+ ):
281324 if lower_is_better :
282325 is_new_better = True
283326 direction = "improvement"
284327 else :
285328 is_new_better = False
286329 direction = "regression"
287330 # Possibility New > Base with a small amount or more significance
288- if cles < pvalue_threshold and abs (c_delta ) > 0.33 and mann_pvalue < pvalue_threshold :
331+ if (
332+ greater_rev == "new"
333+ and any (effect_size in effect_value_significance )
334+ and p_value_interpretation == "significant"
335+ ):
289336 if lower_is_better :
290337 is_new_better = False
291338 direction = "regression"
@@ -298,30 +345,6 @@ def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold=
298345 return direction , is_new_better
299346
300347
301- def interpret_cles_direction (cles , pvalue_threshold = PVALUE_THRESHOLD ):
302- if cles is None :
303- return "CLES cannot be interpreted"
304- if cles > pvalue_threshold :
305- return f"{ cles :.0%} chance a base value > a new value"
306- if cles < pvalue_threshold :
307- return f"{ 1 - cles :.0%} chance a new value > base value"
308- return "CLES cannot be interpreted"
309-
310-
311- # https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/
312- def interpret_effect_size (delta ):
313- if delta is None :
314- return "Effect cannot be interpreted"
315- if abs (delta ) < 0.15 :
316- return "negligible"
317- elif abs (delta ) < 0.33 :
318- return "small"
319- elif abs (delta ) < 0.47 :
320- return "moderate"
321- else :
322- return "large"
323-
324-
325348def interpret_performance_direction (ci_low , ci_high , lower_is_better ):
326349 is_regression = False
327350 is_improvement = False
0 commit comments