88from scipy .stats import bootstrap , iqr , ks_2samp , mannwhitneyu
99
1010# New Stats Code
11+ # various formulas extracted from here:
12+ # https://colab.research.google.com/gist/padenot/2a408f0a39e269977045fc2fb265663b/end-to-end.ipynb#scrollTo=M8WY0yVIX5Ru&uniqifier=1
1113
1214# p-value threshold to use throughout
1315PVALUE_THRESHOLD = 0.05
@@ -249,6 +251,18 @@ def interpret_ks_test(base, new, pvalue_threshold=PVALUE_THRESHOLD):
249251 return None , None , None
250252
251253
def mann_whitney_pval_significance(mann_pvalue, pvalue_threshold=PVALUE_THRESHOLD):
    """Classify a Mann-Whitney U p-value against a significance threshold.

    Args:
        mann_pvalue: p-value to classify, or None when no p-value is available.
        pvalue_threshold: cut-off; p-values at or below it count as significant.

    Returns:
        A ``(p_value_interpretation, is_significant)`` tuple:
        ``("significant", True)`` when ``mann_pvalue <= pvalue_threshold``,
        ``("not significant", False)`` when it is above the threshold, and
        ``("", False)`` when the p-value is None or compares neither way
        (e.g. NaN).
    """
    # A missing p-value cannot be ordered against the threshold: comparing None
    # with a float raises TypeError, so report it as uninterpreted instead.
    if mann_pvalue is None:
        return "", False
    if mann_pvalue <= pvalue_threshold:
        return "significant", True
    if mann_pvalue > pvalue_threshold:
        return "not significant", False
    # Neither comparison held (NaN): keep the neutral defaults.
    return "", False
264+
265+
# Mann-Whitney U test
# Tests the null hypothesis that the distributions with and without the patch are identical.
# The null hypothesis states that there is no significant difference or effect in the population; the test calculates a p-value.
@@ -259,60 +273,71 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
259273 mann_stat = float (mann_stat ) if mann_stat else None
260274 mann_pvalue = float (mann_pvalue ) if mann_pvalue else None
261275 # Mann-Whitney U p-value interpretation
262- p_value_interpretation = None
263- if mann_pvalue >= pvalue_threshold :
264- p_value_interpretation = "not significant"
265- if mann_pvalue < pvalue_threshold :
266- p_value_interpretation = "significant"
276+ p_value_interpretation , is_significant = mann_whitney_pval_significance (
277+ mann_pvalue , pvalue_threshold
278+ )
267279
268280 mann_whitney = {
269281 "test_name" : "Mann-Whitney U" ,
270282 "stat" : mann_stat ,
271283 "pvalue" : mann_pvalue ,
272284 "interpretation" : p_value_interpretation ,
273285 }
274- return mann_whitney , mann_stat , mann_pvalue
275-
276-
277- def is_new_better (delta_value , lower_is_better ):
278- """This method returns if the new result is better or worse (even if unsure)"""
279- if delta_value is None :
280- direction = None
281- is_new_better = None
282- is_new_better = None
283- if abs (delta_value ) < 0.001 :
284- direction = "no change"
285- elif (lower_is_better and delta_value < 0 ) or (not lower_is_better and delta_value > 0 ):
286- direction = "better"
287- is_new_better = True
288- else :
289- direction = "worse"
290- is_new_better = False
291- return direction , is_new_better
292-
293-
294- def interpret_cles_direction (cles , pvalue_threshold = PVALUE_THRESHOLD ):
295- if cles is None :
296- return "CLES cannot be interpreted"
297- if cles >= pvalue_threshold :
298- return f"{ cles :.0%} chance a base value > a new value"
299- if cles < pvalue_threshold :
300- return f"{ 1 - cles :.0%} chance a new value > base value"
301- return "CLES cannot be interpreted"
286+ return mann_whitney , mann_stat , mann_pvalue , is_significant
302287
303288
304289# https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/
def interpret_effect_size(delta):
    """Label the magnitude of an effect size and flag whether it is meaningful.

    Args:
        delta: signed effect size, or None when it could not be computed.
            Only its absolute value is used.

    Returns:
        A ``(label, is_effect_meaningful)`` tuple. The label is
        "Effect cannot be interpreted" for None, otherwise "negligible"
        (|delta| < 0.15), "small" (< 0.33), "moderate" (< 0.47) or "large".
        The flag is False for None and "negligible", True for every other label.
    """
    if delta is None:
        return "Effect cannot be interpreted", False

    magnitude = abs(delta)
    if magnitude < 0.15:
        return "negligible", False

    # Everything at or beyond the negligible band counts as a meaningful effect.
    for upper_bound, label in ((0.33, "small"), (0.47, "moderate")):
        if magnitude < upper_bound:
            return label, True
    return "large", True
302+
303+
def interpret_cles_direction(cles):
    """Describe a Common Language Effect Size (CLES) value in plain English.

    CLES is read here as the probability that a randomly selected base value
    is greater than a randomly selected new value: 0.6 means a 60% chance
    that base > new, and 0.4 means a 60% chance that new > base. A value of
    0.5 is a 50/50 toss-up between the two outcomes.

    Args:
        cles: the CLES value, or None when it could not be computed.

    Returns:
        A ``(explanation, is_base_greater)`` tuple. ``is_base_greater`` is
        True when ``cles > 0.5`` and False when ``cles < 0.5``. For None, and
        for any value that is neither above nor below 0.5 (including exactly
        0.5), the result is ``("CLES cannot be interpreted", None)``.
    """
    no_direction = ("CLES cannot be interpreted", None)
    if cles is None:
        return no_direction
    if cles > 0.5:
        return f"{cles:.0%} chance a base value > a new value", True
    if cles < 0.5:
        # Report the complementary probability from the new value's side.
        return f"{1 - cles:.0%} chance a new value > base value", False
    return no_direction
319+
320+
def is_new_better(is_effect_meaningful, is_base_greater, is_significant, lower_is_better):
    """Decide whether the new result is an improvement, a regression or no change.

    A direction is only reported when the effect is both meaningful and
    significant and ``is_base_greater`` gives a direction (truthy, or exactly
    False). When base values are greater, the new result is better if lower
    is better; when new values are greater, it is better if higher is better.

    Args:
        is_effect_meaningful: whether the effect size is more than negligible.
        is_base_greater: truthy if base values tend to exceed new values,
            False if new values tend to exceed base values, otherwise no
            direction is known.
        is_significant: whether the difference is statistically significant.
        lower_is_better: whether smaller values are the desirable outcome.

    Returns:
        A ``(direction, is_new_better)`` tuple: ``("improvement", True)``,
        ``("regression", False)`` or ``("no change", None)``.
    """
    unchanged = ("no change", None)
    if not (is_effect_meaningful and is_significant):
        return unchanged

    if is_base_greater:
        base_wins = True
    elif is_base_greater is False:
        base_wins = False
    else:
        # No usable direction (e.g. None): nothing to report.
        return unchanged

    # New is better exactly when the side holding the greater values matches
    # the side that "lower is better" penalises.
    if base_wins == bool(lower_is_better):
        return "improvement", True
    return "regression", False
316341
317342
318343def interpret_performance_direction (ci_low , ci_high , lower_is_better ):
@@ -347,13 +372,11 @@ def interpret_performance_direction(ci_low, ci_high, lower_is_better):
347372# Common Language Effect Size, and its interpretation in english
348373def interpret_cles (
349374 mann_stat ,
350- mann_pvalue ,
351375 new_revision ,
352376 base_revision ,
353377 delta ,
354378 interpretation ,
355379 lower_is_better ,
356- pvalue_threshold = PVALUE_THRESHOLD ,
357380):
358381 try :
359382 cles = None
@@ -373,9 +396,8 @@ def interpret_cles(
373396 else :
374397 mann_whitney_u_cles = ""
375398
376- is_significant = False if mann_pvalue > pvalue_threshold else True
377399 # Generate CLES explanation
378- cles_explanation = interpret_cles_direction (cles ) if cles else ""
400+ cles_explanation , is_base_greater = interpret_cles_direction (cles )
379401 # Cliff's delta CLES
380402 cliffs_delta_cles = f"Cliff's Delta: { delta :.2f} → { interpretation } " if delta else ""
381403
@@ -389,10 +411,10 @@ def interpret_cles(
389411 return (
390412 cles_obj ,
391413 cles ,
392- is_significant ,
393414 cles_explanation ,
394415 mann_whitney_u_cles ,
395416 cliffs_delta_cles ,
417+ is_base_greater ,
396418 )
397419 except Exception :
398420 return None , None , None , None , None , None
@@ -455,23 +477,36 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better):
455477 is_improvement = None
456478 performance_intepretation = None
457479 modes = []
458- if base_mode_count == new_mode_count :
459- base_intervals , base_peak_xs = find_mode_interval (x_base , y_base , base_peak_locs )
460- new_intervals , new_peak_xs = find_mode_interval (x_new , y_new , new_peak_locs )
461- per_mode_new = split_per_mode (new_data , new_intervals )
462- per_mode_base = split_per_mode (base_data , base_intervals )
463-
464- for i , interval in enumerate (base_intervals ):
465- tup = interval
466- if len (tup ) != 2 :
467- return None , None , None , None , None , None
468-
469- start , end = tup
470- shift = 0
471- ci_low = 0
472- ci_high = 0
473- median_shift_summary = None
474- mode_name = f"Mode { i + 1 } "
480+ base_intervals , base_peak_xs = find_mode_interval (x_base , y_base , base_peak_locs )
481+ new_intervals , new_peak_xs = find_mode_interval (x_new , y_new , new_peak_locs )
482+ for i , interval in enumerate (base_intervals ):
483+ if len (interval ) != 2 :
484+ return None , None , None , None , None , None
485+
486+ start , end = interval
487+ shift = 0
488+ ci_low = 0
489+ ci_high = 0
490+ median_shift_summary = (
491+ "Cannot measure shift, base mode count not equal to new mode count"
492+ )
493+ shift = None
494+ mode_name = f"Mode { i + 1 } "
495+ mode_info = {
496+ "mode_name" : mode_name ,
497+ "mode_start" : f"{ start :.2f} " if start else None ,
498+ "mode_end" : f"{ end :.2f} " if end else None ,
499+ "median_shift_summary" : median_shift_summary ,
500+ "ci_low" : ci_low ,
501+ "ci_high" : ci_high ,
502+ "shift" : shift ,
503+ "shift_summary" : performance_intepretation ,
504+ "ci_warning" : ci_warning ,
505+ }
506+
507+ if base_mode_count == new_mode_count :
508+ per_mode_new = split_per_mode (new_data , new_intervals )
509+ per_mode_base = split_per_mode (base_data , base_intervals )
475510
476511 try :
477512 ref_vals = [val for val , mode in zip (base_data , per_mode_base ) if mode == i ]
@@ -509,7 +544,8 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better):
509544 "shift_summary" : performance_intepretation ,
510545 "ci_warning" : ci_warning ,
511546 }
512- modes .append (mode_info )
547+
548+ modes .append (mode_info )
513549
514550 silverman_kde = {
515551 "bandwidth" : "Silverman" ,
0 commit comments