diff --git a/tests/perf/test_stats.py b/tests/perf/test_stats.py
index 6222e8f01c3..fc162bc3074 100644
--- a/tests/perf/test_stats.py
+++ b/tests/perf/test_stats.py
@@ -59,27 +59,21 @@ def test_interpret_cles():
     mock_base = [2.74]
     mock_new = [2.65]
     mock_mann_stat = 0.1
-    mock_mann_pvalue = 0.2
     interpretation = ("",)
     lower_is_better = (False,)
     mock_delta = 0.2
 
-    (
-        cles_obj,
-        cles,
-        is_significant,
-        cles_explanation,
-        mann_whitney_u_cles,
-        cliffs_delta_cles,
-    ) = interpret_cles(
-        mock_mann_stat,
-        mock_mann_pvalue,
-        mock_new,
-        mock_base,
-        mock_delta,
-        interpretation,
-        lower_is_better,
+    (cles_obj, cles, cles_explanation, mann_whitney_u_cles, cliffs_delta_cles, is_base_greater) = (
+        interpret_cles(
+            mock_mann_stat,
+            mock_new,
+            mock_base,
+            mock_delta,
+            interpretation,
+            lower_is_better,
+        )
     )
 
     assert cles_obj["cles"] == 0.1
     assert cles == 0.1
+    assert is_base_greater is False
diff --git a/tests/webapp/api/test_perfcompare_api.py b/tests/webapp/api/test_perfcompare_api.py
index c25492becc0..763f5d5cf1b 100644
--- a/tests/webapp/api/test_perfcompare_api.py
+++ b/tests/webapp/api/test_perfcompare_api.py
@@ -1181,8 +1181,8 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
         "is_fit_good": True,
         "is_improvement": None,
         "is_regression": None,
-        "is_meaningful": None,
-        "is_new_better": False,
+        "is_meaningful": True,
+        "is_new_better": None,
         "base_parent_signature": response["base_parent_signature"],
         "new_parent_signature": response["new_parent_signature"],
         "base_signature_id": response["base_signature_id"],
@@ -1191,7 +1191,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
         "cles": None,
         "cliffs_delta": -1.0,
         "cliffs_interpretation": "large",
-        "direction_of_change": "worse",
+        "direction_of_change": "no change",
         "base_standard_stats": {
             "count": 1,
             "max": 32.4,
@@ -1269,7 +1269,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
                     "ci_high": None,
                     "ci_low": None,
                     "ci_warning": None,
-                    "median_shift_summary": None,
+                    "median_shift_summary": "Cannot measure shift, base mode count not equal to new mode count",
                     "mode_end": "36.47",
                     "mode_name": "Mode 1",
                     "mode_start": "28.33",
diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py
index dd73b1a8c76..95ca61d91ef 100644
--- a/treeherder/perf/stats.py
+++ b/treeherder/perf/stats.py
@@ -8,6 +8,8 @@ from scipy.stats import bootstrap, iqr, ks_2samp, mannwhitneyu
 
 # New Stats Code
+# various formulas extracted from here:
+# https://colab.research.google.com/gist/padenot/2a408f0a39e269977045fc2fb265663b/end-to-end.ipynb#scrollTo=M8WY0yVIX5Ru&uniqifier=1
 
 # p-value threshold to use throughout
 PVALUE_THRESHOLD = 0.05
 
@@ -249,6 +251,18 @@ def interpret_ks_test(base, new, pvalue_threshold=PVALUE_THRESHOLD):
     return None, None, None
 
 
+def mann_whitney_pval_significance(mann_pvalue, pvalue_threshold=PVALUE_THRESHOLD):
+    p_value_interpretation = ""
+    is_significant = False
+    if mann_pvalue > pvalue_threshold:
+        p_value_interpretation = "not significant"
+    if mann_pvalue <= pvalue_threshold:
+        is_significant = True
+        p_value_interpretation = "significant"
+    return p_value_interpretation, is_significant
+
+
 # Mann-Whitney U test
 # Tests the null hypothesis that the distributions patch and without patch are identical.
 # Null hypothesis is a statement that there is no significant difference or effect in population, calculates p-value
@@ -259,11 +273,9 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
     mann_stat = float(mann_stat) if mann_stat else None
     mann_pvalue = float(mann_pvalue) if mann_pvalue else None
     # Mann-Whitney U p-value interpretation
-    p_value_interpretation = None
-    if mann_pvalue >= pvalue_threshold:
-        p_value_interpretation = "not significant"
-    if mann_pvalue < pvalue_threshold:
-        p_value_interpretation = "significant"
+    p_value_interpretation, is_significant = mann_whitney_pval_significance(
+        mann_pvalue, pvalue_threshold
+    )
 
     mann_whitney = {
         "test_name": "Mann-Whitney U",
@@ -271,48 +283,61 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
         "pvalue": mann_pvalue,
         "interpretation": p_value_interpretation,
     }
-    return mann_whitney, mann_stat, mann_pvalue
-
-
-def is_new_better(delta_value, lower_is_better):
-    """This method returns if the new result is better or worse (even if unsure)"""
-    if delta_value is None:
-        direction = None
-        is_new_better = None
-    is_new_better = None
-    if abs(delta_value) < 0.001:
-        direction = "no change"
-    elif (lower_is_better and delta_value < 0) or (not lower_is_better and delta_value > 0):
-        direction = "better"
-        is_new_better = True
-    else:
-        direction = "worse"
-        is_new_better = False
-    return direction, is_new_better
-
-
-def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD):
-    if cles is None:
-        return "CLES cannot be interpreted"
-    if cles >= pvalue_threshold:
-        return f"{cles:.0%} chance a base value > a new value"
-    if cles < pvalue_threshold:
-        return f"{1 - cles:.0%} chance a new value > base value"
-    return "CLES cannot be interpreted"
+    return mann_whitney, mann_stat, mann_pvalue, is_significant
 
 
 # https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/
 def interpret_effect_size(delta):
+    is_effect_meaningful = False
     if delta is None:
-        return "Effect cannot be interpreted"
+        return "Effect cannot be interpreted", is_effect_meaningful
     if abs(delta) < 0.15:
-        return "negligible"
-    elif abs(delta) < 0.33:
-        return "small"
-    elif abs(delta) < 0.47:
-        return "moderate"
-    else:
-        return "large"
+        return "negligible", is_effect_meaningful
+    is_effect_meaningful = True
+    if abs(delta) < 0.33:
+        return "small", is_effect_meaningful
+    if abs(delta) < 0.47:
+        return "moderate", is_effect_meaningful
+    return "large", is_effect_meaningful
+
+
+def interpret_cles_direction(cles):
+    # probability that a randomly selected score from one group will be greater than a randomly selected score from a second group
+    # A CLES of 0.5 indicates a 50% chance for either outcome, 50/50 toss up, no change
+    # A CLES of 0.6 would mean there is a 60% chance that a score Base > New
+    # A CLES of 0.4 would mean there is a 40% chance that a score from Base > New, or a 60% chance that a score from New > Base
+    is_base_greater = None
+    if cles is None:
+        return "CLES cannot be interpreted", is_base_greater
+    elif cles > 0.5:
+        is_base_greater = True
+        return f"{cles:.0%} chance a base value > a new value", is_base_greater
+    elif cles < 0.5:
+        is_base_greater = False
+        return f"{1 - cles:.0%} chance a new value > base value", is_base_greater
+    return "CLES cannot be interpreted", is_base_greater
+
+
+def is_new_better(is_effect_meaningful, is_base_greater, is_significant, lower_is_better):
+    is_new_better = None
+    direction = "no change"
+    # Possibility: Base > New, with at least a small effect size and a significant p-value
+    if is_base_greater and is_effect_meaningful and is_significant:
+        if lower_is_better:
+            is_new_better = True
+            direction = "improvement"
+        else:
+            is_new_better = False
+            direction = "regression"
+    # Possibility: New > Base, with at least a small effect size and a significant p-value
+    elif (is_base_greater is False) and is_effect_meaningful and is_significant:
+        if lower_is_better:
+            is_new_better = False
+            direction = "regression"
+        else:
+            is_new_better = True
+            direction = "improvement"
+    return direction, is_new_better
 
 
 def interpret_performance_direction(ci_low, ci_high, lower_is_better):
@@ -347,13 +372,11 @@
 # Common Language Effect Size, and its interpretation in english
 def interpret_cles(
     mann_stat,
-    mann_pvalue,
     new_revision,
     base_revision,
     delta,
     interpretation,
     lower_is_better,
-    pvalue_threshold=PVALUE_THRESHOLD,
 ):
     try:
         cles = None
@@ -373,9 +396,8 @@ def interpret_cles(
         else:
             mann_whitney_u_cles = ""
 
-        is_significant = False if mann_pvalue > pvalue_threshold else True
         # Generate CLES explanation
-        cles_explanation = interpret_cles_direction(cles) if cles else ""
+        cles_explanation, is_base_greater = interpret_cles_direction(cles)
 
         # Cliff's delta CLES
         cliffs_delta_cles = f"Cliff's Delta: {delta:.2f} → {interpretation}" if delta else ""
@@ -389,10 +411,10 @@ def interpret_cles(
         return (
             cles_obj,
             cles,
-            is_significant,
             cles_explanation,
             mann_whitney_u_cles,
             cliffs_delta_cles,
+            is_base_greater,
         )
     except Exception:
         return None, None, None, None, None, None
@@ -455,23 +477,36 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better):
     is_improvement = None
     performance_intepretation = None
     modes = []
-    if base_mode_count == new_mode_count:
-        base_intervals, base_peak_xs = find_mode_interval(x_base, y_base, base_peak_locs)
-        new_intervals, new_peak_xs = find_mode_interval(x_new, y_new, new_peak_locs)
-        per_mode_new = split_per_mode(new_data, new_intervals)
-        per_mode_base = split_per_mode(base_data, base_intervals)
-
-        for i, interval in enumerate(base_intervals):
-            tup = interval
-            if len(tup) != 2:
-                return None, None, None, None, None, None
-
-            start, end = tup
-            shift = 0
-            ci_low = 0
-            ci_high = 0
-            median_shift_summary = None
-            mode_name = f"Mode {i + 1}"
+    base_intervals, base_peak_xs = find_mode_interval(x_base, y_base, base_peak_locs)
+    new_intervals, new_peak_xs = find_mode_interval(x_new, y_new, new_peak_locs)
+    for i, interval in enumerate(base_intervals):
+        if len(interval) != 2:
+            return None, None, None, None, None, None
+
+        start, end = interval
+        shift = 0
+        ci_low = 0
+        ci_high = 0
+        median_shift_summary = (
+            "Cannot measure shift, base mode count not equal to new mode count"
+        )
+        shift = None
+        mode_name = f"Mode {i + 1}"
+        mode_info = {
+            "mode_name": mode_name,
+            "mode_start": f"{start:.2f}" if start else None,
+            "mode_end": f"{end:.2f}" if end else None,
+            "median_shift_summary": median_shift_summary,
+            "ci_low": ci_low,
+            "ci_high": ci_high,
+            "shift": shift,
+            "shift_summary": performance_intepretation,
+            "ci_warning": ci_warning,
+        }
+
+        if base_mode_count == new_mode_count:
+            per_mode_new = split_per_mode(new_data, new_intervals)
+            per_mode_base = split_per_mode(base_data, base_intervals)
 
             try:
                 ref_vals = [val for val, mode in zip(base_data, per_mode_base) if mode == i]
@@ -509,7 +544,8 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better):
                 "shift_summary": performance_intepretation,
                 "ci_warning": ci_warning,
             }
-            modes.append(mode_info)
+
+        modes.append(mode_info)
 
     silverman_kde = {
         "bandwidth": "Silverman",
diff --git a/treeherder/webapp/api/performance_data.py b/treeherder/webapp/api/performance_data.py
index db81f22d31a..45e00eb46d3 100644
--- a/treeherder/webapp/api/performance_data.py
+++ b/treeherder/webapp/api/performance_data.py
@@ -1501,9 +1501,12 @@ def _process_stats(
     # Mann-Whitney U test, two sided because we're never quite sure what of
    # the intent of the patch, as things stand
     # Tests the null hypothesis that the distributions of the two are identical
-    mann_whitney, mann_stat, mann_pvalue = stats.interpret_mann_whitneyu(
-        base_rev_data, new_rev_data, pvalue_threshold
-    )
+    (
+        mann_whitney,
+        mann_stat,
+        mann_pvalue,
+        is_significant,
+    ) = stats.interpret_mann_whitneyu(base_rev_data, new_rev_data, pvalue_threshold)
 
     delta_value = new_median - base_median
     delta_percentage = (delta_value / base_median * 100) if base_median != 0 else 0
@@ -1520,32 +1523,32 @@ def _process_stats(
     else:
         c_delta, _ = cliffs_delta(base_rev_data, new_rev_data)
 
-    cliffs_interpretation = stats.interpret_effect_size(c_delta)
-    direction, is_new_better = stats.is_new_better(delta_value, lower_is_better)
-
-    # Interpret effect size
-    effect_size = stats.interpret_effect_size(c_delta)
+    # interpret effect size
+    cliffs_interpretation, is_effect_meaningful = stats.interpret_effect_size(c_delta)
 
-    # returns CLES, direction
+    # returns CLES
     (
         cles_obj,
         cles,
-        is_significant,
         cles_explanation,
         mann_whitney_u_cles,
         cliffs_delta_cles,
+        is_base_greater,
     ) = stats.interpret_cles(
         mann_stat,
-        mann_pvalue,
         new_rev_data,
         base_rev_data,
-        cliffs_interpretation,
         c_delta,
+        cliffs_interpretation,
         lower_is_better,
-        pvalue_threshold,
     )
+
+    direction, is_new_better = stats.is_new_better(
+        is_effect_meaningful, is_base_greater, is_significant, lower_is_better
+    )
+
     if cles_obj:
-        cles_obj["effect_size"] = effect_size
+        cles_obj["effect_size"] = cliffs_interpretation
         cles_obj["cles_direction"] = direction
 
     # Compute KDE with Silverman bandwidth, and warn if multimodal.
@@ -1615,13 +1618,13 @@ def _process_stats(
         # short form summary based on former tests shapiro, silverman, etc...
         "is_fit_good": is_fit_good,
         "is_new_better": is_new_better,
-        "is_meaningful": is_significant,
+        "is_meaningful": is_effect_meaningful,
         "lower_is_better": lower_is_better,
         "is_regression": is_regression,
         "is_improvement": is_improvement,
         "more_runs_are_needed": more_runs_are_needed,
         "performance_intepretation": performance_intepretation,
-        "direction_of_change": direction,  # 'neutral', 'better', or 'worse'
+        "direction_of_change": direction,  # 'no change', 'improvement', or 'regression'
     }
 
     return stats_data
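
For reviewers, a minimal usage sketch of how the reworked helpers are intended to compose after this patch, mirroring the call order in `_process_stats`. The sample data, the `lower_is_better` flag, and the import lines are illustrative assumptions rather than part of the change; the function signatures and return shapes come from the patch itself.

```python
# Illustrative only: mirrors the post-patch call order in _process_stats.
# base_rev_data / new_rev_data are made-up samples; cliffs_delta is assumed to be
# the same helper performance_data.py already relies on.
from cliffs_delta import cliffs_delta

from treeherder.perf import stats

base_rev_data = [2.74, 2.80, 2.69, 2.77]
new_rev_data = [2.65, 2.61, 2.70, 2.66]
lower_is_better = True  # assumption: lower values mean better performance

# 1. Mann-Whitney U: the interpretation now also returns is_significant
mann_whitney, mann_stat, mann_pvalue, is_significant = stats.interpret_mann_whitneyu(
    base_rev_data, new_rev_data
)

# 2. Cliff's delta effect size: label plus is_effect_meaningful
c_delta, _ = cliffs_delta(base_rev_data, new_rev_data)
cliffs_interpretation, is_effect_meaningful = stats.interpret_effect_size(c_delta)

# 3. CLES: no longer takes the p-value; returns is_base_greater instead of is_significant
(
    cles_obj,
    cles,
    cles_explanation,
    mann_whitney_u_cles,
    cliffs_delta_cles,
    is_base_greater,
) = stats.interpret_cles(
    mann_stat,
    new_rev_data,
    base_rev_data,
    c_delta,
    cliffs_interpretation,
    lower_is_better,
)

# 4. Direction now combines effect size, CLES direction, and significance
direction, new_is_better = stats.is_new_better(
    is_effect_meaningful, is_base_greater, is_significant, lower_is_better
)
print(direction, new_is_better)  # e.g. "improvement", True
```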