From 30fd7efe40f7f39218c6f246be3eb432aeeb3104 Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Mon, 17 Nov 2025 22:15:43 -0500 Subject: [PATCH 01/12] add is new better update logic --- treeherder/perf/stats.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py index b62a340c1ec..2aeef8c3068 100644 --- a/treeherder/perf/stats.py +++ b/treeherder/perf/stats.py @@ -244,27 +244,34 @@ def interpret_mann_whitneyu(base, new): return mann_whitney, mann_stat, mann_pvalue -def is_new_better(delta_value, lower_is_better): - """This method returns if the new result is better or worse (even if unsure)""" - if delta_value is None: - direction = None +def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold=PVALUE_THRESHOLD): + """This method takes in CLES to measure if meaningful, Mann Whitney p-val for significance as well as Cliff's Delta for change""" + # Possibility Base > than New with a small amount or more significance + if cles > pvalue_threshold and abs(c_delta) > 0.33 and mann_pvalue < pvalue_threshold: + if lower_is_better: + is_new_better = True + direction = "better" + else: + is_new_better = False + direction = "worse" + # Possibility New > Base with a small amount or more significance + if cles < pvalue_threshold and abs(c_delta) > 0.33 and mann_pvalue < pvalue_threshold: + if lower_is_better: + is_new_better = False + direction = "worse" + else: + is_new_better = True + direction = "better" + else: is_new_better = None - is_new_better = None - if abs(delta_value) < 0.001: direction = "no change" - elif (lower_is_better and delta_value < 0) or (not lower_is_better and delta_value > 0): - direction = "better" - is_new_better = True - else: - direction = "worse" - is_new_better = False return direction, is_new_better def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD): if cles is None: return "CLES cannot be interpreted" - if cles >= pvalue_threshold: + if cles > pvalue_threshold: return f"{cles:.0%} chance a base value is greater than a new value" else: return f"{1 - cles:.0%} chance a new value is greater than a base value" From 80b2cdef8b8c378c2ebf5b67d3ef836f18e696ce Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Tue, 18 Nov 2025 01:00:25 -0500 Subject: [PATCH 02/12] add update to text --- treeherder/perf/stats.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py index 2aeef8c3068..f1a4d3b25c3 100644 --- a/treeherder/perf/stats.py +++ b/treeherder/perf/stats.py @@ -250,18 +250,18 @@ def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold= if cles > pvalue_threshold and abs(c_delta) > 0.33 and mann_pvalue < pvalue_threshold: if lower_is_better: is_new_better = True - direction = "better" + direction = "improvement" else: is_new_better = False - direction = "worse" + direction = "regression" # Possibility New > Base with a small amount or more significance if cles < pvalue_threshold and abs(c_delta) > 0.33 and mann_pvalue < pvalue_threshold: if lower_is_better: is_new_better = False - direction = "worse" + direction = "regression" else: is_new_better = True - direction = "better" + direction = "improvement" else: is_new_better = None direction = "no change" From b1a9a46ba7eff17315d055e3cf0ab9307faa351d Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Tue, 18 Nov 2025 09:11:25 -0500 Subject: [PATCH 03/12] add else logic to still render modes without calculating ci with different mode count --- treeherder/perf/stats.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py index 91f072a3aec..03efaf55707 100644 --- a/treeherder/perf/stats.py +++ b/treeherder/perf/stats.py @@ -462,9 +462,10 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better): is_improvement = None performance_intepretation = None modes = [] + base_intervals, base_peak_xs = find_mode_interval(x_base, y_base, base_peak_locs) + new_intervals, new_peak_xs = find_mode_interval(x_new, y_new, new_peak_locs) + if base_mode_count == new_mode_count: - base_intervals, base_peak_xs = find_mode_interval(x_base, y_base, base_peak_locs) - new_intervals, new_peak_xs = find_mode_interval(x_new, y_new, new_peak_locs) per_mode_new = split_per_mode(new_data, new_intervals) per_mode_base = split_per_mode(base_data, base_intervals) @@ -517,6 +518,32 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better): "ci_warning": ci_warning, } modes.append(mode_info) + else: + for i, interval in enumerate(base_intervals): + tup = interval + if len(tup) != 2: + return None, None, None, None, None, None + + start, end = tup + shift = 0 + ci_low = 0 + ci_high = 0 + median_shift_summary = ( + "Cannot measure shift, base mode count not equal to new mode count." + ) + mode_name = f"Mode {i + 1}" + mode_info = { + "mode_name": mode_name, + "mode_start": f"{start:.2f}" if start else None, + "mode_end": f"{end:.2f}" if end else None, + "median_shift_summary": median_shift_summary, + "ci_low": ci_low, + "ci_high": ci_high, + "shift": shift, + "shift_summary": performance_intepretation, + "ci_warning": ci_warning, + } + modes.append(mode_info) silverman_kde = { "bandwidth": "Silverman", From a829e16f1b78090e1d736267c063a536f657259c Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Tue, 18 Nov 2025 09:19:44 -0500 Subject: [PATCH 04/12] fix shift summary --- treeherder/perf/stats.py | 1 + 1 file changed, 1 insertion(+) diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py index 03efaf55707..a4771b0ef4f 100644 --- a/treeherder/perf/stats.py +++ b/treeherder/perf/stats.py @@ -531,6 +531,7 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better): median_shift_summary = ( "Cannot measure shift, base mode count not equal to new mode count." ) + shift = None mode_name = f"Mode {i + 1}" mode_info = { "mode_name": mode_name, From 5bbe303efdc74f796aa7fe5549ffbf7255d19afe Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Tue, 18 Nov 2025 12:34:27 -0500 Subject: [PATCH 05/12] fix shift summary, and cleaned up new stats is_new_better --- tests/webapp/api/test_perfcompare_api.py | 4 +- treeherder/perf/stats.py | 85 ++++++++++++++--------- treeherder/webapp/api/performance_data.py | 11 +-- 3 files changed, 62 insertions(+), 38 deletions(-) diff --git a/tests/webapp/api/test_perfcompare_api.py b/tests/webapp/api/test_perfcompare_api.py index c25492becc0..9f0a82cfff1 100644 --- a/tests/webapp/api/test_perfcompare_api.py +++ b/tests/webapp/api/test_perfcompare_api.py @@ -1182,7 +1182,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base( "is_improvement": None, "is_regression": None, "is_meaningful": None, - "is_new_better": False, + "is_new_better": None, "base_parent_signature": response["base_parent_signature"], "new_parent_signature": response["new_parent_signature"], "base_signature_id": response["base_signature_id"], @@ -1191,7 +1191,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base( "cles": None, "cliffs_delta": -1.0, "cliffs_interpretation": "large", - "direction_of_change": "worse", + "direction_of_change": "no change", "base_standard_stats": { "count": 1, "max": 32.4, diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py index a4771b0ef4f..982ddb85f4c 100644 --- a/treeherder/perf/stats.py +++ b/treeherder/perf/stats.py @@ -249,6 +249,15 @@ def interpret_ks_test(base, new, pvalue_threshold=PVALUE_THRESHOLD): return None, None, None +def mann_whitney_pval_significance(mann_pvalue, pvalue_threshold=PVALUE_THRESHOLD): + p_value_interpretation = None + if mann_pvalue > pvalue_threshold: + p_value_interpretation = "not significant" + if mann_pvalue <= pvalue_threshold: + p_value_interpretation = "significant" + return p_value_interpretation + + # Mann-Whitney U test # Tests the null hypothesis that the distributions patch and without patch are identical. # Null hypothesis is a statement that there is no significant difference or effect in population, calculates p-value @@ -259,11 +268,7 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD): mann_stat = float(mann_stat) if mann_stat else None mann_pvalue = float(mann_pvalue) if mann_pvalue else None # Mann-Whitney U p-value interpretation - p_value_interpretation = None - if mann_pvalue >= pvalue_threshold: - p_value_interpretation = "not significant" - if mann_pvalue < pvalue_threshold: - p_value_interpretation = "significant" + p_value_interpretation = mann_whitney_pval_significance(mann_pvalue, pvalue_threshold) mann_whitney = { "test_name": "Mann-Whitney U", @@ -274,10 +279,48 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD): return mann_whitney, mann_stat, mann_pvalue +# https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/ +def interpret_effect_size(delta): + if delta is None: + return "Effect cannot be interpreted" + if abs(delta) < 0.15: + return "negligible" + elif abs(delta) < 0.33: + return "small" + elif abs(delta) < 0.47: + return "moderate" + else: + return "large" + + +def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD): + greater_rev = None + if cles is None: + return "CLES cannot be interpreted", greater_rev + if cles > pvalue_threshold: + greater_rev = "base" + return f"{cles:.0%} chance a base value > a new value", greater_rev + if cles < pvalue_threshold: + greater_rev = "new" + return f"{1 - cles:.0%} chance a new value > base value", greater_rev + return "CLES cannot be interpreted", greater_rev + + def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold=PVALUE_THRESHOLD): """This method takes in CLES to measure if meaningful, Mann Whitney p-val for significance as well as Cliff's Delta for change""" # Possibility Base > than New with a small amount or more significance - if cles > pvalue_threshold and abs(c_delta) > 0.33 and mann_pvalue < pvalue_threshold: + cles_interpretation, greater_rev = interpret_cles_direction( + cles, pvalue_threshold=PVALUE_THRESHOLD + ) + effect_size = interpret_effect_size(c_delta) + effect_value_significance = ["small", "moderate", "large"] + p_value_interpretation = mann_whitney_pval_significance(mann_pvalue, pvalue_threshold) + + if ( + greater_rev == "base" + and any(effect_size in effect_value_significance) + and p_value_interpretation == "significant" + ): if lower_is_better: is_new_better = True direction = "improvement" @@ -285,7 +328,11 @@ def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold= is_new_better = False direction = "regression" # Possibility New > Base with a small amount or more significance - if cles < pvalue_threshold and abs(c_delta) > 0.33 and mann_pvalue < pvalue_threshold: + if ( + greater_rev == "new" + and any(effect_size in effect_value_significance) + and p_value_interpretation == "significant" + ): if lower_is_better: is_new_better = False direction = "regression" @@ -298,30 +345,6 @@ def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold= return direction, is_new_better -def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD): - if cles is None: - return "CLES cannot be interpreted" - if cles > pvalue_threshold: - return f"{cles:.0%} chance a base value > a new value" - if cles < pvalue_threshold: - return f"{1 - cles:.0%} chance a new value > base value" - return "CLES cannot be interpreted" - - -# https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/ -def interpret_effect_size(delta): - if delta is None: - return "Effect cannot be interpreted" - if abs(delta) < 0.15: - return "negligible" - elif abs(delta) < 0.33: - return "small" - elif abs(delta) < 0.47: - return "moderate" - else: - return "large" - - def interpret_performance_direction(ci_low, ci_high, lower_is_better): is_regression = False is_improvement = False diff --git a/treeherder/webapp/api/performance_data.py b/treeherder/webapp/api/performance_data.py index db81f22d31a..f1bb1782052 100644 --- a/treeherder/webapp/api/performance_data.py +++ b/treeherder/webapp/api/performance_data.py @@ -1521,12 +1521,8 @@ def _process_stats( c_delta, _ = cliffs_delta(base_rev_data, new_rev_data) cliffs_interpretation = stats.interpret_effect_size(c_delta) - direction, is_new_better = stats.is_new_better(delta_value, lower_is_better) - # Interpret effect size - effect_size = stats.interpret_effect_size(c_delta) - - # returns CLES, direction + # returns CLES ( cles_obj, cles, @@ -1544,6 +1540,11 @@ def _process_stats( lower_is_better, pvalue_threshold, ) + + # Interpret effect size + effect_size = stats.interpret_effect_size(c_delta) + direction, is_new_better = stats.is_new_better(c_delta, cles, mann_pvalue, lower_is_better) + if cles_obj: cles_obj["effect_size"] = effect_size cles_obj["cles_direction"] = direction From 0ebb56cf65628b76319ae5a73443ee353ed0c5c1 Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Wed, 19 Nov 2025 00:26:22 -0500 Subject: [PATCH 06/12] refactor is_new_better along with interpret_silverman_kde --- tests/perf/test_stats.py | 26 ++-- tests/webapp/api/test_perfcompare_api.py | 4 +- treeherder/perf/stats.py | 153 +++++++++------------- treeherder/webapp/api/performance_data.py | 28 ++-- 4 files changed, 92 insertions(+), 119 deletions(-) diff --git a/tests/perf/test_stats.py b/tests/perf/test_stats.py index 6222e8f01c3..e032c2e0f70 100644 --- a/tests/perf/test_stats.py +++ b/tests/perf/test_stats.py @@ -59,27 +59,21 @@ def test_interpret_cles(): mock_base = [2.74] mock_new = [2.65] mock_mann_stat = 0.1 - mock_mann_pvalue = 0.2 interpretation = ("",) lower_is_better = (False,) mock_delta = 0.2 - ( - cles_obj, - cles, - is_significant, - cles_explanation, - mann_whitney_u_cles, - cliffs_delta_cles, - ) = interpret_cles( - mock_mann_stat, - mock_mann_pvalue, - mock_new, - mock_base, - mock_delta, - interpretation, - lower_is_better, + (cles_obj, cles, cles_explanation, mann_whitney_u_cles, cliffs_delta_cles, is_base_greater) = ( + interpret_cles( + mock_mann_stat, + mock_new, + mock_base, + mock_delta, + interpretation, + lower_is_better, + ) ) assert cles_obj["cles"] == 0.1 assert cles == 0.1 + assert is_base_greater is None diff --git a/tests/webapp/api/test_perfcompare_api.py b/tests/webapp/api/test_perfcompare_api.py index 9f0a82cfff1..763f5d5cf1b 100644 --- a/tests/webapp/api/test_perfcompare_api.py +++ b/tests/webapp/api/test_perfcompare_api.py @@ -1181,7 +1181,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base( "is_fit_good": True, "is_improvement": None, "is_regression": None, - "is_meaningful": None, + "is_meaningful": True, "is_new_better": None, "base_parent_signature": response["base_parent_signature"], "new_parent_signature": response["new_parent_signature"], @@ -1269,7 +1269,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base( "ci_high": None, "ci_low": None, "ci_warning": None, - "median_shift_summary": None, + "median_shift_summary": "Cannot measure shift, base mode count not equal to new mode count", "mode_end": "36.47", "mode_name": "Mode 1", "mode_start": "28.33", diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py index 982ddb85f4c..eb1cfb048b1 100644 --- a/treeherder/perf/stats.py +++ b/treeherder/perf/stats.py @@ -251,11 +251,14 @@ def interpret_ks_test(base, new, pvalue_threshold=PVALUE_THRESHOLD): def mann_whitney_pval_significance(mann_pvalue, pvalue_threshold=PVALUE_THRESHOLD): p_value_interpretation = None + is_significant = False + if mann_pvalue > pvalue_threshold: p_value_interpretation = "not significant" if mann_pvalue <= pvalue_threshold: + is_significant = True p_value_interpretation = "significant" - return p_value_interpretation + return p_value_interpretation, is_significant # Mann-Whitney U test @@ -268,7 +271,9 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD): mann_stat = float(mann_stat) if mann_stat else None mann_pvalue = float(mann_pvalue) if mann_pvalue else None # Mann-Whitney U p-value interpretation - p_value_interpretation = mann_whitney_pval_significance(mann_pvalue, pvalue_threshold) + p_value_interpretation, is_significant = mann_whitney_pval_significance( + mann_pvalue, pvalue_threshold + ) mann_whitney = { "test_name": "Mann-Whitney U", @@ -276,51 +281,43 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD): "pvalue": mann_pvalue, "interpretation": p_value_interpretation, } - return mann_whitney, mann_stat, mann_pvalue + return mann_whitney, mann_stat, mann_pvalue, is_significant # https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/ def interpret_effect_size(delta): + is_effect_meaningful = False if delta is None: - return "Effect cannot be interpreted" + return "Effect cannot be interpreted", is_effect_meaningful if abs(delta) < 0.15: - return "negligible" - elif abs(delta) < 0.33: - return "small" - elif abs(delta) < 0.47: - return "moderate" + return "negligible", is_effect_meaningful + if abs(delta) < 0.33: + is_effect_meaningful = True + return "small", is_effect_meaningful + if abs(delta) < 0.47: + is_effect_meaningful = True + return "moderate", is_effect_meaningful else: - return "large" + is_effect_meaningful = True + return "large", is_effect_meaningful def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD): - greater_rev = None + is_base_greater = None if cles is None: - return "CLES cannot be interpreted", greater_rev - if cles > pvalue_threshold: - greater_rev = "base" - return f"{cles:.0%} chance a base value > a new value", greater_rev - if cles < pvalue_threshold: - greater_rev = "new" - return f"{1 - cles:.0%} chance a new value > base value", greater_rev - return "CLES cannot be interpreted", greater_rev - - -def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold=PVALUE_THRESHOLD): - """This method takes in CLES to measure if meaningful, Mann Whitney p-val for significance as well as Cliff's Delta for change""" + return "CLES cannot be interpreted", is_base_greater + elif cles > pvalue_threshold: + is_base_greater = True + return f"{cles:.0%} chance a base value > a new value", is_base_greater + elif cles < pvalue_threshold: + is_base_greater = False + return f"{1 - cles:.0%} chance a new value > base value", is_base_greater + return "CLES cannot be interpreted", is_base_greater + + +def is_new_better(is_effect_meaningful, is_base_greater, is_significant, lower_is_better): # Possibility Base > than New with a small amount or more significance - cles_interpretation, greater_rev = interpret_cles_direction( - cles, pvalue_threshold=PVALUE_THRESHOLD - ) - effect_size = interpret_effect_size(c_delta) - effect_value_significance = ["small", "moderate", "large"] - p_value_interpretation = mann_whitney_pval_significance(mann_pvalue, pvalue_threshold) - - if ( - greater_rev == "base" - and any(effect_size in effect_value_significance) - and p_value_interpretation == "significant" - ): + if is_base_greater and is_effect_meaningful and is_significant: if lower_is_better: is_new_better = True direction = "improvement" @@ -328,11 +325,7 @@ def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold= is_new_better = False direction = "regression" # Possibility New > Base with a small amount or more significance - if ( - greater_rev == "new" - and any(effect_size in effect_value_significance) - and p_value_interpretation == "significant" - ): + elif (is_base_greater is False) and is_effect_meaningful and is_significant: if lower_is_better: is_new_better = False direction = "regression" @@ -377,13 +370,11 @@ def interpret_performance_direction(ci_low, ci_high, lower_is_better): # Common Language Effect Size, and its interpretation in english def interpret_cles( mann_stat, - mann_pvalue, new_revision, base_revision, delta, interpretation, lower_is_better, - pvalue_threshold=PVALUE_THRESHOLD, ): try: cles = None @@ -403,9 +394,8 @@ def interpret_cles( else: mann_whitney_u_cles = "" - is_significant = False if mann_pvalue > pvalue_threshold else True # Generate CLES explanation - cles_explanation = interpret_cles_direction(cles) if cles else "" + cles_explanation, is_base_greater = interpret_cles_direction(cles) if cles else "", None # Cliff's delta CLES cliffs_delta_cles = f"Cliff's Delta: {delta:.2f} → {interpretation}" if delta else "" @@ -419,10 +409,10 @@ def interpret_cles( return ( cles_obj, cles, - is_significant, cles_explanation, mann_whitney_u_cles, cliffs_delta_cles, + is_base_greater, ) except Exception: return None, None, None, None, None, None @@ -487,22 +477,35 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better): modes = [] base_intervals, base_peak_xs = find_mode_interval(x_base, y_base, base_peak_locs) new_intervals, new_peak_xs = find_mode_interval(x_new, y_new, new_peak_locs) - - if base_mode_count == new_mode_count: - per_mode_new = split_per_mode(new_data, new_intervals) - per_mode_base = split_per_mode(base_data, base_intervals) - - for i, interval in enumerate(base_intervals): - tup = interval - if len(tup) != 2: - return None, None, None, None, None, None - - start, end = tup - shift = 0 - ci_low = 0 - ci_high = 0 - median_shift_summary = None - mode_name = f"Mode {i + 1}" + for i, interval in enumerate(base_intervals): + tup = interval + if len(tup) != 2: + return None, None, None, None, None, None + + start, end = tup + shift = 0 + ci_low = 0 + ci_high = 0 + median_shift_summary = ( + "Cannot measure shift, base mode count not equal to new mode count" + ) + shift = None + mode_name = f"Mode {i + 1}" + mode_info = { + "mode_name": mode_name, + "mode_start": f"{start:.2f}" if start else None, + "mode_end": f"{end:.2f}" if end else None, + "median_shift_summary": median_shift_summary, + "ci_low": ci_low, + "ci_high": ci_high, + "shift": shift, + "shift_summary": performance_intepretation, + "ci_warning": ci_warning, + } + + if base_mode_count == new_mode_count: + per_mode_new = split_per_mode(new_data, new_intervals) + per_mode_base = split_per_mode(base_data, base_intervals) try: ref_vals = [val for val, mode in zip(base_data, per_mode_base) if mode == i] @@ -540,33 +543,7 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better): "shift_summary": performance_intepretation, "ci_warning": ci_warning, } - modes.append(mode_info) - else: - for i, interval in enumerate(base_intervals): - tup = interval - if len(tup) != 2: - return None, None, None, None, None, None - - start, end = tup - shift = 0 - ci_low = 0 - ci_high = 0 - median_shift_summary = ( - "Cannot measure shift, base mode count not equal to new mode count." - ) - shift = None - mode_name = f"Mode {i + 1}" - mode_info = { - "mode_name": mode_name, - "mode_start": f"{start:.2f}" if start else None, - "mode_end": f"{end:.2f}" if end else None, - "median_shift_summary": median_shift_summary, - "ci_low": ci_low, - "ci_high": ci_high, - "shift": shift, - "shift_summary": performance_intepretation, - "ci_warning": ci_warning, - } + modes.append(mode_info) silverman_kde = { diff --git a/treeherder/webapp/api/performance_data.py b/treeherder/webapp/api/performance_data.py index f1bb1782052..c423c46c026 100644 --- a/treeherder/webapp/api/performance_data.py +++ b/treeherder/webapp/api/performance_data.py @@ -1501,9 +1501,12 @@ def _process_stats( # Mann-Whitney U test, two sided because we're never quite sure what of # the intent of the patch, as things stand # Tests the null hypothesis that the distributions of the two are identical - mann_whitney, mann_stat, mann_pvalue = stats.interpret_mann_whitneyu( - base_rev_data, new_rev_data, pvalue_threshold - ) + ( + mann_whitney, + mann_stat, + mann_pvalue, + is_significant, + ) = stats.interpret_mann_whitneyu(base_rev_data, new_rev_data, pvalue_threshold) delta_value = new_median - base_median delta_percentage = (delta_value / base_median * 100) if base_median != 0 else 0 @@ -1520,33 +1523,32 @@ def _process_stats( else: c_delta, _ = cliffs_delta(base_rev_data, new_rev_data) - cliffs_interpretation = stats.interpret_effect_size(c_delta) + # interpret effect size + cliffs_interpretation, is_effect_meaningful = stats.interpret_effect_size(c_delta) # returns CLES ( cles_obj, cles, - is_significant, cles_explanation, mann_whitney_u_cles, cliffs_delta_cles, + is_base_greater, ) = stats.interpret_cles( mann_stat, - mann_pvalue, new_rev_data, base_rev_data, - cliffs_interpretation, c_delta, + cliffs_interpretation, lower_is_better, - pvalue_threshold, ) - # Interpret effect size - effect_size = stats.interpret_effect_size(c_delta) - direction, is_new_better = stats.is_new_better(c_delta, cles, mann_pvalue, lower_is_better) + direction, is_new_better = stats.is_new_better( + is_effect_meaningful, is_base_greater, is_significant, lower_is_better + ) if cles_obj: - cles_obj["effect_size"] = effect_size + cles_obj["effect_size"] = cliffs_interpretation cles_obj["cles_direction"] = direction # Compute KDE with Silverman bandwidth, and warn if multimodal. @@ -1616,7 +1618,7 @@ def _process_stats( # short form summary based on former tests shapiro, silverman, etc... "is_fit_good": is_fit_good, "is_new_better": is_new_better, - "is_meaningful": is_significant, + "is_meaningful": is_effect_meaningful, "lower_is_better": lower_is_better, "is_regression": is_regression, "is_improvement": is_improvement, From 35248b8a29c814c9ee1b5ef943ab086442c81029 Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Wed, 19 Nov 2025 05:36:53 -0500 Subject: [PATCH 07/12] get rid of else condition in is_new_better --- treeherder/perf/stats.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py index eb1cfb048b1..b482afc2e80 100644 --- a/treeherder/perf/stats.py +++ b/treeherder/perf/stats.py @@ -316,6 +316,8 @@ def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD): def is_new_better(is_effect_meaningful, is_base_greater, is_significant, lower_is_better): + is_new_better = None + direction = "no change" # Possibility Base > than New with a small amount or more significance if is_base_greater and is_effect_meaningful and is_significant: if lower_is_better: @@ -332,9 +334,6 @@ def is_new_better(is_effect_meaningful, is_base_greater, is_significant, lower_i else: is_new_better = True direction = "improvement" - else: - is_new_better = None - direction = "no change" return direction, is_new_better From a7215c397c8a3ab31f6f1ee3ea9a9306a507a40d Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Wed, 19 Nov 2025 12:32:52 -0500 Subject: [PATCH 08/12] fix CLES interpretation logic --- treeherder/perf/stats.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py index b482afc2e80..fe5e8c6809e 100644 --- a/treeherder/perf/stats.py +++ b/treeherder/perf/stats.py @@ -8,6 +8,8 @@ from scipy.stats import bootstrap, iqr, ks_2samp, mannwhitneyu # New Stats Code +# various formulas extracted from here: +# https://colab.research.google.com/gist/padenot/2a408f0a39e269977045fc2fb265663b/end-to-end.ipynb#scrollTo=M8WY0yVIX5Ru&uniqifier=1 # p-value threshold to use throughout PVALUE_THRESHOLD = 0.05 @@ -302,14 +304,18 @@ def interpret_effect_size(delta): return "large", is_effect_meaningful -def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD): +def interpret_cles_direction(cles): + # probability that a randomly selected score from one group will be greater than a randomly selected score from a second group + # A CLES of 0.5 indicates a 50% chance for either outcome, 50/50 toss up, no change + # A CLES of 0.6 would mean there is a 60% chance that a score Base > New + # A CLES of 0.4 would mean there is a 40% chance that a score from Base > New, or a 60% chance that a score from New > Base is_base_greater = None if cles is None: return "CLES cannot be interpreted", is_base_greater - elif cles > pvalue_threshold: + elif cles > 0.5: is_base_greater = True return f"{cles:.0%} chance a base value > a new value", is_base_greater - elif cles < pvalue_threshold: + elif cles < 0.5: is_base_greater = False return f"{1 - cles:.0%} chance a new value > base value", is_base_greater return "CLES cannot be interpreted", is_base_greater From 69b16382a2ecc6a8e7e532a25e77a86ad89e0057 Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Wed, 19 Nov 2025 14:41:06 -0500 Subject: [PATCH 09/12] some cleanup interpret_effect_size, append modeInfo, and others --- treeherder/perf/stats.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py index fe5e8c6809e..2f3bb281333 100644 --- a/treeherder/perf/stats.py +++ b/treeherder/perf/stats.py @@ -252,7 +252,7 @@ def interpret_ks_test(base, new, pvalue_threshold=PVALUE_THRESHOLD): def mann_whitney_pval_significance(mann_pvalue, pvalue_threshold=PVALUE_THRESHOLD): - p_value_interpretation = None + p_value_interpretation = "" is_significant = False if mann_pvalue > pvalue_threshold: @@ -288,20 +288,16 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD): # https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/ def interpret_effect_size(delta): - is_effect_meaningful = False + is_effect_meaningful = True if delta is None: - return "Effect cannot be interpreted", is_effect_meaningful + return "Effect cannot be interpreted" if abs(delta) < 0.15: - return "negligible", is_effect_meaningful + return "negligible" if abs(delta) < 0.33: - is_effect_meaningful = True return "small", is_effect_meaningful if abs(delta) < 0.47: - is_effect_meaningful = True return "moderate", is_effect_meaningful - else: - is_effect_meaningful = True - return "large", is_effect_meaningful + return "large", is_effect_meaningful def interpret_cles_direction(cles): @@ -400,7 +396,7 @@ def interpret_cles( mann_whitney_u_cles = "" # Generate CLES explanation - cles_explanation, is_base_greater = interpret_cles_direction(cles) if cles else "", None + cles_explanation, is_base_greater = interpret_cles_direction(cles) # Cliff's delta CLES cliffs_delta_cles = f"Cliff's Delta: {delta:.2f} → {interpretation}" if delta else "" @@ -483,11 +479,10 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better): base_intervals, base_peak_xs = find_mode_interval(x_base, y_base, base_peak_locs) new_intervals, new_peak_xs = find_mode_interval(x_new, y_new, new_peak_locs) for i, interval in enumerate(base_intervals): - tup = interval - if len(tup) != 2: + if len(interval) != 2: return None, None, None, None, None, None - start, end = tup + start, end = interval shift = 0 ci_low = 0 ci_high = 0 @@ -549,7 +544,7 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better): "ci_warning": ci_warning, } - modes.append(mode_info) + modes.append(mode_info) silverman_kde = { "bandwidth": "Silverman", From 4b5118ec68e0589ca18ea3dc38f75ac9d5802830 Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Wed, 19 Nov 2025 15:00:04 -0500 Subject: [PATCH 10/12] get rid of else case in is_new_better --- tests/perf/test_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/perf/test_stats.py b/tests/perf/test_stats.py index e032c2e0f70..fc162bc3074 100644 --- a/tests/perf/test_stats.py +++ b/tests/perf/test_stats.py @@ -76,4 +76,4 @@ def test_interpret_cles(): assert cles_obj["cles"] == 0.1 assert cles == 0.1 - assert is_base_greater is None + assert is_base_greater is False From ac218159d718f6cc2b1e2392b2128f923f9e82dc Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Wed, 19 Nov 2025 15:53:24 -0500 Subject: [PATCH 11/12] revert back interpret_effect_size --- treeherder/perf/stats.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py index 2f3bb281333..a73dc82018c 100644 --- a/treeherder/perf/stats.py +++ b/treeherder/perf/stats.py @@ -288,15 +288,18 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD): # https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/ def interpret_effect_size(delta): - is_effect_meaningful = True + is_effect_meaningful = False if delta is None: - return "Effect cannot be interpreted" + return "Effect cannot be interpreted", is_effect_meaningful if abs(delta) < 0.15: - return "negligible" + return "negligible", is_effect_meaningful if abs(delta) < 0.33: + is_effect_meaningful = True return "small", is_effect_meaningful if abs(delta) < 0.47: + is_effect_meaningful = True return "moderate", is_effect_meaningful + is_effect_meaningful = True return "large", is_effect_meaningful From f728dee6d06cb482d8b063a9b029693fe6db2d9b Mon Sep 17 00:00:00 2001 From: Vi Tran Date: Wed, 19 Nov 2025 16:22:15 -0500 Subject: [PATCH 12/12] fix interpret_effect_size and comments on direction_of_change return values --- treeherder/perf/stats.py | 4 +--- treeherder/webapp/api/performance_data.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py index a73dc82018c..95ca61d91ef 100644 --- a/treeherder/perf/stats.py +++ b/treeherder/perf/stats.py @@ -293,13 +293,11 @@ def interpret_effect_size(delta): return "Effect cannot be interpreted", is_effect_meaningful if abs(delta) < 0.15: return "negligible", is_effect_meaningful + is_effect_meaningful = True if abs(delta) < 0.33: - is_effect_meaningful = True return "small", is_effect_meaningful if abs(delta) < 0.47: - is_effect_meaningful = True return "moderate", is_effect_meaningful - is_effect_meaningful = True return "large", is_effect_meaningful diff --git a/treeherder/webapp/api/performance_data.py b/treeherder/webapp/api/performance_data.py index c423c46c026..45e00eb46d3 100644 --- a/treeherder/webapp/api/performance_data.py +++ b/treeherder/webapp/api/performance_data.py @@ -1624,7 +1624,7 @@ def _process_stats( "is_improvement": is_improvement, "more_runs_are_needed": more_runs_are_needed, "performance_intepretation": performance_intepretation, - "direction_of_change": direction, # 'neutral', 'better', or 'worse' + "direction_of_change": direction, # 'no change', 'improvement', or 'regression' } return stats_data