Fix shift summary and clean up new stats is_new_better

misspran · misspran · commit 5bbe303efdc7 · 2025-11-18T12:34:27.000-05:00
diff --git a/tests/webapp/api/test_perfcompare_api.py b/tests/webapp/api/test_perfcompare_api.py
@@ -1182,7 +1182,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
             "is_improvement": None,
             "is_regression": None,
             "is_meaningful": None,
-            "is_new_better": False,
+            "is_new_better": None,
             "base_parent_signature": response["base_parent_signature"],
             "new_parent_signature": response["new_parent_signature"],
             "base_signature_id": response["base_signature_id"],
@@ -1191,7 +1191,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
             "cles": None,
             "cliffs_delta": -1.0,
             "cliffs_interpretation": "large",
-            "direction_of_change": "worse",
+            "direction_of_change": "no change",
             "base_standard_stats": {
                 "count": 1,
                 "max": 32.4,
diff --git a/treeherder/perf/stats.py b/treeherder/perf/stats.py
@@ -249,6 +249,15 @@ def interpret_ks_test(base, new, pvalue_threshold=PVALUE_THRESHOLD):
         return None, None, None
 
 
+def mann_whitney_pval_significance(mann_pvalue, pvalue_threshold=PVALUE_THRESHOLD):
+    """Label a p-value "significant"/"not significant"; None when it is missing or NaN."""
+    if mann_pvalue is None:
+        return None
+    if mann_pvalue > pvalue_threshold:
+        return "not significant"
+    return "significant" if mann_pvalue <= pvalue_threshold else None
+
+
 # Mann-Whitney U test
 # Tests the null hypothesis that the distributions patch and without patch are identical.
 # Null hypothesis is a statement that there is no significant difference or effect in population, calculates p-value
@@ -259,11 +268,7 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
     mann_stat = float(mann_stat) if mann_stat else None
     mann_pvalue = float(mann_pvalue) if mann_pvalue else None
     # Mann-Whitney U  p-value interpretation
-    p_value_interpretation = None
-    if mann_pvalue >= pvalue_threshold:
-        p_value_interpretation = "not significant"
-    if mann_pvalue < pvalue_threshold:
-        p_value_interpretation = "significant"
+    p_value_interpretation = mann_whitney_pval_significance(mann_pvalue, pvalue_threshold)
 
     mann_whitney = {
         "test_name": "Mann-Whitney U",
@@ -274,18 +279,60 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
     return mann_whitney, mann_stat, mann_pvalue
 
 
+# https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/
+def interpret_effect_size(delta):
+    """Map a Cliff's delta value to a qualitative effect-size label."""
+    if delta is None:
+        return "Effect cannot be interpreted"
+    magnitude = abs(delta)
+    bands = ((0.15, "negligible"), (0.33, "small"), (0.47, "moderate"))
+    # Upper bounds are exclusive; anything at or above the last bound is "large".
+    for upper_bound, label in bands:
+        if magnitude < upper_bound:
+            return label
+    return "large"
+
+
+def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD):
+    """Describe a CLES value and name the revision whose values tend to be greater.
+
+    Returns a (description, greater_rev) pair; greater_rev is "base", "new" or None.
+    """
+    # NOTE(review): CLES is compared against pvalue_threshold — confirm this cutoff is intended.
+    if cles is not None and cles > pvalue_threshold:
+        return f"{cles:.0%} chance a base value > a new value", "base"
+    if cles is not None and cles < pvalue_threshold:
+        return f"{1 - cles:.0%} chance a new value > base value", "new"
+    return "CLES cannot be interpreted", None
+
+
 def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold=PVALUE_THRESHOLD):
     """This method takes in CLES to measure if meaningful, Mann Whitney p-val for significance as well as Cliff's Delta for change"""
     # Possibility Base > than New with a small amount or more significance
-    if cles > pvalue_threshold and abs(c_delta) > 0.33 and mann_pvalue < pvalue_threshold:
+    # Pass the caller's threshold through so a non-default pvalue_threshold is honoured.
+    _, greater_rev = interpret_cles_direction(cles, pvalue_threshold=pvalue_threshold)
+    effect_size = interpret_effect_size(c_delta)
+    effect_value_significance = ["small", "moderate", "large"]
+    p_value_interpretation = mann_whitney_pval_significance(mann_pvalue, pvalue_threshold)
+
+    # Membership test: any() needs an iterable and raises TypeError when handed a bool.
+    if (
+        greater_rev == "base"
+        and effect_size in effect_value_significance
+        and p_value_interpretation == "significant"
+    ):
         if lower_is_better:
             is_new_better = True
             direction = "improvement"
         else:
             is_new_better = False
             direction = "regression"
     # Possibility New > Base with a small amount or more significance
-    if cles < pvalue_threshold and abs(c_delta) > 0.33 and mann_pvalue < pvalue_threshold:
+    if (
+        greater_rev == "new"
+        and effect_size in effect_value_significance
+        and p_value_interpretation == "significant"
+    ):
         if lower_is_better:
             is_new_better = False
             direction = "regression"
@@ -298,30 +345,6 @@ def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold=
     return direction, is_new_better
 
 
-def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD):
-    if cles is None:
-        return "CLES cannot be interpreted"
-    if cles > pvalue_threshold:
-        return f"{cles:.0%} chance a base value > a new value"
-    if cles < pvalue_threshold:
-        return f"{1 - cles:.0%} chance a new value > base value"
-    return "CLES cannot be interpreted"
-
-
-# https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/
-def interpret_effect_size(delta):
-    if delta is None:
-        return "Effect cannot be interpreted"
-    if abs(delta) < 0.15:
-        return "negligible"
-    elif abs(delta) < 0.33:
-        return "small"
-    elif abs(delta) < 0.47:
-        return "moderate"
-    else:
-        return "large"
-
-
 def interpret_performance_direction(ci_low, ci_high, lower_is_better):
     is_regression = False
     is_improvement = False
diff --git a/treeherder/webapp/api/performance_data.py b/treeherder/webapp/api/performance_data.py
@@ -1521,12 +1521,8 @@ def _process_stats(
             c_delta, _ = cliffs_delta(base_rev_data, new_rev_data)
 
         cliffs_interpretation = stats.interpret_effect_size(c_delta)
-        direction, is_new_better = stats.is_new_better(delta_value, lower_is_better)
 
-        # Interpret effect size
-        effect_size = stats.interpret_effect_size(c_delta)
-
-        # returns CLES, direction
+        # returns CLES
         (
             cles_obj,
             cles,
@@ -1544,6 +1540,11 @@ def _process_stats(
             lower_is_better,
             pvalue_threshold,
         )
+
+        # Interpret effect size
+        effect_size = stats.interpret_effect_size(c_delta)
+        direction, is_new_better = stats.is_new_better(c_delta, cles, mann_pvalue, lower_is_better)
+
         if cles_obj:
             cles_obj["effect_size"] = effect_size
             cles_obj["cles_direction"] = direction