Skip to content

Commit 5bbe303

Browse files
committed
fix shift summary, and cleaned up new stats is_new_better
1 parent a829e16 commit 5bbe303

File tree

3 files changed

+62
-38
lines changed

3 files changed

+62
-38
lines changed

tests/webapp/api/test_perfcompare_api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1182,7 +1182,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
11821182
"is_improvement": None,
11831183
"is_regression": None,
11841184
"is_meaningful": None,
1185-
"is_new_better": False,
1185+
"is_new_better": None,
11861186
"base_parent_signature": response["base_parent_signature"],
11871187
"new_parent_signature": response["new_parent_signature"],
11881188
"base_signature_id": response["base_signature_id"],
@@ -1191,7 +1191,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
11911191
"cles": None,
11921192
"cliffs_delta": -1.0,
11931193
"cliffs_interpretation": "large",
1194-
"direction_of_change": "worse",
1194+
"direction_of_change": "no change",
11951195
"base_standard_stats": {
11961196
"count": 1,
11971197
"max": 32.4,

treeherder/perf/stats.py

Lines changed: 54 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,15 @@ def interpret_ks_test(base, new, pvalue_threshold=PVALUE_THRESHOLD):
249249
return None, None, None
250250

251251

252+
def mann_whitney_pval_significance(mann_pvalue, pvalue_threshold=PVALUE_THRESHOLD):
    """Classify a Mann-Whitney U p-value as significant or not significant.

    Args:
        mann_pvalue: p-value produced by the test, or None when no p-value
            is available.
        pvalue_threshold: significance cutoff; a p-value at or below this
            value is reported as significant.

    Returns:
        "significant", "not significant", or None when the p-value is
        missing or cannot be ordered against the threshold.
    """
    # interpret_mann_whitneyu hands over None when the computed p-value is
    # falsy; comparing None with a number would raise TypeError.
    if mann_pvalue is None:
        return None
    if mann_pvalue <= pvalue_threshold:
        return "significant"
    if mann_pvalue > pvalue_threshold:
        return "not significant"
    # Only reached for values that compare False both ways (e.g. NaN).
    return None
259+
260+
252261
# Mann-Whitney U test
253262
# Tests the null hypothesis that the distributions with and without the patch are identical.
254263
# The null hypothesis states that there is no significant difference or effect in the population; the test calculates a p-value
@@ -259,11 +268,7 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
259268
mann_stat = float(mann_stat) if mann_stat else None
260269
mann_pvalue = float(mann_pvalue) if mann_pvalue else None
261270
# Mann-Whitney U p-value interpretation
262-
p_value_interpretation = None
263-
if mann_pvalue >= pvalue_threshold:
264-
p_value_interpretation = "not significant"
265-
if mann_pvalue < pvalue_threshold:
266-
p_value_interpretation = "significant"
271+
p_value_interpretation = mann_whitney_pval_significance(mann_pvalue, pvalue_threshold)
267272

268273
mann_whitney = {
269274
"test_name": "Mann-Whitney U",
@@ -274,18 +279,60 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
274279
return mann_whitney, mann_stat, mann_pvalue
275280

276281

282+
# https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/
283+
def interpret_effect_size(delta):
    """Translate an effect-size delta into a qualitative label.

    Only the magnitude of ``delta`` matters; its sign is ignored.

    Args:
        delta: numeric effect size, or None when it could not be computed.

    Returns:
        "negligible" for |delta| < 0.15, "small" for |delta| < 0.33,
        "moderate" for |delta| < 0.47, "large" otherwise, or
        "Effect cannot be interpreted" when delta is None or NaN.
    """
    import math

    # NaN fails every "<" comparison below and would otherwise be
    # reported as "large", so treat it like a missing value.
    if delta is None or math.isnan(delta):
        return "Effect cannot be interpreted"

    magnitude = abs(delta)
    if magnitude < 0.15:
        return "negligible"
    if magnitude < 0.33:
        return "small"
    if magnitude < 0.47:
        return "moderate"
    return "large"
294+
295+
296+
def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD):
    """Describe which revision tends to produce the larger values.

    Args:
        cles: the CLES value, or None when it is unavailable.
        pvalue_threshold: cutoff that ``cles`` is compared against.

    Returns:
        A ``(message, greater_rev)`` tuple. ``greater_rev`` is "base" when
        ``cles`` is above the cutoff, "new" when it is below, and None when
        ``cles`` is None or neither comparison holds.
    """
    uninterpretable = ("CLES cannot be interpreted", None)
    if cles is None:
        return uninterpretable

    # NOTE(review): cles is compared against the p-value threshold here;
    # confirm this cutoff is the intended one for a CLES value.
    if cles > pvalue_threshold:
        return f"{cles:.0%} chance a base value > a new value", "base"
    if cles < pvalue_threshold:
        return f"{1 - cles:.0%} chance a new value > base value", "new"
    return uninterpretable
307+
308+
277309
def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold=PVALUE_THRESHOLD):
278310
"""This method takes in CLES to measure if meaningful, Mann Whitney p-val for significance as well as Cliff's Delta for change"""
279311
# Possibility Base > than New with a small amount or more significance
280-
if cles > pvalue_threshold and abs(c_delta) > 0.33 and mann_pvalue < pvalue_threshold:
312+
cles_interpretation, greater_rev = interpret_cles_direction(
313+
cles, pvalue_threshold=PVALUE_THRESHOLD
314+
)
315+
effect_size = interpret_effect_size(c_delta)
316+
effect_value_significance = ["small", "moderate", "large"]
317+
p_value_interpretation = mann_whitney_pval_significance(mann_pvalue, pvalue_threshold)
318+
319+
if (
320+
greater_rev == "base"
321+
and any(effect_size in effect_value_significance)
322+
and p_value_interpretation == "significant"
323+
):
281324
if lower_is_better:
282325
is_new_better = True
283326
direction = "improvement"
284327
else:
285328
is_new_better = False
286329
direction = "regression"
287330
# Possibility New > Base with a small amount or more significance
288-
if cles < pvalue_threshold and abs(c_delta) > 0.33 and mann_pvalue < pvalue_threshold:
331+
if (
332+
greater_rev == "new"
333+
and any(effect_size in effect_value_significance)
334+
and p_value_interpretation == "significant"
335+
):
289336
if lower_is_better:
290337
is_new_better = False
291338
direction = "regression"
@@ -298,30 +345,6 @@ def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold=
298345
return direction, is_new_better
299346

300347

301-
def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD):
302-
if cles is None:
303-
return "CLES cannot be interpreted"
304-
if cles > pvalue_threshold:
305-
return f"{cles:.0%} chance a base value > a new value"
306-
if cles < pvalue_threshold:
307-
return f"{1 - cles:.0%} chance a new value > base value"
308-
return "CLES cannot be interpreted"
309-
310-
311-
# https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/
312-
def interpret_effect_size(delta):
313-
if delta is None:
314-
return "Effect cannot be interpreted"
315-
if abs(delta) < 0.15:
316-
return "negligible"
317-
elif abs(delta) < 0.33:
318-
return "small"
319-
elif abs(delta) < 0.47:
320-
return "moderate"
321-
else:
322-
return "large"
323-
324-
325348
def interpret_performance_direction(ci_low, ci_high, lower_is_better):
326349
is_regression = False
327350
is_improvement = False

treeherder/webapp/api/performance_data.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1521,12 +1521,8 @@ def _process_stats(
15211521
c_delta, _ = cliffs_delta(base_rev_data, new_rev_data)
15221522

15231523
cliffs_interpretation = stats.interpret_effect_size(c_delta)
1524-
direction, is_new_better = stats.is_new_better(delta_value, lower_is_better)
15251524

1526-
# Interpret effect size
1527-
effect_size = stats.interpret_effect_size(c_delta)
1528-
1529-
# returns CLES, direction
1525+
# returns CLES
15301526
(
15311527
cles_obj,
15321528
cles,
@@ -1544,6 +1540,11 @@ def _process_stats(
15441540
lower_is_better,
15451541
pvalue_threshold,
15461542
)
1543+
1544+
# Interpret effect size
1545+
effect_size = stats.interpret_effect_size(c_delta)
1546+
direction, is_new_better = stats.is_new_better(c_delta, cles, mann_pvalue, lower_is_better)
1547+
15471548
if cles_obj:
15481549
cles_obj["effect_size"] = effect_size
15491550
cles_obj["cles_direction"] = direction

0 commit comments

Comments
 (0)