Skip to content

Commit 09f003e

Browse files
authored
PCF 634 update is_new_better logic (#9075)
Fixes the logic for is_new_better to take the significance of the result into consideration.
1 parent 75d47a7 commit 09f003e

File tree

4 files changed

+133
-100
lines changed

4 files changed

+133
-100
lines changed

tests/perf/test_stats.py

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -59,27 +59,21 @@ def test_interpret_cles():
5959
mock_base = [2.74]
6060
mock_new = [2.65]
6161
mock_mann_stat = 0.1
62-
mock_mann_pvalue = 0.2
6362
interpretation = ("",)
6463
lower_is_better = (False,)
6564
mock_delta = 0.2
6665

67-
(
68-
cles_obj,
69-
cles,
70-
is_significant,
71-
cles_explanation,
72-
mann_whitney_u_cles,
73-
cliffs_delta_cles,
74-
) = interpret_cles(
75-
mock_mann_stat,
76-
mock_mann_pvalue,
77-
mock_new,
78-
mock_base,
79-
mock_delta,
80-
interpretation,
81-
lower_is_better,
66+
(cles_obj, cles, cles_explanation, mann_whitney_u_cles, cliffs_delta_cles, is_base_greater) = (
67+
interpret_cles(
68+
mock_mann_stat,
69+
mock_new,
70+
mock_base,
71+
mock_delta,
72+
interpretation,
73+
lower_is_better,
74+
)
8275
)
8376

8477
assert cles_obj["cles"] == 0.1
8578
assert cles == 0.1
79+
assert is_base_greater is False

tests/webapp/api/test_perfcompare_api.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,8 +1181,8 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
11811181
"is_fit_good": True,
11821182
"is_improvement": None,
11831183
"is_regression": None,
1184-
"is_meaningful": None,
1185-
"is_new_better": False,
1184+
"is_meaningful": True,
1185+
"is_new_better": None,
11861186
"base_parent_signature": response["base_parent_signature"],
11871187
"new_parent_signature": response["new_parent_signature"],
11881188
"base_signature_id": response["base_signature_id"],
@@ -1191,7 +1191,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
11911191
"cles": None,
11921192
"cliffs_delta": -1.0,
11931193
"cliffs_interpretation": "large",
1194-
"direction_of_change": "worse",
1194+
"direction_of_change": "no change",
11951195
"base_standard_stats": {
11961196
"count": 1,
11971197
"max": 32.4,
@@ -1269,7 +1269,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
12691269
"ci_high": None,
12701270
"ci_low": None,
12711271
"ci_warning": None,
1272-
"median_shift_summary": None,
1272+
"median_shift_summary": "Cannot measure shift, base mode count not equal to new mode count",
12731273
"mode_end": "36.47",
12741274
"mode_name": "Mode 1",
12751275
"mode_start": "28.33",

treeherder/perf/stats.py

Lines changed: 100 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from scipy.stats import bootstrap, iqr, ks_2samp, mannwhitneyu
99

1010
# New Stats Code
11+
# various formulas extracted from here:
12+
# https://colab.research.google.com/gist/padenot/2a408f0a39e269977045fc2fb265663b/end-to-end.ipynb#scrollTo=M8WY0yVIX5Ru&uniqifier=1
1113

1214
# p-value threshold to use throughout
1315
PVALUE_THRESHOLD = 0.05
@@ -249,6 +251,18 @@ def interpret_ks_test(base, new, pvalue_threshold=PVALUE_THRESHOLD):
249251
return None, None, None
250252

251253

254+
def mann_whitney_pval_significance(mann_pvalue, pvalue_threshold=PVALUE_THRESHOLD):
255+
p_value_interpretation = ""
256+
is_significant = False
257+
258+
if mann_pvalue > pvalue_threshold:
259+
p_value_interpretation = "not significant"
260+
if mann_pvalue <= pvalue_threshold:
261+
is_significant = True
262+
p_value_interpretation = "significant"
263+
return p_value_interpretation, is_significant
264+
265+
252266
# Mann-Whitney U test
253267
# Tests the null hypothesis that the distributions patch and without patch are identical.
254268
# Null hypothesis is a statement that there is no significant difference or effect in population, calculates p-value
@@ -259,60 +273,71 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
259273
mann_stat = float(mann_stat) if mann_stat else None
260274
mann_pvalue = float(mann_pvalue) if mann_pvalue else None
261275
# Mann-Whitney U p-value interpretation
262-
p_value_interpretation = None
263-
if mann_pvalue >= pvalue_threshold:
264-
p_value_interpretation = "not significant"
265-
if mann_pvalue < pvalue_threshold:
266-
p_value_interpretation = "significant"
276+
p_value_interpretation, is_significant = mann_whitney_pval_significance(
277+
mann_pvalue, pvalue_threshold
278+
)
267279

268280
mann_whitney = {
269281
"test_name": "Mann-Whitney U",
270282
"stat": mann_stat,
271283
"pvalue": mann_pvalue,
272284
"interpretation": p_value_interpretation,
273285
}
274-
return mann_whitney, mann_stat, mann_pvalue
275-
276-
277-
def is_new_better(delta_value, lower_is_better):
278-
"""This method returns if the new result is better or worse (even if unsure)"""
279-
if delta_value is None:
280-
direction = None
281-
is_new_better = None
282-
is_new_better = None
283-
if abs(delta_value) < 0.001:
284-
direction = "no change"
285-
elif (lower_is_better and delta_value < 0) or (not lower_is_better and delta_value > 0):
286-
direction = "better"
287-
is_new_better = True
288-
else:
289-
direction = "worse"
290-
is_new_better = False
291-
return direction, is_new_better
292-
293-
294-
def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD):
295-
if cles is None:
296-
return "CLES cannot be interpreted"
297-
if cles >= pvalue_threshold:
298-
return f"{cles:.0%} chance a base value > a new value"
299-
if cles < pvalue_threshold:
300-
return f"{1 - cles:.0%} chance a new value > base value"
301-
return "CLES cannot be interpreted"
286+
return mann_whitney, mann_stat, mann_pvalue, is_significant
302287

303288

304289
# https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/
305290
def interpret_effect_size(delta):
291+
is_effect_meaningful = False
306292
if delta is None:
307-
return "Effect cannot be interpreted"
293+
return "Effect cannot be interpreted", is_effect_meaningful
308294
if abs(delta) < 0.15:
309-
return "negligible"
310-
elif abs(delta) < 0.33:
311-
return "small"
312-
elif abs(delta) < 0.47:
313-
return "moderate"
314-
else:
315-
return "large"
295+
return "negligible", is_effect_meaningful
296+
is_effect_meaningful = True
297+
if abs(delta) < 0.33:
298+
return "small", is_effect_meaningful
299+
if abs(delta) < 0.47:
300+
return "moderate", is_effect_meaningful
301+
return "large", is_effect_meaningful
302+
303+
304+
def interpret_cles_direction(cles):
305+
# probability that a randomly selected score from one group will be greater than a randomly selected score from a second group
306+
# A CLES of 0.5 indicates a 50% chance for either outcome, 50/50 toss up, no change
307+
# A CLES of 0.6 would mean there is a 60% chance that a score Base > New
308+
# A CLES of 0.4 would mean there is a 40% chance that a score from Base > New, or a 60% chance that a score from New > Base
309+
is_base_greater = None
310+
if cles is None:
311+
return "CLES cannot be interpreted", is_base_greater
312+
elif cles > 0.5:
313+
is_base_greater = True
314+
return f"{cles:.0%} chance a base value > a new value", is_base_greater
315+
elif cles < 0.5:
316+
is_base_greater = False
317+
return f"{1 - cles:.0%} chance a new value > base value", is_base_greater
318+
return "CLES cannot be interpreted", is_base_greater
319+
320+
321+
def is_new_better(is_effect_meaningful, is_base_greater, is_significant, lower_is_better):
322+
is_new_better = None
323+
direction = "no change"
324+
# Possibility Base > than New with a small amount or more significance
325+
if is_base_greater and is_effect_meaningful and is_significant:
326+
if lower_is_better:
327+
is_new_better = True
328+
direction = "improvement"
329+
else:
330+
is_new_better = False
331+
direction = "regression"
332+
# Possibility New > Base with a small amount or more significance
333+
elif (is_base_greater is False) and is_effect_meaningful and is_significant:
334+
if lower_is_better:
335+
is_new_better = False
336+
direction = "regression"
337+
else:
338+
is_new_better = True
339+
direction = "improvement"
340+
return direction, is_new_better
316341

317342

318343
def interpret_performance_direction(ci_low, ci_high, lower_is_better):
@@ -347,13 +372,11 @@ def interpret_performance_direction(ci_low, ci_high, lower_is_better):
347372
# Common Language Effect Size, and its interpretation in english
348373
def interpret_cles(
349374
mann_stat,
350-
mann_pvalue,
351375
new_revision,
352376
base_revision,
353377
delta,
354378
interpretation,
355379
lower_is_better,
356-
pvalue_threshold=PVALUE_THRESHOLD,
357380
):
358381
try:
359382
cles = None
@@ -373,9 +396,8 @@ def interpret_cles(
373396
else:
374397
mann_whitney_u_cles = ""
375398

376-
is_significant = False if mann_pvalue > pvalue_threshold else True
377399
# Generate CLES explanation
378-
cles_explanation = interpret_cles_direction(cles) if cles else ""
400+
cles_explanation, is_base_greater = interpret_cles_direction(cles)
379401
# Cliff's delta CLES
380402
cliffs_delta_cles = f"Cliff's Delta: {delta:.2f}{interpretation}" if delta else ""
381403

@@ -389,10 +411,10 @@ def interpret_cles(
389411
return (
390412
cles_obj,
391413
cles,
392-
is_significant,
393414
cles_explanation,
394415
mann_whitney_u_cles,
395416
cliffs_delta_cles,
417+
is_base_greater,
396418
)
397419
except Exception:
398420
return None, None, None, None, None, None
@@ -455,23 +477,36 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better):
455477
is_improvement = None
456478
performance_intepretation = None
457479
modes = []
458-
if base_mode_count == new_mode_count:
459-
base_intervals, base_peak_xs = find_mode_interval(x_base, y_base, base_peak_locs)
460-
new_intervals, new_peak_xs = find_mode_interval(x_new, y_new, new_peak_locs)
461-
per_mode_new = split_per_mode(new_data, new_intervals)
462-
per_mode_base = split_per_mode(base_data, base_intervals)
463-
464-
for i, interval in enumerate(base_intervals):
465-
tup = interval
466-
if len(tup) != 2:
467-
return None, None, None, None, None, None
468-
469-
start, end = tup
470-
shift = 0
471-
ci_low = 0
472-
ci_high = 0
473-
median_shift_summary = None
474-
mode_name = f"Mode {i + 1}"
480+
base_intervals, base_peak_xs = find_mode_interval(x_base, y_base, base_peak_locs)
481+
new_intervals, new_peak_xs = find_mode_interval(x_new, y_new, new_peak_locs)
482+
for i, interval in enumerate(base_intervals):
483+
if len(interval) != 2:
484+
return None, None, None, None, None, None
485+
486+
start, end = interval
487+
shift = 0
488+
ci_low = 0
489+
ci_high = 0
490+
median_shift_summary = (
491+
"Cannot measure shift, base mode count not equal to new mode count"
492+
)
493+
shift = None
494+
mode_name = f"Mode {i + 1}"
495+
mode_info = {
496+
"mode_name": mode_name,
497+
"mode_start": f"{start:.2f}" if start else None,
498+
"mode_end": f"{end:.2f}" if end else None,
499+
"median_shift_summary": median_shift_summary,
500+
"ci_low": ci_low,
501+
"ci_high": ci_high,
502+
"shift": shift,
503+
"shift_summary": performance_intepretation,
504+
"ci_warning": ci_warning,
505+
}
506+
507+
if base_mode_count == new_mode_count:
508+
per_mode_new = split_per_mode(new_data, new_intervals)
509+
per_mode_base = split_per_mode(base_data, base_intervals)
475510

476511
try:
477512
ref_vals = [val for val, mode in zip(base_data, per_mode_base) if mode == i]
@@ -509,7 +544,8 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better):
509544
"shift_summary": performance_intepretation,
510545
"ci_warning": ci_warning,
511546
}
512-
modes.append(mode_info)
547+
548+
modes.append(mode_info)
513549

514550
silverman_kde = {
515551
"bandwidth": "Silverman",

treeherder/webapp/api/performance_data.py

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,9 +1501,12 @@ def _process_stats(
15011501
# Mann-Whitney U test, two sided because we're never quite sure what of
15021502
# the intent of the patch, as things stand
15031503
# Tests the null hypothesis that the distributions of the two are identical
1504-
mann_whitney, mann_stat, mann_pvalue = stats.interpret_mann_whitneyu(
1505-
base_rev_data, new_rev_data, pvalue_threshold
1506-
)
1504+
(
1505+
mann_whitney,
1506+
mann_stat,
1507+
mann_pvalue,
1508+
is_significant,
1509+
) = stats.interpret_mann_whitneyu(base_rev_data, new_rev_data, pvalue_threshold)
15071510
delta_value = new_median - base_median
15081511
delta_percentage = (delta_value / base_median * 100) if base_median != 0 else 0
15091512

@@ -1520,32 +1523,32 @@ def _process_stats(
15201523
else:
15211524
c_delta, _ = cliffs_delta(base_rev_data, new_rev_data)
15221525

1523-
cliffs_interpretation = stats.interpret_effect_size(c_delta)
1524-
direction, is_new_better = stats.is_new_better(delta_value, lower_is_better)
1525-
1526-
# Interpret effect size
1527-
effect_size = stats.interpret_effect_size(c_delta)
1526+
# interpret effect size
1527+
cliffs_interpretation, is_effect_meaningful = stats.interpret_effect_size(c_delta)
15281528

1529-
# returns CLES, direction
1529+
# returns CLES
15301530
(
15311531
cles_obj,
15321532
cles,
1533-
is_significant,
15341533
cles_explanation,
15351534
mann_whitney_u_cles,
15361535
cliffs_delta_cles,
1536+
is_base_greater,
15371537
) = stats.interpret_cles(
15381538
mann_stat,
1539-
mann_pvalue,
15401539
new_rev_data,
15411540
base_rev_data,
1542-
cliffs_interpretation,
15431541
c_delta,
1542+
cliffs_interpretation,
15441543
lower_is_better,
1545-
pvalue_threshold,
15461544
)
1545+
1546+
direction, is_new_better = stats.is_new_better(
1547+
is_effect_meaningful, is_base_greater, is_significant, lower_is_better
1548+
)
1549+
15471550
if cles_obj:
1548-
cles_obj["effect_size"] = effect_size
1551+
cles_obj["effect_size"] = cliffs_interpretation
15491552
cles_obj["cles_direction"] = direction
15501553

15511554
# Compute KDE with Silverman bandwidth, and warn if multimodal.
@@ -1615,13 +1618,13 @@ def _process_stats(
16151618
# short form summary based on former tests shapiro, silverman, etc...
16161619
"is_fit_good": is_fit_good,
16171620
"is_new_better": is_new_better,
1618-
"is_meaningful": is_significant,
1621+
"is_meaningful": is_effect_meaningful,
16191622
"lower_is_better": lower_is_better,
16201623
"is_regression": is_regression,
16211624
"is_improvement": is_improvement,
16221625
"more_runs_are_needed": more_runs_are_needed,
16231626
"performance_intepretation": performance_intepretation,
1624-
"direction_of_change": direction, # 'neutral', 'better', or 'worse'
1627+
"direction_of_change": direction, # 'no change', 'improvement', or 'regression'
16251628
}
16261629

16271630
return stats_data

0 commit comments

Comments
 (0)