Skip to content

Commit 0ebb56c

Browse files
committed
refactor is_new_better along with interpret_silverman_kde
1 parent 5bbe303 commit 0ebb56c

File tree

4 files changed

+92
-119
lines changed

4 files changed

+92
-119
lines changed

tests/perf/test_stats.py

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -59,27 +59,21 @@ def test_interpret_cles():
5959
mock_base = [2.74]
6060
mock_new = [2.65]
6161
mock_mann_stat = 0.1
62-
mock_mann_pvalue = 0.2
6362
interpretation = ("",)
6463
lower_is_better = (False,)
6564
mock_delta = 0.2
6665

67-
(
68-
cles_obj,
69-
cles,
70-
is_significant,
71-
cles_explanation,
72-
mann_whitney_u_cles,
73-
cliffs_delta_cles,
74-
) = interpret_cles(
75-
mock_mann_stat,
76-
mock_mann_pvalue,
77-
mock_new,
78-
mock_base,
79-
mock_delta,
80-
interpretation,
81-
lower_is_better,
66+
(cles_obj, cles, cles_explanation, mann_whitney_u_cles, cliffs_delta_cles, is_base_greater) = (
67+
interpret_cles(
68+
mock_mann_stat,
69+
mock_new,
70+
mock_base,
71+
mock_delta,
72+
interpretation,
73+
lower_is_better,
74+
)
8275
)
8376

8477
assert cles_obj["cles"] == 0.1
8578
assert cles == 0.1
79+
assert is_base_greater is None

tests/webapp/api/test_perfcompare_api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,7 +1181,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
11811181
"is_fit_good": True,
11821182
"is_improvement": None,
11831183
"is_regression": None,
1184-
"is_meaningful": None,
1184+
"is_meaningful": True,
11851185
"is_new_better": None,
11861186
"base_parent_signature": response["base_parent_signature"],
11871187
"new_parent_signature": response["new_parent_signature"],
@@ -1269,7 +1269,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
12691269
"ci_high": None,
12701270
"ci_low": None,
12711271
"ci_warning": None,
1272-
"median_shift_summary": None,
1272+
"median_shift_summary": "Cannot measure shift, base mode count not equal to new mode count",
12731273
"mode_end": "36.47",
12741274
"mode_name": "Mode 1",
12751275
"mode_start": "28.33",

treeherder/perf/stats.py

Lines changed: 65 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -251,11 +251,14 @@ def interpret_ks_test(base, new, pvalue_threshold=PVALUE_THRESHOLD):
251251

252252
def mann_whitney_pval_significance(mann_pvalue, pvalue_threshold=PVALUE_THRESHOLD):
253253
p_value_interpretation = None
254+
is_significant = False
255+
254256
if mann_pvalue > pvalue_threshold:
255257
p_value_interpretation = "not significant"
256258
if mann_pvalue <= pvalue_threshold:
259+
is_significant = True
257260
p_value_interpretation = "significant"
258-
return p_value_interpretation
261+
return p_value_interpretation, is_significant
259262

260263

261264
# Mann-Whitney U test
@@ -268,71 +271,61 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
268271
mann_stat = float(mann_stat) if mann_stat else None
269272
mann_pvalue = float(mann_pvalue) if mann_pvalue else None
270273
# Mann-Whitney U p-value interpretation
271-
p_value_interpretation = mann_whitney_pval_significance(mann_pvalue, pvalue_threshold)
274+
p_value_interpretation, is_significant = mann_whitney_pval_significance(
275+
mann_pvalue, pvalue_threshold
276+
)
272277

273278
mann_whitney = {
274279
"test_name": "Mann-Whitney U",
275280
"stat": mann_stat,
276281
"pvalue": mann_pvalue,
277282
"interpretation": p_value_interpretation,
278283
}
279-
return mann_whitney, mann_stat, mann_pvalue
284+
return mann_whitney, mann_stat, mann_pvalue, is_significant
280285

281286

282287
# https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/
283288
def interpret_effect_size(delta):
289+
is_effect_meaningful = False
284290
if delta is None:
285-
return "Effect cannot be interpreted"
291+
return "Effect cannot be interpreted", is_effect_meaningful
286292
if abs(delta) < 0.15:
287-
return "negligible"
288-
elif abs(delta) < 0.33:
289-
return "small"
290-
elif abs(delta) < 0.47:
291-
return "moderate"
293+
return "negligible", is_effect_meaningful
294+
if abs(delta) < 0.33:
295+
is_effect_meaningful = True
296+
return "small", is_effect_meaningful
297+
if abs(delta) < 0.47:
298+
is_effect_meaningful = True
299+
return "moderate", is_effect_meaningful
292300
else:
293-
return "large"
301+
is_effect_meaningful = True
302+
return "large", is_effect_meaningful
294303

295304

296305
def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD):
297-
greater_rev = None
306+
is_base_greater = None
298307
if cles is None:
299-
return "CLES cannot be interpreted", greater_rev
300-
if cles > pvalue_threshold:
301-
greater_rev = "base"
302-
return f"{cles:.0%} chance a base value > a new value", greater_rev
303-
if cles < pvalue_threshold:
304-
greater_rev = "new"
305-
return f"{1 - cles:.0%} chance a new value > base value", greater_rev
306-
return "CLES cannot be interpreted", greater_rev
307-
308-
309-
def is_new_better(c_delta, cles, mann_pvalue, lower_is_better, pvalue_threshold=PVALUE_THRESHOLD):
310-
"""This method takes in CLES to measure if meaningful, Mann Whitney p-val for significance as well as Cliff's Delta for change"""
308+
return "CLES cannot be interpreted", is_base_greater
309+
elif cles > pvalue_threshold:
310+
is_base_greater = True
311+
return f"{cles:.0%} chance a base value > a new value", is_base_greater
312+
elif cles < pvalue_threshold:
313+
is_base_greater = False
314+
return f"{1 - cles:.0%} chance a new value > base value", is_base_greater
315+
return "CLES cannot be interpreted", is_base_greater
316+
317+
318+
def is_new_better(is_effect_meaningful, is_base_greater, is_significant, lower_is_better):
311319
# Possibility Base > than New with a small amount or more significance
312-
cles_interpretation, greater_rev = interpret_cles_direction(
313-
cles, pvalue_threshold=PVALUE_THRESHOLD
314-
)
315-
effect_size = interpret_effect_size(c_delta)
316-
effect_value_significance = ["small", "moderate", "large"]
317-
p_value_interpretation = mann_whitney_pval_significance(mann_pvalue, pvalue_threshold)
318-
319-
if (
320-
greater_rev == "base"
321-
and any(effect_size in effect_value_significance)
322-
and p_value_interpretation == "significant"
323-
):
320+
if is_base_greater and is_effect_meaningful and is_significant:
324321
if lower_is_better:
325322
is_new_better = True
326323
direction = "improvement"
327324
else:
328325
is_new_better = False
329326
direction = "regression"
330327
# Possibility New > Base with a small amount or more significance
331-
if (
332-
greater_rev == "new"
333-
and any(effect_size in effect_value_significance)
334-
and p_value_interpretation == "significant"
335-
):
328+
elif (is_base_greater is False) and is_effect_meaningful and is_significant:
336329
if lower_is_better:
337330
is_new_better = False
338331
direction = "regression"
@@ -377,13 +370,11 @@ def interpret_performance_direction(ci_low, ci_high, lower_is_better):
377370
# Common Language Effect Size, and its interpretation in english
378371
def interpret_cles(
379372
mann_stat,
380-
mann_pvalue,
381373
new_revision,
382374
base_revision,
383375
delta,
384376
interpretation,
385377
lower_is_better,
386-
pvalue_threshold=PVALUE_THRESHOLD,
387378
):
388379
try:
389380
cles = None
@@ -403,9 +394,8 @@ def interpret_cles(
403394
else:
404395
mann_whitney_u_cles = ""
405396

406-
is_significant = False if mann_pvalue > pvalue_threshold else True
407397
# Generate CLES explanation
408-
cles_explanation = interpret_cles_direction(cles) if cles else ""
398+
cles_explanation, is_base_greater = interpret_cles_direction(cles) if cles else "", None
409399
# Cliff's delta CLES
410400
cliffs_delta_cles = f"Cliff's Delta: {delta:.2f}{interpretation}" if delta else ""
411401

@@ -419,10 +409,10 @@ def interpret_cles(
419409
return (
420410
cles_obj,
421411
cles,
422-
is_significant,
423412
cles_explanation,
424413
mann_whitney_u_cles,
425414
cliffs_delta_cles,
415+
is_base_greater,
426416
)
427417
except Exception:
428418
return None, None, None, None, None, None
@@ -487,22 +477,35 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better):
487477
modes = []
488478
base_intervals, base_peak_xs = find_mode_interval(x_base, y_base, base_peak_locs)
489479
new_intervals, new_peak_xs = find_mode_interval(x_new, y_new, new_peak_locs)
490-
491-
if base_mode_count == new_mode_count:
492-
per_mode_new = split_per_mode(new_data, new_intervals)
493-
per_mode_base = split_per_mode(base_data, base_intervals)
494-
495-
for i, interval in enumerate(base_intervals):
496-
tup = interval
497-
if len(tup) != 2:
498-
return None, None, None, None, None, None
499-
500-
start, end = tup
501-
shift = 0
502-
ci_low = 0
503-
ci_high = 0
504-
median_shift_summary = None
505-
mode_name = f"Mode {i + 1}"
480+
for i, interval in enumerate(base_intervals):
481+
tup = interval
482+
if len(tup) != 2:
483+
return None, None, None, None, None, None
484+
485+
start, end = tup
486+
shift = 0
487+
ci_low = 0
488+
ci_high = 0
489+
median_shift_summary = (
490+
"Cannot measure shift, base mode count not equal to new mode count"
491+
)
492+
shift = None
493+
mode_name = f"Mode {i + 1}"
494+
mode_info = {
495+
"mode_name": mode_name,
496+
"mode_start": f"{start:.2f}" if start else None,
497+
"mode_end": f"{end:.2f}" if end else None,
498+
"median_shift_summary": median_shift_summary,
499+
"ci_low": ci_low,
500+
"ci_high": ci_high,
501+
"shift": shift,
502+
"shift_summary": performance_intepretation,
503+
"ci_warning": ci_warning,
504+
}
505+
506+
if base_mode_count == new_mode_count:
507+
per_mode_new = split_per_mode(new_data, new_intervals)
508+
per_mode_base = split_per_mode(base_data, base_intervals)
506509

507510
try:
508511
ref_vals = [val for val, mode in zip(base_data, per_mode_base) if mode == i]
@@ -540,33 +543,7 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better):
540543
"shift_summary": performance_intepretation,
541544
"ci_warning": ci_warning,
542545
}
543-
modes.append(mode_info)
544-
else:
545-
for i, interval in enumerate(base_intervals):
546-
tup = interval
547-
if len(tup) != 2:
548-
return None, None, None, None, None, None
549-
550-
start, end = tup
551-
shift = 0
552-
ci_low = 0
553-
ci_high = 0
554-
median_shift_summary = (
555-
"Cannot measure shift, base mode count not equal to new mode count."
556-
)
557-
shift = None
558-
mode_name = f"Mode {i + 1}"
559-
mode_info = {
560-
"mode_name": mode_name,
561-
"mode_start": f"{start:.2f}" if start else None,
562-
"mode_end": f"{end:.2f}" if end else None,
563-
"median_shift_summary": median_shift_summary,
564-
"ci_low": ci_low,
565-
"ci_high": ci_high,
566-
"shift": shift,
567-
"shift_summary": performance_intepretation,
568-
"ci_warning": ci_warning,
569-
}
546+
570547
modes.append(mode_info)
571548

572549
silverman_kde = {

treeherder/webapp/api/performance_data.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1501,9 +1501,12 @@ def _process_stats(
15011501
# Mann-Whitney U test, two sided because we're never quite sure what of
15021502
# the intent of the patch, as things stand
15031503
# Tests the null hypothesis that the distributions of the two are identical
1504-
mann_whitney, mann_stat, mann_pvalue = stats.interpret_mann_whitneyu(
1505-
base_rev_data, new_rev_data, pvalue_threshold
1506-
)
1504+
(
1505+
mann_whitney,
1506+
mann_stat,
1507+
mann_pvalue,
1508+
is_significant,
1509+
) = stats.interpret_mann_whitneyu(base_rev_data, new_rev_data, pvalue_threshold)
15071510
delta_value = new_median - base_median
15081511
delta_percentage = (delta_value / base_median * 100) if base_median != 0 else 0
15091512

@@ -1520,33 +1523,32 @@ def _process_stats(
15201523
else:
15211524
c_delta, _ = cliffs_delta(base_rev_data, new_rev_data)
15221525

1523-
cliffs_interpretation = stats.interpret_effect_size(c_delta)
1526+
# interpret effect size
1527+
cliffs_interpretation, is_effect_meaningful = stats.interpret_effect_size(c_delta)
15241528

15251529
# returns CLES
15261530
(
15271531
cles_obj,
15281532
cles,
1529-
is_significant,
15301533
cles_explanation,
15311534
mann_whitney_u_cles,
15321535
cliffs_delta_cles,
1536+
is_base_greater,
15331537
) = stats.interpret_cles(
15341538
mann_stat,
1535-
mann_pvalue,
15361539
new_rev_data,
15371540
base_rev_data,
1538-
cliffs_interpretation,
15391541
c_delta,
1542+
cliffs_interpretation,
15401543
lower_is_better,
1541-
pvalue_threshold,
15421544
)
15431545

1544-
# Interpret effect size
1545-
effect_size = stats.interpret_effect_size(c_delta)
1546-
direction, is_new_better = stats.is_new_better(c_delta, cles, mann_pvalue, lower_is_better)
1546+
direction, is_new_better = stats.is_new_better(
1547+
is_effect_meaningful, is_base_greater, is_significant, lower_is_better
1548+
)
15471549

15481550
if cles_obj:
1549-
cles_obj["effect_size"] = effect_size
1551+
cles_obj["effect_size"] = cliffs_interpretation
15501552
cles_obj["cles_direction"] = direction
15511553

15521554
# Compute KDE with Silverman bandwidth, and warn if multimodal.
@@ -1616,7 +1618,7 @@ def _process_stats(
16161618
# short form summary based on former tests shapiro, silverman, etc...
16171619
"is_fit_good": is_fit_good,
16181620
"is_new_better": is_new_better,
1619-
"is_meaningful": is_significant,
1621+
"is_meaningful": is_effect_meaningful,
16201622
"lower_is_better": lower_is_better,
16211623
"is_regression": is_regression,
16221624
"is_improvement": is_improvement,

0 commit comments

Comments
 (0)