Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 10 additions & 16 deletions tests/perf/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,27 +59,21 @@ def test_interpret_cles():
mock_base = [2.74]
mock_new = [2.65]
mock_mann_stat = 0.1
mock_mann_pvalue = 0.2
interpretation = ("",)
lower_is_better = (False,)
mock_delta = 0.2

(
cles_obj,
cles,
is_significant,
cles_explanation,
mann_whitney_u_cles,
cliffs_delta_cles,
) = interpret_cles(
mock_mann_stat,
mock_mann_pvalue,
mock_new,
mock_base,
mock_delta,
interpretation,
lower_is_better,
(cles_obj, cles, cles_explanation, mann_whitney_u_cles, cliffs_delta_cles, is_base_greater) = (
interpret_cles(
mock_mann_stat,
mock_new,
mock_base,
mock_delta,
interpretation,
lower_is_better,
)
)

assert cles_obj["cles"] == 0.1
assert cles == 0.1
assert is_base_greater is False
8 changes: 4 additions & 4 deletions tests/webapp/api/test_perfcompare_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1181,8 +1181,8 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
"is_fit_good": True,
"is_improvement": None,
"is_regression": None,
"is_meaningful": None,
"is_new_better": False,
"is_meaningful": True,
"is_new_better": None,
"base_parent_signature": response["base_parent_signature"],
"new_parent_signature": response["new_parent_signature"],
"base_signature_id": response["base_signature_id"],
Expand All @@ -1191,7 +1191,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
"cles": None,
"cliffs_delta": -1.0,
"cliffs_interpretation": "large",
"direction_of_change": "worse",
"direction_of_change": "no change",
"base_standard_stats": {
"count": 1,
"max": 32.4,
Expand Down Expand Up @@ -1269,7 +1269,7 @@ def test_perfcompare_results_with_mann_witney_u_against_no_base(
"ci_high": None,
"ci_low": None,
"ci_warning": None,
"median_shift_summary": None,
"median_shift_summary": "Cannot measure shift, base mode count not equal to new mode count",
"mode_end": "36.47",
"mode_name": "Mode 1",
"mode_start": "28.33",
Expand Down
164 changes: 100 additions & 64 deletions treeherder/perf/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from scipy.stats import bootstrap, iqr, ks_2samp, mannwhitneyu

# New Stats Code
# various formulas extracted from here:
# https://colab.research.google.com/gist/padenot/2a408f0a39e269977045fc2fb265663b/end-to-end.ipynb#scrollTo=M8WY0yVIX5Ru&uniqifier=1

# p-value threshold to use throughout
PVALUE_THRESHOLD = 0.05
Expand Down Expand Up @@ -249,6 +251,18 @@ def interpret_ks_test(base, new, pvalue_threshold=PVALUE_THRESHOLD):
return None, None, None


def mann_whitney_pval_significance(mann_pvalue, pvalue_threshold=PVALUE_THRESHOLD):
    """Classify a Mann-Whitney U p-value against the significance threshold.

    Parameters:
        mann_pvalue: p-value from the Mann-Whitney U test, or None when the
            test could not produce one (interpret_mann_whitneyu coerces a
            falsy p-value to None before calling this).
        pvalue_threshold: significance cutoff; defaults to the module-wide
            PVALUE_THRESHOLD.

    Returns:
        (p_value_interpretation, is_significant): a human-readable label
        ("significant" / "not significant", or "" when the p-value is
        missing) and the matching boolean flag.
    """
    # Guard: comparing None against a float raises TypeError in Python 3,
    # and the caller legitimately passes None when the test stat was falsy.
    # Return the untouched defaults ("", False) in that case.
    if mann_pvalue is None:
        return "", False
    # The two threshold conditions are complementary, so a single if/else
    # replaces the original pair of independent ifs. A p-value exactly at
    # the threshold counts as significant (<=), matching the original.
    if mann_pvalue > pvalue_threshold:
        return "not significant", False
    return "significant", True


# Mann-Whitney U test
# Tests the null hypothesis that the distributions patch and without patch are identical.
# Null hypothesis is a statement that there is no significant difference or effect in population, calculates p-value
Expand All @@ -259,60 +273,71 @@ def interpret_mann_whitneyu(base, new, pvalue_threshold=PVALUE_THRESHOLD):
mann_stat = float(mann_stat) if mann_stat else None
mann_pvalue = float(mann_pvalue) if mann_pvalue else None
# Mann-Whitney U p-value interpretation
p_value_interpretation = None
if mann_pvalue >= pvalue_threshold:
p_value_interpretation = "not significant"
if mann_pvalue < pvalue_threshold:
p_value_interpretation = "significant"
p_value_interpretation, is_significant = mann_whitney_pval_significance(
mann_pvalue, pvalue_threshold
)

mann_whitney = {
"test_name": "Mann-Whitney U",
"stat": mann_stat,
"pvalue": mann_pvalue,
"interpretation": p_value_interpretation,
}
return mann_whitney, mann_stat, mann_pvalue


def is_new_better(delta_value, lower_is_better):
"""This method returns if the new result is better or worse (even if unsure)"""
if delta_value is None:
direction = None
is_new_better = None
is_new_better = None
if abs(delta_value) < 0.001:
direction = "no change"
elif (lower_is_better and delta_value < 0) or (not lower_is_better and delta_value > 0):
direction = "better"
is_new_better = True
else:
direction = "worse"
is_new_better = False
return direction, is_new_better


def interpret_cles_direction(cles, pvalue_threshold=PVALUE_THRESHOLD):
if cles is None:
return "CLES cannot be interpreted"
if cles >= pvalue_threshold:
return f"{cles:.0%} chance a base value > a new value"
if cles < pvalue_threshold:
return f"{1 - cles:.0%} chance a new value > base value"
return "CLES cannot be interpreted"
return mann_whitney, mann_stat, mann_pvalue, is_significant


# https://openpublishing.library.umass.edu/pare/article/1977/galley/1980/view/
def interpret_effect_size(delta):
is_effect_meaningful = False
if delta is None:
return "Effect cannot be interpreted"
return "Effect cannot be interpreted", is_effect_meaningful
if abs(delta) < 0.15:
return "negligible"
elif abs(delta) < 0.33:
return "small"
elif abs(delta) < 0.47:
return "moderate"
else:
return "large"
return "negligible", is_effect_meaningful
is_effect_meaningful = True
if abs(delta) < 0.33:
return "small", is_effect_meaningful
if abs(delta) < 0.47:
return "moderate", is_effect_meaningful
return "large", is_effect_meaningful


def interpret_cles_direction(cles):
    """Explain a Common Language Effect Size value in plain English.

    CLES is the probability that a randomly drawn base score exceeds a
    randomly drawn new score: 0.5 is a 50/50 toss-up (no change), 0.6
    means a 60% chance base > new, 0.4 means a 60% chance new > base.

    Returns:
        (explanation, is_base_greater): the English summary plus True when
        base tends to exceed new, False when new tends to exceed base, and
        None when CLES is missing or exactly 0.5 (uninterpretable).
    """
    # Missing value or a dead-even 0.5 both yield the same "cannot
    # interpret" outcome, so fold them into one guard clause.
    if cles is None or cles == 0.5:
        return "CLES cannot be interpreted", None
    if cles > 0.5:
        return f"{cles:.0%} chance a base value > a new value", True
    # Below 0.5: report the complementary probability from new's side.
    return f"{1 - cles:.0%} chance a new value > base value", False


def is_new_better(is_effect_meaningful, is_base_greater, is_significant, lower_is_better):
    """Decide whether the new revision improved or regressed performance.

    Parameters:
        is_effect_meaningful: effect size crossed the "small or larger" bar.
        is_base_greater: True when base scores tend to exceed new scores,
            False when new tends to exceed base, None when undecidable.
        is_significant: Mann-Whitney p-value cleared the threshold.
        lower_is_better: True for metrics where smaller values win.

    Returns:
        (direction, is_new_better): "improvement"/"regression"/"no change"
        and the matching True/False/None verdict.
    """
    # Without both a meaningful effect and statistical significance we
    # refuse to call a winner.
    if not (is_effect_meaningful and is_significant):
        return "no change", None
    if is_base_greater:
        # Base tends to exceed new: new wins exactly when lower is better.
        return ("improvement", True) if lower_is_better else ("regression", False)
    if is_base_greater is False:
        # New tends to exceed base: the mirror image of the branch above.
        return ("regression", False) if lower_is_better else ("improvement", True)
    # is_base_greater is None: CLES was uninterpretable, so no verdict.
    return "no change", None


def interpret_performance_direction(ci_low, ci_high, lower_is_better):
Expand Down Expand Up @@ -347,13 +372,11 @@ def interpret_performance_direction(ci_low, ci_high, lower_is_better):
# Common Language Effect Size, and its interpretation in english
def interpret_cles(
mann_stat,
mann_pvalue,
new_revision,
base_revision,
delta,
interpretation,
lower_is_better,
pvalue_threshold=PVALUE_THRESHOLD,
):
try:
cles = None
Expand All @@ -373,9 +396,8 @@ def interpret_cles(
else:
mann_whitney_u_cles = ""

is_significant = False if mann_pvalue > pvalue_threshold else True
# Generate CLES explanation
cles_explanation = interpret_cles_direction(cles) if cles else ""
cles_explanation, is_base_greater = interpret_cles_direction(cles)
# Cliff's delta CLES
cliffs_delta_cles = f"Cliff's Delta: {delta:.2f} → {interpretation}" if delta else ""

Expand All @@ -389,10 +411,10 @@ def interpret_cles(
return (
cles_obj,
cles,
is_significant,
cles_explanation,
mann_whitney_u_cles,
cliffs_delta_cles,
is_base_greater,
)
except Exception:
return None, None, None, None, None, None
Expand Down Expand Up @@ -455,23 +477,36 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better):
is_improvement = None
performance_intepretation = None
modes = []
if base_mode_count == new_mode_count:
base_intervals, base_peak_xs = find_mode_interval(x_base, y_base, base_peak_locs)
new_intervals, new_peak_xs = find_mode_interval(x_new, y_new, new_peak_locs)
per_mode_new = split_per_mode(new_data, new_intervals)
per_mode_base = split_per_mode(base_data, base_intervals)

for i, interval in enumerate(base_intervals):
tup = interval
if len(tup) != 2:
return None, None, None, None, None, None

start, end = tup
shift = 0
ci_low = 0
ci_high = 0
median_shift_summary = None
mode_name = f"Mode {i + 1}"
base_intervals, base_peak_xs = find_mode_interval(x_base, y_base, base_peak_locs)
new_intervals, new_peak_xs = find_mode_interval(x_new, y_new, new_peak_locs)
for i, interval in enumerate(base_intervals):
if len(interval) != 2:
return None, None, None, None, None, None

start, end = interval
shift = 0
ci_low = 0
ci_high = 0
median_shift_summary = (
"Cannot measure shift, base mode count not equal to new mode count"
)
shift = None
mode_name = f"Mode {i + 1}"
mode_info = {
"mode_name": mode_name,
"mode_start": f"{start:.2f}" if start else None,
"mode_end": f"{end:.2f}" if end else None,
"median_shift_summary": median_shift_summary,
"ci_low": ci_low,
"ci_high": ci_high,
"shift": shift,
"shift_summary": performance_intepretation,
"ci_warning": ci_warning,
}

if base_mode_count == new_mode_count:
per_mode_new = split_per_mode(new_data, new_intervals)
per_mode_base = split_per_mode(base_data, base_intervals)

try:
ref_vals = [val for val, mode in zip(base_data, per_mode_base) if mode == i]
Expand Down Expand Up @@ -509,7 +544,8 @@ def interpret_silverman_kde(base_data, new_data, lower_is_better):
"shift_summary": performance_intepretation,
"ci_warning": ci_warning,
}
modes.append(mode_info)

modes.append(mode_info)

silverman_kde = {
"bandwidth": "Silverman",
Expand Down
35 changes: 19 additions & 16 deletions treeherder/webapp/api/performance_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1501,9 +1501,12 @@ def _process_stats(
# Mann-Whitney U test, two sided because we're never quite sure what of
# the intent of the patch, as things stand
# Tests the null hypothesis that the distributions of the two are identical
mann_whitney, mann_stat, mann_pvalue = stats.interpret_mann_whitneyu(
base_rev_data, new_rev_data, pvalue_threshold
)
(
mann_whitney,
mann_stat,
mann_pvalue,
is_significant,
) = stats.interpret_mann_whitneyu(base_rev_data, new_rev_data, pvalue_threshold)
delta_value = new_median - base_median
delta_percentage = (delta_value / base_median * 100) if base_median != 0 else 0

Expand All @@ -1520,32 +1523,32 @@ def _process_stats(
else:
c_delta, _ = cliffs_delta(base_rev_data, new_rev_data)

cliffs_interpretation = stats.interpret_effect_size(c_delta)
direction, is_new_better = stats.is_new_better(delta_value, lower_is_better)

# Interpret effect size
effect_size = stats.interpret_effect_size(c_delta)
# interpret effect size
cliffs_interpretation, is_effect_meaningful = stats.interpret_effect_size(c_delta)

# returns CLES, direction
# returns CLES
(
cles_obj,
cles,
is_significant,
cles_explanation,
mann_whitney_u_cles,
cliffs_delta_cles,
is_base_greater,
) = stats.interpret_cles(
mann_stat,
mann_pvalue,
new_rev_data,
base_rev_data,
cliffs_interpretation,
c_delta,
cliffs_interpretation,
lower_is_better,
pvalue_threshold,
)

direction, is_new_better = stats.is_new_better(
is_effect_meaningful, is_base_greater, is_significant, lower_is_better
)

if cles_obj:
cles_obj["effect_size"] = effect_size
cles_obj["effect_size"] = cliffs_interpretation
cles_obj["cles_direction"] = direction

# Compute KDE with Silverman bandwidth, and warn if multimodal.
Expand Down Expand Up @@ -1615,13 +1618,13 @@ def _process_stats(
# short form summary based on former tests shapiro, silverman, etc...
"is_fit_good": is_fit_good,
"is_new_better": is_new_better,
"is_meaningful": is_significant,
"is_meaningful": is_effect_meaningful,
"lower_is_better": lower_is_better,
"is_regression": is_regression,
"is_improvement": is_improvement,
"more_runs_are_needed": more_runs_are_needed,
"performance_intepretation": performance_intepretation,
"direction_of_change": direction, # 'neutral', 'better', or 'worse'
"direction_of_change": direction, # 'no change', 'improvement', or 'regression'
}

return stats_data
Expand Down