
Commit ab32316

refactor: update ab_test.py to use metrics instead of emf
Since we store metrics for each test directly, we don't need to parse EMF logs from test-report.json. Replace the metrics-gathering logic in ab_test.py to look at all `metrics.json` files instead.

Signed-off-by: Egor Lazarchuk <yegorlz@amazon.co.uk>
1 parent a305f36 commit ab32316
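
For reference, a minimal sketch of the per-test `metrics.json` layout the new loader appears to expect, inferred from the diff below (the `dimensions`/`metrics`/`unit`/`values` keys come from the code; the concrete metric and dimension names are illustrative only):

```python
# Hypothetical example of a single metrics.json file as consumed by the new
# load_data_series(): "dimensions" identifies one test configuration, and
# "metrics" maps each metric name to its unit and list of samples.
# Metric and dimension names here are made up, not taken from the commit.
import json

example = {
    "dimensions": {"instance": "m5d.metal", "performance_test": "test_block_performance"},
    "metrics": {
        "throughput": {"unit": "Megabits/Second", "values": [7344.0, 7401.2, 7388.5]},
        # Anything containing "cpu_utilization" is skipped by the loader as volatile.
        "cpu_utilization_vcpus_total": {"unit": "Percent", "values": [99.7, 99.8]},
    },
}
print(json.dumps(example, indent=2))
```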


tools/ab_test.py

Lines changed: 48 additions & 120 deletions
@@ -6,20 +6,17 @@
 
 The script takes two git revisions and a pytest integration test. It utilizes
 our integration test frameworks --binary-dir parameter to execute the given
-test using binaries compiled from each revision, and captures the EMF logs
-output. It the searches for list-valued properties/metrics in the EMF, and runs a
-regression test comparing these lists for the two runs.
+test using binaries compiled from each revision, and runs a regression test
+comparing the resulting metrics between runs.
 
 It performs the A/B-test as follows:
-For each EMF log message output, look at the dimensions. The script assumes that
-dimensions are unique across all log messages output from a single test run. In
-each log message, then look for all properties that have lists assigned to them,
-and collect them. For both runs of the test, the set of distinct dimensions
-collected this way must be the same. Then, we match corresponding dimensions
-between the two runs, performing statistical regression test across all the list-
-valued properties collected.
+For both A and B runs, collect all `metrics.json` files and read all dimensions
+from them. The script assumes all dimensions are unique within a single run and
+that both A and B runs produce the same dimensions. After collection, perform a
+statistical regression test across all the list-valued properties collected.
 """
 
+import glob
 import argparse
 import json
 import os
@@ -36,7 +33,6 @@
 from framework.ab_test import binary_ab_test, check_regression
 from framework.properties import global_props
 from host_tools.metrics import (
-    emit_raw_emf,
     format_with_reduced_unit,
     get_metrics_logger,
 )
@@ -69,106 +65,37 @@ def is_ignored(dimensions) -> bool:
     return False
 
 
-def extract_dimensions(emf):
-    """Extracts the cloudwatch dimensions from an EMF log message"""
-    if not emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"]:
-        # Skipped tests emit a duration metric, but have no dimensions set
-        return {}
+def load_data_series(data_path: Path):
+    """Recursively collects `metrics.json` files in the provided path"""
+    data = {}
+    for name in glob.glob(f"{data_path}/**/metrics.json", recursive=True):
+        with open(name, encoding="utf-8") as f:
+            j = json.load(f)
 
-    dimension_list = [
-        dim
-        for dimensions in emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"]
-        for dim in dimensions
-    ]
-    return {key: emf[key] for key in emf if key in dimension_list}
+        metrics = j["metrics"]
+        dimensions = frozenset(j["dimensions"].items())
 
+        data[dimensions] = {}
+        for m in metrics:
+            # Ignore certain metrics as we know them to be volatile
+            if "cpu_utilization" in m:
+                continue
+            mm = metrics[m]
+            unit = mm["unit"]
+            values = mm["values"]
+            data[dimensions][m] = (values, unit)
 
-def process_log_entry(emf: dict):
-    """Parses the given EMF log entry
-
-    Returns the entries dimensions and its list-valued properties/metrics, together with their units
-    """
-    result = {
-        key: (value, find_unit(emf, key))
-        for key, value in emf.items()
-        if (
-            "fc_metrics" not in key
-            and "cpu_utilization" not in key
-            and isinstance(value, list)
-        )
-    }
-    # Since we don't consider metrics having fc_metrics in key
-    # result could be empty so, return empty dimensions as well
-    if not result:
-        return {}, {}
-
-    return extract_dimensions(emf), result
-
-
-def find_unit(emf: dict, metric: str):
-    """Determines the unit of the given metric"""
-    metrics = {
-        y["Name"]: y["Unit"] for y in emf["_aws"]["CloudWatchMetrics"][0]["Metrics"]
-    }
-    return metrics.get(metric, "None")
-
-
-def load_data_series(report_path: Path, tag=None, *, reemit: bool = False):
-    """Loads the data series relevant for A/B-testing from test_results/test-report.json
-    into a dictionary mapping each message's cloudwatch dimensions to a dictionary of
-    its list-valued properties/metrics.
-
-    If `reemit` is True, it also reemits all EMF logs to a local EMF agent,
-    overwriting the attached "git_commit_id" field with the given revision."""
-    # Dictionary mapping EMF dimensions to A/B-testable metrics/properties
-    processed_emf = {}
-
-    report = json.loads(report_path.read_text("UTF-8"))
-    for test in report["tests"]:
-        for line in test["teardown"]["stdout"].splitlines():
-            # Only look at EMF log messages. If we ever have other stdout that starts with braces,
-            # we will need to rethink this heuristic.
-            if line.startswith("{"):
-                emf = json.loads(line)
-
-                if reemit:
-                    assert tag is not None
-
-                    emf["git_commit_id"] = str(tag)
-                    emit_raw_emf(emf)
-
-                dimensions, result = process_log_entry(emf)
-
-                if not dimensions:
-                    continue
-
-                dimension_set = frozenset(dimensions.items())
-
-                if dimension_set not in processed_emf:
-                    processed_emf[dimension_set] = result
-                else:
-                    # If there are many data points for a metric, they will be split across
-                    # multiple EMF log messages. We need to reassemble :(
-                    assert (
-                        processed_emf[dimension_set].keys() == result.keys()
-                    ), f"Found incompatible metrics associated with dimension set {dimension_set}: {processed_emf[dimension_set].keys()} in one EMF message, but {result.keys()} in another."
-
-                    for metric, (values, unit) in processed_emf[dimension_set].items():
-                        assert result[metric][1] == unit
-
-                        values.extend(result[metric][0])
-
-    return processed_emf
+    return data
 
 
-def uninteresting_dimensions(processed_emf):
+def uninteresting_dimensions(data):
     """
-    Computes the set of cloudwatch dimensions that only ever take on a
+    Computes the set of dimensions that only ever take on a
     single value across the entire dataset.
     """
     values_per_dimension = defaultdict(set)
 
-    for dimension_set in processed_emf:
+    for dimension_set in data:
         for dimension, value in dimension_set:
             values_per_dimension[dimension].add(value)
 
@@ -189,7 +116,8 @@ def collect_data(tag: str, binary_dir: Path, pytest_opts: str):
     binary_dir = binary_dir.resolve()
 
     print(f"Collecting samples with {binary_dir}")
-    test_report_path = f"test_results/{tag}/test-report.json"
+    test_path = f"test_results/{tag}"
+    test_report_path = f"{test_path}/test-report.json"
     subprocess.run(
         f"./tools/test.sh --binary-dir={binary_dir} {pytest_opts} -m '' --json-report-file=../{test_report_path}",
         env=os.environ
@@ -201,12 +129,12 @@ def collect_data(tag: str, binary_dir: Path, pytest_opts: str):
         shell=True,
     )
 
-    return load_data_series(Path(test_report_path), binary_dir, reemit=True)
+    return load_data_series(Path(test_path))
 
 
 def analyze_data(
-    processed_emf_a,
-    processed_emf_b,
+    data_a,
+    data_b,
     p_thresh,
     strength_abs_thresh,
     noise_threshold,
@@ -219,8 +147,8 @@ def analyze_data(
 
     Returns a mapping of dimensions and properties/metrics to the result of their regression test.
     """
-    assert set(processed_emf_a.keys()) == set(
-        processed_emf_b.keys()
+    assert set(data_a.keys()) == set(
+        data_b.keys()
     ), "A and B run produced incomparable data. This is a bug in the test!"
 
     results = {}
@@ -230,9 +158,9 @@ def analyze_data(
     for prop_name, prop_val in global_props.__dict__.items():
        metrics_logger.set_property(prop_name, prop_val)
 
-    for dimension_set in processed_emf_a:
-        metrics_a = processed_emf_a[dimension_set]
-        metrics_b = processed_emf_b[dimension_set]
+    for dimension_set in data_a:
+        metrics_a = data_a[dimension_set]
+        metrics_b = data_b[dimension_set]
 
         assert set(metrics_a.keys()) == set(
             metrics_b.keys()
@@ -296,7 +224,7 @@ def analyze_data(
 
         print(f"Doing A/B-test for dimensions {dimension_set} and property {metric}")
 
-        values_a = processed_emf_a[dimension_set][metric][0]
+        values_a = data_a[dimension_set][metric][0]
         baseline_mean = statistics.mean(values_a)
 
         relative_changes_by_metric[metric].append(result.statistic / baseline_mean)
@@ -309,7 +237,7 @@ def analyze_data(
     )
 
     messages = []
-    do_not_print_list = uninteresting_dimensions(processed_emf_a)
+    do_not_print_list = uninteresting_dimensions(data_a)
     for dimension_set, metric, result, unit in failures:
         # Sanity check as described above
         if abs(statistics.mean(relative_changes_by_metric[metric])) <= noise_threshold:
@@ -321,8 +249,8 @@ def analyze_data(
 
         # The significant data points themselves are above the noise threshold
         if abs(statistics.mean(relative_changes_significant[metric])) > noise_threshold:
-            old_mean = statistics.mean(processed_emf_a[dimension_set][metric][0])
-            new_mean = statistics.mean(processed_emf_b[dimension_set][metric][0])
+            old_mean = statistics.mean(data_a[dimension_set][metric][0])
+            new_mean = statistics.mean(data_b[dimension_set][metric][0])
 
             msg = (
                 f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
@@ -393,13 +321,13 @@ def ab_performance_test(
        help="Analyze the results of two manually ran tests based on their test-report.json files",
    )
    analyze_parser.add_argument(
-        "report_a",
-        help="The path to the test-report.json file of the baseline run",
+        "path_a",
+        help="The path to the directory with the A run results",
        type=Path,
    )
    analyze_parser.add_argument(
-        "report_b",
-        help="The path to the test-report.json file of the run whose performance we want to compare against report_a",
+        "path_b",
+        help="The path to the directory with the B run results",
        type=Path,
    )
    parser.add_argument(
@@ -432,8 +360,8 @@ def ab_performance_test(
             args.noise_threshold,
         )
     else:
-        data_a = load_data_series(args.report_a)
-        data_b = load_data_series(args.report_b)
+        data_a = load_data_series(args.path_a)
+        data_b = load_data_series(args.path_b)
 
         analyze_data(
             data_a,
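
A minimal usage sketch (not part of the commit) of how the new loader's output could be inspected after two manual runs; the directory names are placeholders, and it assumes `load_data_series` can be imported from `tools/ab_test.py`:

```python
# Hypothetical sketch: load the metrics collected from two manual runs and
# look at what would be fed into analyze_data(). Paths are placeholders.
from pathlib import Path

from ab_test import load_data_series  # assumes tools/ is on PYTHONPATH

data_a = load_data_series(Path("test_results/run_a"))
data_b = load_data_series(Path("test_results/run_b"))

# Both runs are expected to produce the same dimension sets.
assert set(data_a) == set(data_b), "A and B runs produced different dimensions"

for dimension_set, metrics in data_a.items():
    for metric, (values, unit) in metrics.items():
        print(dict(dimension_set), metric, unit, len(values))
```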
