
The script takes two git revisions and a pytest integration test. It utilizes
our integration test framework's --binary-dir parameter to execute the given
- test using binaries compiled from each revision, and captures the EMF logs
- output. It then searches for list-valued properties/metrics in the EMF, and runs a
- regression test comparing these lists for the two runs.
+ test using binaries compiled from each revision, and runs a regression test
+ comparing the resulting metrics between the two runs.

It performs the A/B-test as follows:
- For each EMF log message output, look at the dimensions. The script assumes that
- dimensions are unique across all log messages output from a single test run. In
- each log message, then look for all properties that have lists assigned to them,
- and collect them. For both runs of the test, the set of distinct dimensions
- collected this way must be the same. Then, we match corresponding dimensions
- between the two runs, performing statistical regression test across all the list-
- valued properties collected.
+ For both the A and B runs, collect all `metrics.json` files and read all dimensions
+ from them. The script assumes all dimensions are unique within a single run, and that
+ both the A and B runs produce the same set of dimensions. After collection is done,
+ perform a statistical regression test across all the list-valued properties collected.
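+ 
+ Each `metrics.json` file is expected to look roughly like the following (an
+ illustrative sketch; the actual dimension names, metric names and units depend on
+ the test that produced the file):
+ 
+     {
+         "dimensions": {"performance_test": "test_example", ...},
+         "metrics": {
+             "some_metric": {"unit": "Milliseconds", "values": [1.0, 1.2, ...]}
+         }
+     }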
"""

+ import glob
import argparse
import json
import os
from framework.ab_test import binary_ab_test, check_regression
from framework.properties import global_props
from host_tools.metrics import (
-     emit_raw_emf,
    format_with_reduced_unit,
    get_metrics_logger,
)
@@ -69,106 +65,37 @@ def is_ignored(dimensions) -> bool:
    return False


- def extract_dimensions(emf):
-     """Extracts the cloudwatch dimensions from an EMF log message"""
-     if not emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"]:
-         # Skipped tests emit a duration metric, but have no dimensions set
-         return {}
+ def load_data_series(data_path: Path):
+     """Recursively collects `metrics.json` files in the provided path"""
+     data = {}
+     for name in glob.glob(f"{data_path}/**/metrics.json", recursive=True):
+         with open(name, encoding="utf-8") as f:
+             j = json.load(f)

-     dimension_list = [
-         dim
-         for dimensions in emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"]
-         for dim in dimensions
-     ]
-     return {key: emf[key] for key in emf if key in dimension_list}
+         metrics = j["metrics"]
+         dimensions = frozenset(j["dimensions"].items())

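+         # Dimensions are assumed to be unique within a single run (see the module
+         # docstring); if the same dimension set appeared twice, the later file would
+         # simply overwrite the earlier one here.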
+         data[dimensions] = {}
+         for m in metrics:
+             # Ignore certain metrics as we know them to be volatile
+             if "cpu_utilization" in m:
+                 continue
+             mm = metrics[m]
+             unit = mm["unit"]
+             values = mm["values"]
+             data[dimensions][m] = (values, unit)

- def process_log_entry(emf: dict):
-     """Parses the given EMF log entry
- 
-     Returns the entries dimensions and its list-valued properties/metrics, together with their units
-     """
-     result = {
-         key: (value, find_unit(emf, key))
-         for key, value in emf.items()
-         if (
-             "fc_metrics" not in key
-             and "cpu_utilization" not in key
-             and isinstance(value, list)
-         )
-     }
-     # Since we don't consider metrics having fc_metrics in key
-     # result could be empty so, return empty dimensions as well
-     if not result:
-         return {}, {}
- 
-     return extract_dimensions(emf), result
- 
- 
- def find_unit(emf: dict, metric: str):
-     """Determines the unit of the given metric"""
-     metrics = {
-         y["Name"]: y["Unit"] for y in emf["_aws"]["CloudWatchMetrics"][0]["Metrics"]
-     }
-     return metrics.get(metric, "None")
- 
- 
- def load_data_series(report_path: Path, tag=None, *, reemit: bool = False):
-     """Loads the data series relevant for A/B-testing from test_results/test-report.json
-     into a dictionary mapping each message's cloudwatch dimensions to a dictionary of
-     its list-valued properties/metrics.
- 
-     If `reemit` is True, it also reemits all EMF logs to a local EMF agent,
-     overwriting the attached "git_commit_id" field with the given revision."""
-     # Dictionary mapping EMF dimensions to A/B-testable metrics/properties
-     processed_emf = {}
- 
-     report = json.loads(report_path.read_text("UTF-8"))
-     for test in report["tests"]:
-         for line in test["teardown"]["stdout"].splitlines():
-             # Only look at EMF log messages. If we ever have other stdout that starts with braces,
-             # we will need to rethink this heuristic.
-             if line.startswith("{"):
-                 emf = json.loads(line)
- 
-                 if reemit:
-                     assert tag is not None
- 
-                     emf["git_commit_id"] = str(tag)
-                     emit_raw_emf(emf)
- 
-                 dimensions, result = process_log_entry(emf)
- 
-                 if not dimensions:
-                     continue
- 
-                 dimension_set = frozenset(dimensions.items())
- 
-                 if dimension_set not in processed_emf:
-                     processed_emf[dimension_set] = result
-                 else:
-                     # If there are many data points for a metric, they will be split across
-                     # multiple EMF log messages. We need to reassemble :(
-                     assert (
-                         processed_emf[dimension_set].keys() == result.keys()
-                     ), f"Found incompatible metrics associated with dimension set {dimension_set}: {processed_emf[dimension_set].keys()} in one EMF message, but {result.keys()} in another."
- 
-                     for metric, (values, unit) in processed_emf[dimension_set].items():
-                         assert result[metric][1] == unit
- 
-                         values.extend(result[metric][0])
- 
-     return processed_emf
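+     # The result maps frozenset(dimensions.items()) -> {metric_name: (values, unit)},
+     # which is the shape that analyze_data() below consumes.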
+     return data


- def uninteresting_dimensions(processed_emf):
+ def uninteresting_dimensions(data):
    """
-     Computes the set of cloudwatch dimensions that only ever take on a
+     Computes the set of dimensions that only ever take on a
    single value across the entire dataset.
    """
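    # For example, if every dimension set in the dataset contained ("host_kernel", "6.1"),
    # that dimension would carry no information and can be omitted when printing results
    # ("host_kernel" is a purely illustrative, hypothetical dimension name).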
    values_per_dimension = defaultdict(set)

-     for dimension_set in processed_emf:
+     for dimension_set in data:
        for dimension, value in dimension_set:
            values_per_dimension[dimension].add(value)

@@ -189,7 +116,8 @@ def collect_data(tag: str, binary_dir: Path, pytest_opts: str):
    binary_dir = binary_dir.resolve()

    print(f"Collecting samples with {binary_dir}")
-     test_report_path = f"test_results/{tag}/test-report.json"
+     test_path = f"test_results/{tag}"
+     test_report_path = f"{test_path}/test-report.json"
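+     # The test run below is expected to leave per-test `metrics.json` files somewhere
+     # under `test_path`; load_data_series() then collects them recursively.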
    subprocess.run(
        f"./tools/test.sh --binary-dir={binary_dir} {pytest_opts} -m '' --json-report-file=../{test_report_path}",
        env=os.environ
@@ -201,12 +129,12 @@ def collect_data(tag: str, binary_dir: Path, pytest_opts: str):
        shell=True,
    )

-     return load_data_series(Path(test_report_path), binary_dir, reemit=True)
+     return load_data_series(Path(test_path))


def analyze_data(
-     processed_emf_a,
-     processed_emf_b,
+     data_a,
+     data_b,
    p_thresh,
    strength_abs_thresh,
    noise_threshold,
@@ -219,8 +147,8 @@ def analyze_data(

    Returns a mapping of dimensions and properties/metrics to the result of their regression test.
    """
-     assert set(processed_emf_a.keys()) == set(
-         processed_emf_b.keys()
+     assert set(data_a.keys()) == set(
+         data_b.keys()
    ), "A and B run produced incomparable data. This is a bug in the test!"

    results = {}
@@ -230,9 +158,9 @@ def analyze_data(
    for prop_name, prop_val in global_props.__dict__.items():
        metrics_logger.set_property(prop_name, prop_val)

-     for dimension_set in processed_emf_a:
-         metrics_a = processed_emf_a[dimension_set]
-         metrics_b = processed_emf_b[dimension_set]
+     for dimension_set in data_a:
+         metrics_a = data_a[dimension_set]
+         metrics_b = data_b[dimension_set]

        assert set(metrics_a.keys()) == set(
            metrics_b.keys()
@@ -296,7 +224,7 @@ def analyze_data(

        print(f"Doing A/B-test for dimensions {dimension_set} and property {metric}")

-         values_a = processed_emf_a[dimension_set][metric][0]
+         values_a = data_a[dimension_set][metric][0]
        baseline_mean = statistics.mean(values_a)

        relative_changes_by_metric[metric].append(result.statistic / baseline_mean)
@@ -309,7 +237,7 @@ def analyze_data(
    )

    messages = []
-     do_not_print_list = uninteresting_dimensions(processed_emf_a)
+     do_not_print_list = uninteresting_dimensions(data_a)
    for dimension_set, metric, result, unit in failures:
        # Sanity check as described above
        if abs(statistics.mean(relative_changes_by_metric[metric])) <= noise_threshold:
@@ -321,8 +249,8 @@ def analyze_data(

        # The significant data points themselves are above the noise threshold
        if abs(statistics.mean(relative_changes_significant[metric])) > noise_threshold:
-             old_mean = statistics.mean(processed_emf_a[dimension_set][metric][0])
-             new_mean = statistics.mean(processed_emf_b[dimension_set][metric][0])
+             old_mean = statistics.mean(data_a[dimension_set][metric][0])
+             new_mean = statistics.mean(data_b[dimension_set][metric][0])

            msg = (
                f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
@@ -393,13 +321,13 @@ def ab_performance_test(
        help="Analyze the results of two manually ran tests based on their test-report.json files",
    )
    analyze_parser.add_argument(
-         "report_a",
-         help="The path to the test-report.json file of the baseline run",
+         "path_a",
+         help="The path to the directory with the results of the A run",
        type=Path,
    )
    analyze_parser.add_argument(
-         "report_b",
-         help="The path to the test-report.json file of the run whose performance we want to compare against report_a",
+         "path_b",
+         help="The path to the directory with the results of the B run",
        type=Path,
    )
    parser.add_argument(
@@ -432,8 +360,8 @@ def ab_performance_test(
            args.noise_threshold,
        )
    else:
-         data_a = load_data_series(args.report_a)
-         data_b = load_data_series(args.report_b)
+         data_a = load_data_series(args.path_a)
+         data_b = load_data_series(args.path_b)

        analyze_data(
            data_a,