Commit bfb6d15

dshi7 authored and facebook-github-bot committed
export chrome trace when profiler is enabled (#1760)
Summary:
Pull Request resolved: #1760

As title: export a Chrome trace when the profiler is enabled. Off by default; enable with `--profile-export-chrome-trace`.

Reviewed By: aaronenyeshi, xuzhao9

Differential Revision: D47234285

fbshipit-source-id: d211ac9908e45ea7ea80aba93c88a655ec024341
1 parent 48a8ef6 commit bfb6d15
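
For illustration, a hypothetical invocation on a Meta-internal build could look like the following (the model name is a placeholder; the new flag only takes effect on internal builds):

    python run.py resnet50 -d cuda --profile --profile-export-chrome-trace

On OSS builds, where torch.version.git_version exists, the condition guarding the new handler is always false, so traces keep going through profiler.tensorboard_trace_handler into --profile-folder (default ./logs).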


run.py

Lines changed: 158 additions & 46 deletions
@@ -10,17 +10,30 @@
 """
 import argparse
 import logging
+import random
+import string
 import time
 
+import traceback
+from datetime import datetime
+from functools import partial
+
 import numpy as np
 import torch
 import torch.profiler as profiler
 
-import traceback
-
-from torchbenchmark import load_canary_model_by_name, load_model_by_name, ModelNotFoundError
+from torchbenchmark import (
+    load_canary_model_by_name,
+    load_model_by_name,
+    ModelNotFoundError,
+)
 from torchbenchmark.util.experiment.metrics import get_peak_memory
 
+
+if not hasattr(torch.version, "git_version"):
+    from pytorch.benchmark.fb.run_utils import trace_handler
+
+
 WARMUP_ROUNDS = 3
 SUPPORT_DEVICE_LIST = ["cpu", "cuda"]
 if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
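
The new guard keys off torch.version.git_version, which is set in OSS builds of PyTorch but absent in Meta-internal builds, so the fbcode-only trace_handler is imported only where it exists. A minimal sketch of the same check:

    import torch

    # OSS wheels record the build's git commit; internal builds do not.
    if not hasattr(torch.version, "git_version"):
        print("internal build: trace_handler is available")
    else:
        print(f"OSS build at commit {torch.version.git_version}")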
@@ -171,7 +184,6 @@ def run_one_step(func, nwarmup=WARMUP_ROUNDS, num_iter=10, model=None, export_me
     printResultSummaryTime(result_summary, metrics_needed, model, flops_model_analyzer, cpu_peak_mem, mem_device_id, gpu_peak_mem)
 
 
-
 def profile_one_step(func, nwarmup=WARMUP_ROUNDS):
     activity_groups = []
     result_summary = []
@@ -214,7 +226,7 @@ def profile_one_step(func, nwarmup=WARMUP_ROUNDS):
         with_stack=args.profile_detailed if args.profile_detailed else profile_opts["with_stack"],
         with_flops=args.profile_detailed if args.profile_detailed else profile_opts["with_flops"],
         with_modules=args.profile_detailed if args.profile_detailed else profile_opts["with_modules"],
-        on_trace_ready=profiler.tensorboard_trace_handler(args.profile_folder)
+        on_trace_ready=partial(trace_handler, f"torchbench_{args.model}") if (not hasattr(torch.version, "git_version") and args.profile_export_chrome_trace) else profiler.tensorboard_trace_handler(args.profile_folder),
     ) as prof:
         if args.device == "cuda":
             start_event = torch.cuda.Event(enable_timing=True)
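
The internal trace_handler itself is not part of this diff. Two things about the new on_trace_ready line are worth noting. First, torch.profiler invokes the callback with the profiler instance as its only argument, which is why functools.partial pre-binds the f"torchbench_{args.model}" name prefix. Second, in OSS PyTorch the closest equivalent is a callback that calls export_chrome_trace on the profiler. A self-contained sketch under those assumptions (handler and file naming are illustrative, not the internal implementation):

    from functools import partial

    import torch
    import torch.profiler as profiler

    def chrome_trace_handler(name_prefix, prof):
        # Called by the profiler once the trace is ready; writes a JSON
        # trace viewable in chrome://tracing or Perfetto.
        prof.export_chrome_trace(f"{name_prefix}.json")

    with profiler.profile(
        activities=[profiler.ProfilerActivity.CPU],
        # partial pre-binds the prefix, leaving only the profiler argument.
        on_trace_ready=partial(chrome_trace_handler, "torchbench_demo"),
    ) as prof:
        torch.randn(512, 512) @ torch.randn(512, 512)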
@@ -265,32 +277,101 @@ def _validate_profile_options(profile_options: str):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(__doc__)
-    parser.add_argument("model", help="Full or partial name of a model to run. If partial, picks the first match.")
-    parser.add_argument("-d", "--device", choices=SUPPORT_DEVICE_LIST, default="cpu", help="Which device to use.")
-    parser.add_argument("-m", "--mode", choices=["eager", "jit"], default="eager", help="Which mode to run.")
-    parser.add_argument("-t", "--test", choices=["eval", "train"], default="eval", help="Which test to run.")
-    parser.add_argument("--profile", action="store_true", help="Run the profiler around the function")
-    parser.add_argument("--profile-options", type=_validate_profile_options, help=f"Select which profile options to enable. Valid options: {SUPPORT_PROFILE_LIST}.")
+    parser.add_argument(
+        "model",
+        help="Full or partial name of a model to run. If partial, picks the first match.",
+    )
+    parser.add_argument(
+        "-d",
+        "--device",
+        choices=SUPPORT_DEVICE_LIST,
+        default="cpu",
+        help="Which device to use.",
+    )
+    parser.add_argument(
+        "-m",
+        "--mode",
+        choices=["eager", "jit"],
+        default="eager",
+        help="Which mode to run.",
+    )
+    parser.add_argument(
+        "-t",
+        "--test",
+        choices=["eval", "train"],
+        default="eval",
+        help="Which test to run.",
+    )
+    parser.add_argument(
+        "--profile", action="store_true", help="Run the profiler around the function"
+    )
+    parser.add_argument(
+        "--profile-options",
+        type=_validate_profile_options,
+        help=f"Select which profile options to enable. Valid options: {SUPPORT_PROFILE_LIST}.",
+    )
     parser.add_argument("--amp", action="store_true", help="enable torch.autocast()")
-    parser.add_argument("--profile-folder", default="./logs", help="Save profiling model traces to this directory.")
-    parser.add_argument("--profile-detailed", action="store_true",
-                        help=f"Enable all profile options, including {SUPPORT_PROFILE_LIST}. Overrides --profile-options.")
-    parser.add_argument("--profile-devices", type=_validate_devices,
-                        help="Profile comma separated list of activities such as cpu,cuda.")
-    parser.add_argument("--profile-eg", action="store_true", help="Collect execution trace by PARAM")
-    parser.add_argument("--profile-eg-folder", default="./eg_logs",
-                        help="Save execution traces to this directory.")
-    parser.add_argument("--cudastreams", action="store_true",
-                        help="Utilization test using increasing number of cuda streams.")
+    parser.add_argument(
+        "--profile-folder",
+        default="./logs",
+        help="Save profiling model traces to this directory.",
+    )
+    parser.add_argument(
+        "--profile-detailed",
+        action="store_true",
+        help=f"Enable all profile options, including {SUPPORT_PROFILE_LIST}. Overrides --profile-options.",
+    )
+    parser.add_argument(
+        "--profile-export-chrome-trace",
+        action="store_true",
+        help="Export Chrome tracing files. (internal only)",
+    )
+    parser.add_argument(
+        "--profile-devices",
+        type=_validate_devices,
+        help="Profile comma separated list of activities such as cpu,cuda.",
+    )
+    parser.add_argument(
+        "--profile-eg", action="store_true", help="Collect execution trace by PARAM"
+    )
+    parser.add_argument(
+        "--profile-eg-folder",
+        default="./eg_logs",
+        help="Save execution traces to this directory.",
+    )
+    parser.add_argument(
+        "--cudastreams",
+        action="store_true",
+        help="Utilization test using increasing number of cuda streams.",
+    )
     parser.add_argument("--bs", type=int, help="Specify batch size to the test.")
-    parser.add_argument("--export-metrics", action="store_true",
-                        help="Export all specified metrics records to a csv file. The default csv file name is [model_name]_all_metrics.csv.")
-    parser.add_argument("--stress", type=float, default=0, help="Specify execution time (seconds) to stress devices.")
-    parser.add_argument("--metrics", type=str, default="cpu_peak_mem,gpu_peak_mem",
-                        help="Specify metrics [cpu_peak_mem,gpu_peak_mem,flops]to be collected. You can also set `none` to disable all metrics. The metrics are separated by comma such as cpu_peak_mem,gpu_peak_mem.")
-    parser.add_argument("--metrics-gpu-backend", choices=["dcgm", "default"], default="default", help="""Specify the backend [dcgm, default] to collect metrics. \nIn default mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. Optionally,
-\n - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process(). \n - you can specify gpu peak memory usage by --metrics gpu_peak_mem, and it is collected by nvml library.\n - you can specify flops by --metrics flops, and it is collected by fvcore.\nIn dcgm mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. Optionally,\n - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process().\n - you can specify cpu and gpu peak memory usage by --metrics cpu_peak_mem,gpu_peak_mem, and they are collected by dcgm library.""")
-    parser.add_argument("--channels-last", action="store_true", help="enable torch.channels_last()")
+    parser.add_argument(
+        "--export-metrics",
+        action="store_true",
+        help="Export all specified metrics records to a csv file. The default csv file name is [model_name]_all_metrics.csv.",
+    )
+    parser.add_argument(
+        "--stress",
+        type=float,
+        default=0,
+        help="Specify execution time (seconds) to stress devices.",
+    )
+    parser.add_argument(
+        "--metrics",
+        type=str,
+        default="cpu_peak_mem,gpu_peak_mem",
+        help="Specify metrics [cpu_peak_mem,gpu_peak_mem,flops]to be collected. You can also set `none` to disable all metrics. The metrics are separated by comma such as cpu_peak_mem,gpu_peak_mem.",
+    )
+    parser.add_argument(
+        "--metrics-gpu-backend",
+        choices=["dcgm", "default"],
+        default="default",
+        help="""Specify the backend [dcgm, default] to collect metrics. \nIn default mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. Optionally,
+\n - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process(). \n - you can specify gpu peak memory usage by --metrics gpu_peak_mem, and it is collected by nvml library.\n - you can specify flops by --metrics flops, and it is collected by fvcore.\nIn dcgm mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. Optionally,\n - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process().\n - you can specify cpu and gpu peak memory usage by --metrics cpu_peak_mem,gpu_peak_mem, and they are collected by dcgm library.""",
+    )
+    parser.add_argument(
+        "--channels-last", action="store_true", help="enable torch.channels_last()"
+    )
     args, extra_args = parser.parse_known_args()
     if args.cudastreams and not args.device == "cuda":
         print("cuda device required to use --cudastreams option!")
@@ -313,19 +394,29 @@ def _validate_profile_options(profile_options: str):
         traceback.print_exc()
         exit(-1)
     except ModelNotFoundError:
-        print(f"Error: The model {args.model} cannot be found at either core or canary model set.")
+        print(
+            f"Error: The model {args.model} cannot be found at either core or canary model set."
+        )
         exit(-1)
 
-    m = Model(device=args.device, test=args.test, jit=(args.mode == "jit"), batch_size=args.bs, extra_args=extra_args)
+    m = Model(
+        device=args.device,
+        test=args.test,
+        jit=(args.mode == "jit"),
+        batch_size=args.bs,
+        extra_args=extra_args,
+    )
     if m.dynamo:
         mode = f"dynamo {m.opt_args.torchdynamo}"
     elif m.opt_args.backend:
         mode = f"{m.opt_args.backend}"
     else:
         mode = "eager"
-    print(f"Running {args.test} method from {Model.name} on {args.device} in {mode} mode with input batch size {m.batch_size} and precision {m.dargs.precision}.")
+    print(
+        f"Running {args.test} method from {Model.name} on {args.device} in {mode} mode with input batch size {m.batch_size} and precision {m.dargs.precision}."
+    )
     if "--accuracy" in extra_args:
-        print('{:<20} {:>20}'.format("Accuracy: ", str(m.accuracy)), sep='')
+        print("{:<20} {:>20}".format("Accuracy: ", str(m.accuracy)), sep="")
         exit(0)
 
     if args.channels_last:
@@ -334,25 +425,35 @@ def _validate_profile_options(profile_options: str):
     test = m.invoke
     if args.amp:
         test = torch.autocast(m.device)(test)
-    metrics_needed = [_ for _ in args.metrics.split(',') if _.strip()] if args.metrics else []
-    if 'none' in metrics_needed:
+    metrics_needed = (
+        [_ for _ in args.metrics.split(",") if _.strip()] if args.metrics else []
+    )
+    if "none" in metrics_needed:
         metrics_needed = []
     # only enabled gpu_peak_mem for cuda device
-    if args.device != 'cuda' and 'gpu_peak_mem' in metrics_needed:
-        metrics_needed.remove('gpu_peak_mem')
+    if args.device != "cuda" and "gpu_peak_mem" in metrics_needed:
+        metrics_needed.remove("gpu_peak_mem")
     metrics_needed = list(set(metrics_needed))
     metrics_gpu_backend = args.metrics_gpu_backend
     if metrics_needed:
-        if metrics_gpu_backend == 'dcgm':
+        if metrics_gpu_backend == "dcgm":
             from components.model_analyzer.TorchBenchAnalyzer import check_dcgm
+
             check_dcgm()
-        elif 'gpu_peak_mem' in metrics_needed:
+        elif "gpu_peak_mem" in metrics_needed:
             from components.model_analyzer.TorchBenchAnalyzer import check_nvml
+
             check_nvml()
-        if 'gpu_peak_mem' in metrics_needed or ('flops' in metrics_needed and metrics_gpu_backend == 'dcgm'):
-            assert args.device == 'cuda', "gpu_peak_mem and flops:dcgm are only available for cuda device."
-        if 'flops' in metrics_needed and metrics_gpu_backend == 'default':
-            assert hasattr(m, "get_flops"), f"The model {args.model} does not support calculating flops."
+        if "gpu_peak_mem" in metrics_needed or (
+            "flops" in metrics_needed and metrics_gpu_backend == "dcgm"
+        ):
+            assert (
+                args.device == "cuda"
+            ), "gpu_peak_mem and flops:dcgm are only available for cuda device."
+        if "flops" in metrics_needed and metrics_gpu_backend == "default":
+            assert hasattr(
+                m, "get_flops"
+            ), f"The model {args.model} does not support calculating flops."
             m.get_flops()
     if args.export_metrics:
         if not args.metrics:
@@ -366,12 +467,23 @@ def _validate_profile_options(profile_options: str):
     elif args.cudastreams:
         run_one_step_with_cudastreams(test, 10)
     else:
-        run_one_step(test, model=m, export_metrics_file=export_metrics_file,
-                     stress=args.stress, metrics_needed=metrics_needed, metrics_gpu_backend=args.metrics_gpu_backend)
+        run_one_step(
+            test,
+            model=m,
+            export_metrics_file=export_metrics_file,
+            stress=args.stress,
+            metrics_needed=metrics_needed,
+            metrics_gpu_backend=args.metrics_gpu_backend,
+        )
 
     # Print dynamo compilation metrics, if there are any.
     try:
         if m.pt2_compilation_time:
-            print('{:<20} {:>18}'.format("PT2 Compilation time: ", "%.3f seconds" % m.pt2_compilation_time), sep='')
+            print(
+                "{:<20} {:>18}".format(
+                    "PT2 Compilation time: ", "%.3f seconds" % m.pt2_compilation_time
+                ),
+                sep="",
+            )
     except:
         pass
