1010"""
1111import argparse
1212import logging
13+ import random
14+ import string
1315import time
1416
17+ import traceback
18+ from datetime import datetime
19+ from functools import partial
20+
1521import numpy as np
1622import torch
1723import torch .profiler as profiler
1824
19- import traceback
20-
21- from torchbenchmark import load_canary_model_by_name , load_model_by_name , ModelNotFoundError
25+ from torchbenchmark import (
26+ load_canary_model_by_name ,
27+ load_model_by_name ,
28+ ModelNotFoundError ,
29+ )
2230from torchbenchmark .util .experiment .metrics import get_peak_memory
2331
32+
33+ if not hasattr (torch .version , "git_version" ):
34+ from pytorch .benchmark .fb .run_utils import trace_handler
35+
36+
2437WARMUP_ROUNDS = 3
2538SUPPORT_DEVICE_LIST = ["cpu" , "cuda" ]
2639if hasattr (torch .backends , "mps" ) and torch .backends .mps .is_available ():
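A note on the guarded import introduced above: open-source PyTorch builds expose `torch.version.git_version`, while Meta-internal (fbcode) builds do not, so its absence doubles as an internal-build check that gates the fbcode-only `trace_handler` import. A minimal sketch of the pattern, under that assumption:

```python
# Sketch of the build-detection gate used by the diff (assumption: OSS wheels
# carry torch.version.git_version, internal fbcode builds do not).
import torch

IS_INTERNAL_BUILD = not hasattr(torch.version, "git_version")

if IS_INTERNAL_BUILD:
    # fbcode-only helper; this import is unreachable in open-source checkouts
    from pytorch.benchmark.fb.run_utils import trace_handler
else:
    trace_handler = None  # OSS path: the TensorBoard handler is used instead
```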
@@ -171,7 +184,6 @@ def run_one_step(func, nwarmup=WARMUP_ROUNDS, num_iter=10, model=None, export_me
     printResultSummaryTime(result_summary, metrics_needed, model, flops_model_analyzer, cpu_peak_mem, mem_device_id, gpu_peak_mem)
 
 
-
 def profile_one_step(func, nwarmup=WARMUP_ROUNDS):
     activity_groups = []
     result_summary = []
@@ -214,7 +226,7 @@ def profile_one_step(func, nwarmup=WARMUP_ROUNDS):
         with_stack=args.profile_detailed if args.profile_detailed else profile_opts["with_stack"],
         with_flops=args.profile_detailed if args.profile_detailed else profile_opts["with_flops"],
         with_modules=args.profile_detailed if args.profile_detailed else profile_opts["with_modules"],
-        on_trace_ready=profiler.tensorboard_trace_handler(args.profile_folder)
+        on_trace_ready=partial(trace_handler, f"torchbench_{args.model}") if (not hasattr(torch.version, "git_version") and args.profile_export_chrome_trace) else profiler.tensorboard_trace_handler(args.profile_folder),
     ) as prof:
         if args.device == "cuda":
             start_event = torch.cuda.Event(enable_timing=True)
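The rewritten `on_trace_ready` line packs the handler choice into a single conditional expression: on internal builds with `--profile-export-chrome-trace` set, traces go to `trace_handler` via a `partial` that fixes the file prefix; otherwise the TensorBoard handler is used. A hypothetical helper (not in the file) that unpacks the same logic:

```python
from functools import partial

import torch
import torch.profiler as profiler


def choose_trace_ready(args, trace_handler=None):
    """Hypothetical refactor of the on_trace_ready conditional above."""
    is_internal_build = not hasattr(torch.version, "git_version")
    if is_internal_build and args.profile_export_chrome_trace:
        # Internal builds export Chrome traces named after the model.
        return partial(trace_handler, f"torchbench_{args.model}")
    # Default: write TensorBoard-compatible traces under --profile-folder.
    return profiler.tensorboard_trace_handler(args.profile_folder)
```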
@@ -265,32 +277,101 @@ def _validate_profile_options(profile_options: str):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(__doc__)
-    parser.add_argument("model", help="Full or partial name of a model to run. If partial, picks the first match.")
-    parser.add_argument("-d", "--device", choices=SUPPORT_DEVICE_LIST, default="cpu", help="Which device to use.")
-    parser.add_argument("-m", "--mode", choices=["eager", "jit"], default="eager", help="Which mode to run.")
-    parser.add_argument("-t", "--test", choices=["eval", "train"], default="eval", help="Which test to run.")
-    parser.add_argument("--profile", action="store_true", help="Run the profiler around the function")
-    parser.add_argument("--profile-options", type=_validate_profile_options, help=f"Select which profile options to enable. Valid options: {SUPPORT_PROFILE_LIST}.")
+    parser.add_argument(
+        "model",
+        help="Full or partial name of a model to run. If partial, picks the first match.",
+    )
+    parser.add_argument(
+        "-d",
+        "--device",
+        choices=SUPPORT_DEVICE_LIST,
+        default="cpu",
+        help="Which device to use.",
+    )
+    parser.add_argument(
+        "-m",
+        "--mode",
+        choices=["eager", "jit"],
+        default="eager",
+        help="Which mode to run.",
+    )
+    parser.add_argument(
+        "-t",
+        "--test",
+        choices=["eval", "train"],
+        default="eval",
+        help="Which test to run.",
+    )
+    parser.add_argument(
+        "--profile", action="store_true", help="Run the profiler around the function"
+    )
+    parser.add_argument(
+        "--profile-options",
+        type=_validate_profile_options,
+        help=f"Select which profile options to enable. Valid options: {SUPPORT_PROFILE_LIST}.",
+    )
     parser.add_argument("--amp", action="store_true", help="enable torch.autocast()")
-    parser.add_argument("--profile-folder", default="./logs", help="Save profiling model traces to this directory.")
-    parser.add_argument("--profile-detailed", action="store_true",
-                        help=f"Enable all profile options, including {SUPPORT_PROFILE_LIST}. Overrides --profile-options.")
-    parser.add_argument("--profile-devices", type=_validate_devices,
-                        help="Profile comma separated list of activities such as cpu,cuda.")
-    parser.add_argument("--profile-eg", action="store_true", help="Collect execution trace by PARAM")
-    parser.add_argument("--profile-eg-folder", default="./eg_logs",
-                        help="Save execution traces to this directory.")
-    parser.add_argument("--cudastreams", action="store_true",
-                        help="Utilization test using increasing number of cuda streams.")
+    parser.add_argument(
+        "--profile-folder",
+        default="./logs",
+        help="Save profiling model traces to this directory.",
+    )
+    parser.add_argument(
+        "--profile-detailed",
+        action="store_true",
+        help=f"Enable all profile options, including {SUPPORT_PROFILE_LIST}. Overrides --profile-options.",
+    )
+    parser.add_argument(
+        "--profile-export-chrome-trace",
+        action="store_true",
+        help="Export Chrome tracing files. (internal only)",
+    )
+    parser.add_argument(
+        "--profile-devices",
+        type=_validate_devices,
+        help="Profile comma separated list of activities such as cpu,cuda.",
+    )
+    parser.add_argument(
+        "--profile-eg", action="store_true", help="Collect execution trace by PARAM"
+    )
+    parser.add_argument(
+        "--profile-eg-folder",
+        default="./eg_logs",
+        help="Save execution traces to this directory.",
+    )
+    parser.add_argument(
+        "--cudastreams",
+        action="store_true",
+        help="Utilization test using increasing number of cuda streams.",
+    )
     parser.add_argument("--bs", type=int, help="Specify batch size to the test.")
-    parser.add_argument("--export-metrics", action="store_true",
-                        help="Export all specified metrics records to a csv file. The default csv file name is [model_name]_all_metrics.csv.")
-    parser.add_argument("--stress", type=float, default=0, help="Specify execution time (seconds) to stress devices.")
-    parser.add_argument("--metrics", type=str, default="cpu_peak_mem,gpu_peak_mem",
-                        help="Specify metrics [cpu_peak_mem,gpu_peak_mem,flops]to be collected. You can also set `none` to disable all metrics. The metrics are separated by comma such as cpu_peak_mem,gpu_peak_mem.")
-    parser.add_argument("--metrics-gpu-backend", choices=["dcgm", "default"], default="default", help="""Specify the backend [dcgm, default] to collect metrics. \n In default mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. Optionally,
-        \n - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process(). \n - you can specify gpu peak memory usage by --metrics gpu_peak_mem, and it is collected by nvml library.\n - you can specify flops by --metrics flops, and it is collected by fvcore.\n In dcgm mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. Optionally,\n - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process().\n - you can specify cpu and gpu peak memory usage by --metrics cpu_peak_mem,gpu_peak_mem, and they are collected by dcgm library.""")
-    parser.add_argument("--channels-last", action="store_true", help="enable torch.channels_last()")
+    parser.add_argument(
+        "--export-metrics",
+        action="store_true",
+        help="Export all specified metrics records to a csv file. The default csv file name is [model_name]_all_metrics.csv.",
+    )
+    parser.add_argument(
+        "--stress",
+        type=float,
+        default=0,
+        help="Specify execution time (seconds) to stress devices.",
+    )
+    parser.add_argument(
+        "--metrics",
+        type=str,
+        default="cpu_peak_mem,gpu_peak_mem",
+        help="Specify metrics [cpu_peak_mem,gpu_peak_mem,flops] to be collected. You can also set `none` to disable all metrics. The metrics are separated by comma such as cpu_peak_mem,gpu_peak_mem.",
+    )
+    parser.add_argument(
+        "--metrics-gpu-backend",
+        choices=["dcgm", "default"],
+        default="default",
+        help="""Specify the backend [dcgm, default] to collect metrics. \n In default mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. Optionally,
+        \n - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process(). \n - you can specify gpu peak memory usage by --metrics gpu_peak_mem, and it is collected by nvml library.\n - you can specify flops by --metrics flops, and it is collected by fvcore.\n In dcgm mode, the latency(execution time) is collected by time.time_ns() and it is always enabled. Optionally,\n - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process().\n - you can specify cpu and gpu peak memory usage by --metrics cpu_peak_mem,gpu_peak_mem, and they are collected by dcgm library.""",
+    )
+    parser.add_argument(
+        "--channels-last", action="store_true", help="enable torch.channels_last()"
+    )
     args, extra_args = parser.parse_known_args()
     if args.cudastreams and not args.device == "cuda":
         print("cuda device required to use --cudastreams option!")
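Note that `--accuracy`, checked further down, is deliberately not declared above: `parse_known_args()` returns unrecognized flags as a second list, which this script forwards to the model as `extra_args`. A self-contained illustration (model name and flags are placeholders):

```python
import argparse

p = argparse.ArgumentParser()
p.add_argument("model")
p.add_argument("-d", "--device", default="cpu")

# Unrecognized flags are not an error; they come back in the second element.
args, extra_args = p.parse_known_args(["some_model", "-d", "cuda", "--accuracy"])
assert args.device == "cuda"
assert extra_args == ["--accuracy"]
```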
@@ -313,19 +394,29 @@ def _validate_profile_options(profile_options: str):
         traceback.print_exc()
         exit(-1)
     except ModelNotFoundError:
-        print(f"Error: The model {args.model} cannot be found at either core or canary model set.")
+        print(
+            f"Error: The model {args.model} cannot be found at either core or canary model set."
+        )
         exit(-1)
 
-    m = Model(device=args.device, test=args.test, jit=(args.mode == "jit"), batch_size=args.bs, extra_args=extra_args)
+    m = Model(
+        device=args.device,
+        test=args.test,
+        jit=(args.mode == "jit"),
+        batch_size=args.bs,
+        extra_args=extra_args,
+    )
     if m.dynamo:
         mode = f"dynamo {m.opt_args.torchdynamo}"
     elif m.opt_args.backend:
         mode = f"{m.opt_args.backend}"
     else:
         mode = "eager"
-    print(f"Running {args.test} method from {Model.name} on {args.device} in {mode} mode with input batch size {m.batch_size} and precision {m.dargs.precision}.")
+    print(
+        f"Running {args.test} method from {Model.name} on {args.device} in {mode} mode with input batch size {m.batch_size} and precision {m.dargs.precision}."
+    )
     if "--accuracy" in extra_args:
-        print('{:<20} {:>20}'.format("Accuracy: ", str(m.accuracy)), sep='')
+        print("{:<20} {:>20}".format("Accuracy: ", str(m.accuracy)), sep="")
         exit(0)
 
     if args.channels_last:
@@ -334,25 +425,35 @@ def _validate_profile_options(profile_options: str):
     test = m.invoke
     if args.amp:
         test = torch.autocast(m.device)(test)
-    metrics_needed = [_ for _ in args.metrics.split(',') if _.strip()] if args.metrics else []
-    if 'none' in metrics_needed:
+    metrics_needed = (
+        [_ for _ in args.metrics.split(",") if _.strip()] if args.metrics else []
+    )
+    if "none" in metrics_needed:
         metrics_needed = []
     # only enabled gpu_peak_mem for cuda device
-    if args.device != 'cuda' and 'gpu_peak_mem' in metrics_needed:
-        metrics_needed.remove('gpu_peak_mem')
+    if args.device != "cuda" and "gpu_peak_mem" in metrics_needed:
+        metrics_needed.remove("gpu_peak_mem")
     metrics_needed = list(set(metrics_needed))
     metrics_gpu_backend = args.metrics_gpu_backend
     if metrics_needed:
-        if metrics_gpu_backend == 'dcgm':
+        if metrics_gpu_backend == "dcgm":
             from components.model_analyzer.TorchBenchAnalyzer import check_dcgm
+
             check_dcgm()
-        elif 'gpu_peak_mem' in metrics_needed:
+        elif "gpu_peak_mem" in metrics_needed:
             from components.model_analyzer.TorchBenchAnalyzer import check_nvml
+
             check_nvml()
-        if 'gpu_peak_mem' in metrics_needed or ('flops' in metrics_needed and metrics_gpu_backend == 'dcgm'):
-            assert args.device == 'cuda', "gpu_peak_mem and flops:dcgm are only available for cuda device."
-        if 'flops' in metrics_needed and metrics_gpu_backend == 'default':
-            assert hasattr(m, "get_flops"), f"The model {args.model} does not support calculating flops."
+        if "gpu_peak_mem" in metrics_needed or (
+            "flops" in metrics_needed and metrics_gpu_backend == "dcgm"
+        ):
+            assert (
+                args.device == "cuda"
+            ), "gpu_peak_mem and flops:dcgm are only available for cuda device."
+        if "flops" in metrics_needed and metrics_gpu_backend == "default":
+            assert hasattr(
+                m, "get_flops"
+            ), f"The model {args.model} does not support calculating flops."
             m.get_flops()
     if args.export_metrics:
         if not args.metrics:
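The metrics plumbing reformatted in this hunk follows three observable rules: an empty or `none` selection disables collection, `gpu_peak_mem` is dropped on non-CUDA devices, and duplicates are removed. A minimal sketch of those rules (`resolve_metrics` is a hypothetical name, not part of the file):

```python
def resolve_metrics(metrics_arg, device):
    """Mirror the --metrics handling above, as a pure function."""
    needed = [m for m in metrics_arg.split(",") if m.strip()] if metrics_arg else []
    if "none" in needed:
        return []
    if device != "cuda" and "gpu_peak_mem" in needed:
        needed.remove("gpu_peak_mem")
    return list(set(needed))  # deduplicate; order is not guaranteed


assert resolve_metrics("cpu_peak_mem,gpu_peak_mem", "cpu") == ["cpu_peak_mem"]
assert resolve_metrics("none", "cuda") == []
```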
@@ -366,12 +467,23 @@ def _validate_profile_options(profile_options: str):
     elif args.cudastreams:
         run_one_step_with_cudastreams(test, 10)
     else:
-        run_one_step(test, model=m, export_metrics_file=export_metrics_file,
-                     stress=args.stress, metrics_needed=metrics_needed, metrics_gpu_backend=args.metrics_gpu_backend)
+        run_one_step(
+            test,
+            model=m,
+            export_metrics_file=export_metrics_file,
+            stress=args.stress,
+            metrics_needed=metrics_needed,
+            metrics_gpu_backend=args.metrics_gpu_backend,
+        )
 
     # Print dynamo compilation metrics, if there are any.
     try:
         if m.pt2_compilation_time:
-            print('{:<20} {:>18}'.format("PT2 Compilation time: ", "%.3f seconds" % m.pt2_compilation_time), sep='')
+            print(
+                "{:<20} {:>18}".format(
+                    "PT2 Compilation time: ", "%.3f seconds" % m.pt2_compilation_time
+                ),
+                sep="",
+            )
     except:
         pass
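For reference, the column padding in the summary print above works like this (the timing value is illustrative):

```python
# "{:<20}" left-pads a short label to 20 columns (a no-op here, since the
# label is 22 chars); "{:>18}" right-aligns the value in an 18-column field.
line = "{:<20} {:>18}".format("PT2 Compilation time: ", "%.3f seconds" % 12.3456)
assert line.endswith("    12.346 seconds")  # 14-char value padded to 18
```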