@@ -142,6 +142,7 @@ def evaluate(epoch, step, val_loader, val_feat_proc, labels, model,
142142 continue
143143
144144 model.eval()
145+ torch.cuda.synchronize()
145146 start_time = time.time()
146147 agg = {'losses': [], 'preds': [], 'txts': []}
147148
@@ -166,6 +167,7 @@ def evaluate(epoch, step, val_loader, val_feat_proc, labels, model,
166167 agg['txts'] += helpers.gather_transcripts([txt], [txt_lens], labels)
167168
168169 wer, loss = process_evaluation_epoch(agg)
170+ torch.cuda.synchronize()
169171 log(() if epoch is None else (epoch,),
170172 step, subset, {'loss': loss, 'wer': 100.0 * wer,
171173 'took': time.time() - start_time})
@@ -379,11 +381,11 @@ def main():
379381 if multi_gpu and not use_dali :
380382 train_loader.sampler.set_epoch(epoch)
381383
384+ torch.cuda.synchronize()
385+ epoch_start_time = time.time()
382386 epoch_utts = 0
383387 epoch_loss = 0
384388 accumulated_batches = 0
385- epoch_start_time = time.time()
386- epoch_eval_time = 0
387389
388390 for batch in train_loader:
389391
@@ -461,7 +463,6 @@ def main():
461463 step_start_time = time.time()
462464
463465 if step % args.eval_frequency == 0:
464- tik = time.time()
465466 wer = evaluate(epoch, step, val_loader, val_feat_proc,
466467 symbols, model, ema_model, ctc_loss,
467468 greedy_decoder, args.amp, use_dali)
@@ -470,7 +471,6 @@ def main():
470471 checkpointer.save(model, ema_model, optimizer, scaler,
471472 epoch, step, best_wer, is_best=True)
472473 best_wer = wer
473- epoch_eval_time += time.time() - tik
474474
475475 step += 1
476476 accumulated_batches = 0
@@ -481,6 +481,7 @@ def main():
481481 if not use_dali and step > steps_per_epoch * epoch:
482482 break
483483
484+ torch.cuda.synchronize()
484485 epoch_time = time.time() - epoch_start_time
485486 epoch_loss /= steps_per_epoch
486487 log((epoch,), None, 'train_avg', {'throughput': epoch_utts / epoch_time,
0 commit comments