Skip to content

Commit c75cc20

Browse files
committed
Merge: [BART/PyT] Add synchronize for benchmarking
2 parents d46a356 + a797214 commit c75cc20

File tree

2 files changed

+4
-0
lines changed

2 files changed

+4
-0
lines changed

PyTorch/LanguageModeling/BART/run_eval.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ def generate_summaries_or_translations(
160 160     results = []
161 161     with torch.no_grad():
162 162         for batch in tqdm(data_loader):
    163+            torch.cuda.synchronize()
163 164             t0 = time.time()
164 165
165 166             summaries = model.generate(
@@ -180,6 +181,7 @@ def generate_summaries_or_translations(
180 181             if num_return_sequences > 1:
181 182                 preds = chunks(preds, num_return_sequences)  # batch size chunks, each of size num_return_seq
182 183
    184+            torch.cuda.synchronize()
183 185             eval_time = time.time() - t0
184 186             for i, pred in enumerate(preds):
185 187                 store_time = eval_time if i == 0 else None #only store latency for element 0 of every batch

PyTorch/LanguageModeling/BART/training_base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,9 +410,11 @@ def generic_train(
410 410         for batch in dataloader:
411 411             batch = {k: v.to(device) for k, v in batch.items()}
412 412             local_step += 1
    413+            torch.cuda.synchronize()
413 414             iter_start = time.time()
414 415
415 416             total_loss, logs = train_one_step(args, trainer, optimizer, scheduler, batch, local_step, scaler)
    417+            torch.cuda.synchronize()
416 418             train_perf = logs["bs"] * get_world_size() / (time.time() - iter_start)
417 419
418 420

0 commit comments

Comments (0)