Skip to content

Commit f82a322

Browse files
authored
Merge pull request #2974 from NVIDIA/merge-rel-8.6-into-main
Merge release/8.6 into main
2 parents c89bc83 + e42fc0d commit f82a322

File tree

6 files changed

+155
-141
lines changed

6 files changed

+155
-141
lines changed

demo/BERT/README.md

Lines changed: 119 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -424,86 +424,83 @@ The following sections provide details on how we achieved our performance and in
424424

425425
Results were obtained by running `scripts/inference_benchmark.sh --gpu Ampere` on NVIDIA A100 (40G).
426426

427-
> NOTE: We observed a few regression cases against the performance with TRT-8.4.3 with small batch sizes. The regression issues are under investigation.
428-
429427
##### BERT Base
430428

431429
| Sequence Length | Batch Size | INT8 Latency (ms) | | | FP16 Latency (ms) | | |
432430
|-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
433431
| | | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
434-
| 128 | 1 | 0.55 | 0.7 | 0.55 | 0.8 | 0.8 | 0.64 |
435-
| 128 | 2 | 0.77 | 0.77 | 0.61 | 0.93 | 0.94 | 0.75 |
436-
| 128 | 4 | 0.83 | 0.83 | 0.76 | 0.95 | 1.22 | 0.95 |
437-
| 128 | 8 | 1.21 | 1.22 | 0.96 | 1.35 | 1.36 | 1.36 |
438-
| 128 | 12 | 1.23 | 1.43 | 1.23 | 1.85 | 1.86 | 1.84 |
439-
| 128 | 16 | 1.42 | 1.82 | 1.42 | 2.13 | 2.14 | 2.12 |
440-
| 128 | 24 | 1.86 | 1.88 | 1.86 | 3.18 | 3.24 | 3.17 |
441-
| 128 | 32 | 2.3 | 2.31 | 2.3 | 4.1 | 4.14 | 4.06 |
442-
| 128 | 64 | 4.26 | 4.26 | 4.23 | 8.08 | 8.15 | 8.05 |
443-
| 128 | 128 | 8.24 | 8.3 | 8.22 | 16.07 | 16.1 | 15.91 |
444-
| 384 | 1 | 1.14 | 1.15 | 1.14 | 1.28 | 1.64 | 1.28 |
445-
| 384 | 2 | 1.33 | 1.7 | 1.33 | 1.59 | 1.59 | 1.59 |
446-
| 384 | 4 | 1.69 | 1.7 | 1.69 | 2.25 | 2.27 | 2.25 |
447-
| 384 | 8 | 2.25 | 2.25 | 2.24 | 3.51 | 3.52 | 3.48 |
448-
| 384 | 12 | 3.38 | 3.39 | 3.38 | 4.99 | 5.08 | 4.97 |
449-
| 384 | 16 | 4.16 | 4.17 | 4.15 | 6.73 | 6.73 | 6.64 |
450-
| 384 | 24 | 5.86 | 5.87 | 5.86 | 9.81 | 9.82 | 9.69 |
451-
| 384 | 32 | 7.82 | 7.83 | 7.81 | 13.45 | 13.53 | 13.39 |
452-
| 384 | 64 | 15.25 | 15.35 | 15.21 | 25.98 | 26.05 | 25.8 |
453-
| 384 | 128 | 29.8 | 29.82 | 29.53 | 50.98 | 51.1 | 50.59 |
432+
| 128 | 1 | 0.55 | 0.70 | 0.55 | 0.61 | 0.78 | 0.62 |
433+
| 128 | 2 | 0.78 | 0.78 | 0.62 | 0.72 | 0.92 | 0.73 |
434+
| 128 | 4 | 0.74 | 0.93 | 0.74 | 0.93 | 0.93 | 0.93 |
435+
| 128 | 8 | 0.95 | 0.95 | 0.94 | 1.31 | 1.31 | 1.31 |
436+
| 128 | 12 | 1.21 | 1.53 | 1.22 | 1.73 | 1.77 | 1.72 |
437+
| 128 | 16 | 1.34 | 1.34 | 1.34 | 2.09 | 2.10 | 2.07 |
438+
| 128 | 24 | 1.84 | 1.84 | 1.84 | 3.07 | 3.09 | 3.03 |
439+
| 128 | 32 | 2.27 | 2.27 | 2.26 | 3.93 | 3.94 | 3.90 |
440+
| 128 | 64 | 4.21 | 4.25 | 4.18 | 7.79 | 7.80 | 7.72 |
441+
| 128 | 128 | 8.25 | 8.26 | 8.14 | 15.41 | 15.42 | 15.27 |
442+
| 384 | 1 | 1.14 | 1.46 | 1.14 | 1.26 | 1.26 | 1.25 |
443+
| 384 | 2 | 1.31 | 1.31 | 1.31 | 1.55 | 1.55 | 1.55 |
444+
| 384 | 4 | 1.67 | 1.67 | 1.67 | 2.13 | 2.17 | 2.13 |
445+
| 384 | 8 | 2.22 | 2.22 | 2.22 | 3.36 | 3.39 | 3.35 |
446+
| 384 | 12 | 3.34 | 3.35 | 3.34 | 4.84 | 4.88 | 4.79 |
447+
| 384 | 16 | 4.04 | 4.04 | 4.04 | 6.40 | 6.46 | 6.39 |
448+
| 384 | 24 | 5.76 | 5.76 | 5.74 | 9.54 | 9.66 | 9.44 |
449+
| 384 | 32 | 7.71 | 7.71 | 7.70 | 13.02 | 13.03 | 12.90 |
450+
| 384 | 64 | 15.01 | 15.01 | 14.91 | 25.25 | 25.26 | 24.89 |
451+
| 384 | 128 | 29.26 | 29.26 | 29.13 | 49.12 | 49.25 | 48.81 |
454452

455453
##### BERT Large
456454

457455
| Sequence Length | Batch Size | INT8 Latency (ms) | | | FP16 Latency (ms) | | |
458456
|-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
459457
| | | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
460-
| 128 | 1 | 1.25 | 1.57 | 1.25 | 1.67 | 1.7 | 1.67 |
461-
| 128 | 2 | 1.44 | 1.45 | 1.44 | 1.88 | 1.9 | 1.88 |
462-
| 128 | 4 | 2.0 | 2.01 | 2.0 | 2.72 | 2.73 | 2.71 |
463-
| 128 | 8 | 2.73 | 2.74 | 2.73 | 4.4 | 4.41 | 4.38 |
464-
| 128 | 12 | 3.44 | 3.45 | 3.44 | 5.25 | 5.25 | 5.2 |
465-
| 128 | 16 | 4.07 | 4.08 | 4.06 | 7.37 | 7.39 | 7.32 |
466-
| 128 | 24 | 5.31 | 5.32 | 5.3 | 10.02 | 10.1 | 9.97 |
467-
| 128 | 32 | 7.14 | 7.15 | 7.09 | 13.77 | 13.8 | 13.68 |
468-
| 128 | 64 | 13.19 | 13.2 | 13.06 | 26.03 | 26.05 | 25.77 |
469-
| 128 | 128 | 25.62 | 25.65 | 25.39 | 51.59 | 51.72 | 51.2 |
470-
| 384 | 1 | 2.84 | 2.85 | 2.84 | 3.06 | 3.08 | 3.06 |
471-
| 384 | 2 | 3.05 | 3.06 | 3.05 | 4.08 | 4.31 | 4.08 |
472-
| 384 | 4 | 4.37 | 4.38 | 4.36 | 5.85 | 5.87 | 5.85 |
473-
| 384 | 8 | 7.24 | 7.25 | 7.22 | 11.46 | 11.55 | 11.41 |
474-
| 384 | 12 | 9.35 | 9.38 | 9.34 | 16.15 | 16.15 | 16.0 |
475-
| 384 | 16 | 12.38 | 12.4 | 12.37 | 22.06 | 22.12 | 21.86 |
476-
| 384 | 24 | 17.93 | 18.1 | 17.82 | 32.42 | 32.54 | 32.17 |
477-
| 384 | 32 | 23.29 | 23.3 | 23.13 | 42.78 | 42.9 | 42.52 |
478-
| 384 | 64 | 45.6 | 45.62 | 45.29 | 83.5 | 83.68 | 82.86 |
479-
| 384 | 128 | 89.73 | 89.81 | 89.04 | 163.72 | 164.23 | 162.67 |
458+
| 128 | 1 | 1.24 | 1.25 | 1.24 | 1.58 | 1.60 | 1.58 |
459+
| 128 | 2 | 1.44 | 1.44 | 1.44 | 1.83 | 1.84 | 1.82 |
460+
| 128 | 4 | 1.78 | 1.79 | 1.78 | 2.54 | 2.54 | 2.53 |
461+
| 128 | 8 | 2.82 | 2.82 | 2.81 | 3.98 | 4.00 | 3.97 |
462+
| 128 | 12 | 3.11 | 3.11 | 3.11 | 5.08 | 5.12 | 5.04 |
463+
| 128 | 16 | 4.06 | 4.07 | 4.06 | 6.96 | 6.96 | 6.91 |
464+
| 128 | 24 | 5.31 | 5.32 | 5.31 | 9.69 | 9.70 | 9.63 |
465+
| 128 | 32 | 7.07 | 7.07 | 7.02 | 13.11 | 13.12 | 12.93 |
466+
| 128 | 64 | 12.97 | 13.08 | 12.89 | 24.94 | 25.22 | 24.74 |
467+
| 128 | 128 | 25.48 | 25.72 | 25.28 | 49.30 | 49.46 | 49.18 |
468+
| 384 | 1 | 2.59 | 2.59 | 2.59 | 2.98 | 2.99 | 2.98 |
469+
| 384 | 2 | 3.04 | 3.05 | 3.04 | 4.01 | 4.03 | 4.00 |
470+
| 384 | 4 | 4.03 | 4.04 | 4.03 | 5.79 | 5.79 | 5.73 |
471+
| 384 | 8 | 7.20 | 7.22 | 7.20 | 11.11 | 11.14 | 10.99 |
472+
| 384 | 12 | 9.19 | 9.20 | 9.19 | 15.47 | 15.63 | 15.39 |
473+
| 384 | 16 | 12.36 | 12.38 | 12.35 | 21.18 | 21.19 | 21.00 |
474+
| 384 | 24 | 17.77 | 17.95 | 17.68 | 31.41 | 31.42 | 30.90 |
475+
| 384 | 32 | 23.36 | 23.37 | 23.20 | 41.40 | 41.43 | 40.90 |
476+
| 384 | 64 | 45.60 | 45.61 | 45.26 | 80.07 | 80.25 | 79.50 |
477+
| 384 | 128 | 89.25 | 89.30 | 88.57 | 157.38 | 157.76 | 156.31 |
480478

481479
##### Megatron Large with Sparsity
482480

483481
| Sequence Length | Batch Size | INT8 QAT Latency (ms) | | |
484482
|-----------------|------------|-----------------|-----------------|---------|
485483
| | | 95th Percentile | 99th Percentile | Average |
486-
| 128 | 1 | 1.14 | 1.44 | 1.14 |
487-
| 128 | 2 | 1.45 | 1.46 | 1.45 |
488-
| 128 | 4 | 1.8 | 1.8 | 1.8 |
489-
| 128 | 8 | 2.57 | 2.57 | 2.56 |
490-
| 128 | 12 | 3.16 | 3.17 | 3.16 |
491-
| 128 | 16 | 4.08 | 4.09 | 4.08 |
492-
| 128 | 24 | 5.07 | 5.08 | 5.07 |
493-
| 128 | 32 | 6.93 | 6.95 | 6.88 |
494-
| 128 | 64 | 11.73 | 11.74 | 11.71 |
495-
| 128 | 128 | 21.47 | 21.48 | 21.28 |
496-
| 384 | 1 | 1.72 | 1.73 | 1.72 |
497-
| 384 | 2 | 2.26 | 2.27 | 2.26 |
498-
| 384 | 4 | 3.68 | 3.69 | 3.68 |
499-
| 384 | 8 | 5.92 | 5.93 | 5.91 |
500-
| 384 | 12 | 8.27 | 8.28 | 8.26 |
501-
| 384 | 16 | 10.46 | 10.47 | 10.45 |
502-
| 384 | 24 | 14.77 | 14.78 | 14.75 |
503-
| 384 | 32 | 18.82 | 18.83 | 18.8 |
504-
| 384 | 64 | 36.16 | 36.19 | 35.88 |
505-
| 384 | 128 | 69.07 | 69.32 | 68.61 |
506-
484+
| 128 | 1 | 1.29 | 1.54 | 1.29 |
485+
| 128 | 2 | 1.35 | 1.71 | 1.35 |
486+
| 128 | 4 | 1.79 | 2.14 | 1.79 |
487+
| 128 | 8 | 2.54 | 2.54 | 2.53 |
488+
| 128 | 12 | 2.93 | 2.93 | 2.92 |
489+
| 128 | 16 | 3.95 | 3.95 | 3.94 |
490+
| 128 | 24 | 4.93 | 4.94 | 4.92 |
491+
| 128 | 32 | 7.13 | 7.14 | 7.12 |
492+
| 128 | 64 | 11.64 | 11.64 | 11.62 |
493+
| 128 | 128 | 21.29 | 21.46 | 21.16 |
494+
| 384 | 1 | 1.71 | 1.72 | 1.71 |
495+
| 384 | 2 | 2.24 | 2.25 | 2.23 |
496+
| 384 | 4 | 3.43 | 3.44 | 3.43 |
497+
| 384 | 8 | 5.77 | 5.77 | 5.76 |
498+
| 384 | 12 | 8.39 | 8.39 | 8.37 |
499+
| 384 | 16 | 10.38 | 10.39 | 10.36 |
500+
| 384 | 24 | 14.69 | 14.70 | 14.67 |
501+
| 384 | 32 | 18.68 | 18.82 | 18.66 |
502+
| 384 | 64 | 35.88 | 35.89 | 35.70 |
503+
| 384 | 128 | 68.71 | 68.73 | 68.16 |
507504

508505
#### Inference performance: NVIDIA A30
509506

@@ -514,76 +511,76 @@ Results were obtained by running `scripts/inference_benchmark.sh --gpu Ampere` o
514511
| Sequence Length | Batch Size | INT8 Latency (ms) | | | FP16 Latency (ms) | | |
515512
|-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
516513
| | | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
517-
| 128 | 1 | 0.59 | 0.89 | 0.6 | 1.19 | 1.19 | 0.82 |
518-
| 128 | 2 | 0.75 | 1.13 | 0.75 | 1.01 | 1.01 | 1.01 |
519-
| 128 | 4 | 1.04 | 1.04 | 1.04 | 1.52 | 1.53 | 1.51 |
520-
| 128 | 8 | 1.47 | 1.48 | 1.45 | 2.48 | 2.5 | 2.48 |
521-
| 128 | 12 | 1.97 | 1.97 | 1.94 | 3.59 | 3.66 | 3.54 |
522-
| 128 | 16 | 2.42 | 2.43 | 2.4 | 4.49 | 4.51 | 4.44 |
523-
| 128 | 24 | 3.58 | 3.61 | 3.52 | 6.89 | 7.01 | 6.82 |
524-
| 128 | 32 | 4.5 | 4.55 | 4.49 | 8.76 | 8.79 | 8.67 |
525-
| 128 | 64 | 8.74 | 8.82 | 8.68 | 17.4 | 17.41 | 17.23 |
526-
| 128 | 128 | 17.01 | 17.2 | 16.88 | 34.0 | 34.32 | 33.86 |
527-
| 384 | 1 | 1.31 | 1.7 | 1.32 | 1.66 | 1.67 | 1.66 |
528-
| 384 | 2 | 1.66 | 1.66 | 1.66 | 2.39 | 2.4 | 2.36 |
529-
| 384 | 4 | 2.3 | 2.31 | 2.29 | 3.9 | 3.96 | 3.87 |
530-
| 384 | 8 | 4.34 | 4.35 | 4.28 | 7.62 | 7.67 | 7.5 |
531-
| 384 | 12 | 6.17 | 6.24 | 6.11 | 10.68 | 10.76 | 10.59 |
532-
| 384 | 16 | 8.25 | 8.27 | 8.18 | 14.58 | 14.67 | 14.53 |
533-
| 384 | 24 | 11.96 | 12.04 | 11.93 | 21.5 | 21.53 | 21.26 |
534-
| 384 | 32 | 15.76 | 15.77 | 15.64 | 28.35 | 28.5 | 28.07 |
535-
| 384 | 64 | 31.09 | 31.34 | 30.93 | 54.91 | 55.46 | 54.69 |
536-
| 384 | 128 | 61.67 | 62.0 | 60.93 | 108.85 | 109.18 | 108.18 |
514+
| 128 | 1 | 0.91 | 0.92 | 0.62 | 1.18 | 1.18 | 0.82 |
515+
| 128 | 2 | 1.13 | 1.13 | 0.77 | 1.07 | 1.07 | 0.97 |
516+
| 128 | 4 | 1.04 | 1.57 | 1.05 | 1.46 | 2.11 | 1.44 |
517+
| 128 | 8 | 1.46 | 1.49 | 1.44 | 2.41 | 2.41 | 2.40 |
518+
| 128 | 12 | 1.94 | 1.94 | 1.94 | 3.42 | 3.45 | 3.40 |
519+
| 128 | 16 | 2.40 | 2.46 | 2.37 | 4.33 | 4.41 | 4.28 |
520+
| 128 | 24 | 3.54 | 3.59 | 3.48 | 6.59 | 6.60 | 6.50 |
521+
| 128 | 32 | 4.46 | 4.50 | 4.43 | 8.49 | 8.55 | 8.37 |
522+
| 128 | 64 | 8.68 | 8.75 | 8.57 | 16.65 | 16.67 | 16.47 |
523+
| 128 | 128 | 16.81 | 16.83 | 16.63 | 32.40 | 32.52 | 32.04 |
524+
| 384 | 1 | 1.31 | 1.32 | 1.31 | 1.62 | 1.64 | 1.63 |
525+
| 384 | 2 | 1.66 | 1.66 | 1.66 | 2.27 | 2.27 | 2.26 |
526+
| 384 | 4 | 2.32 | 2.32 | 2.30 | 3.79 | 3.87 | 3.72 |
527+
| 384 | 8 | 4.26 | 4.26 | 4.24 | 7.26 | 7.31 | 7.17 |
528+
| 384 | 12 | 6.10 | 6.13 | 6.04 | 10.35 | 10.43 | 10.23 |
529+
| 384 | 16 | 8.17 | 8.18 | 8.08 | 13.93 | 14.05 | 13.85 |
530+
| 384 | 24 | 11.91 | 11.98 | 11.82 | 20.46 | 20.57 | 20.25 |
531+
| 384 | 32 | 15.50 | 15.64 | 15.48 | 27.06 | 27.17 | 26.81 |
532+
| 384 | 64 | 31.03 | 31.18 | 30.63 | 52.44 | 52.48 | 52.05 |
533+
| 384 | 128 | 61.10 | 61.13 | 60.50 | 103.38 | 103.64 | 102.87 |
537534

538535
##### BERT Large
539536

540537
| Sequence Length | Batch Size | INT8 Latency (ms) | | | FP16 Latency (ms) | | |
541538
|-----------------|------------|-----------------|-----------------|---------|-----------------|-----------------|---------|
542539
| | | 95th Percentile | 99th Percentile | Average | 95th Percentile | 99th Percentile | Average |
543-
| 128 | 1 | 1.47 | 1.47 | 1.47 | 2.02 | 2.04 | 2.02 |
544-
| 128 | 2 | 1.83 | 1.84 | 1.83 | 2.86 | 2.86 | 2.84 |
545-
| 128 | 4 | 2.71 | 2.71 | 2.69 | 4.77 | 4.8 | 4.69 |
546-
| 128 | 8 | 4.33 | 4.37 | 4.29 | 8.47 | 8.53 | 8.42 |
547-
| 128 | 12 | 5.71 | 5.76 | 5.62 | 10.94 | 11.02 | 10.84 |
548-
| 128 | 16 | 7.67 | 7.76 | 7.64 | 15.08 | 15.17 | 15.06 |
549-
| 128 | 24 | 10.63 | 10.68 | 10.51 | 21.32 | 21.38 | 21.12 |
550-
| 128 | 32 | 14.19 | 14.26 | 14.06 | 29.42 | 29.45 | 29.04 |
551-
| 128 | 64 | 26.95 | 26.97 | 26.69 | 56.09 | 56.38 | 55.71 |
552-
| 128 | 128 | 52.86 | 52.98 | 52.32 | 109.89 | 110.09 | 109.01 |
553-
| 384 | 1 | 3.34 | 3.34 | 3.33 | 4.56 | 4.59 | 4.53 |
554-
| 384 | 2 | 4.24 | 4.25 | 4.21 | 6.82 | 6.86 | 6.75 |
555-
| 384 | 4 | 7.33 | 7.33 | 7.25 | 12.33 | 12.34 | 12.21 |
556-
| 384 | 8 | 12.92 | 13.0 | 12.88 | 23.39 | 23.45 | 23.17 |
557-
| 384 | 12 | 18.75 | 18.88 | 18.6 | 34.75 | 35.07 | 34.59 |
558-
| 384 | 16 | 24.32 | 24.45 | 24.13 | 45.67 | 45.79 | 45.26 |
559-
| 384 | 24 | 35.99 | 36.3 | 35.66 | 67.12 | 67.72 | 66.85 |
560-
| 384 | 32 | 47.53 | 47.56 | 47.04 | 88.88 | 89.31 | 88.39 |
561-
| 384 | 64 | 92.13 | 92.64 | 91.92 | 175.91 | 176.4 | 174.94 |
562-
| 384 | 128 | 181.87 | 182.29 | 180.87 | 346.39 | 346.88 | 345.32 |
540+
| 128 | 1 | 1.49 | 1.49 | 1.48 | 2.03 | 2.03 | 2.02 |
541+
| 128 | 2 | 1.83 | 1.84 | 1.82 | 2.79 | 2.79 | 2.76 |
542+
| 128 | 4 | 2.70 | 2.70 | 2.68 | 4.35 | 4.40 | 4.31 |
543+
| 128 | 8 | 4.50 | 4.52 | 4.47 | 8.07 | 8.17 | 8.01 |
544+
| 128 | 12 | 5.67 | 5.69 | 5.62 | 10.67 | 10.75 | 10.53 |
545+
| 128 | 16 | 8.08 | 8.13 | 7.95 | 14.86 | 14.86 | 14.72 |
546+
| 128 | 24 | 10.59 | 10.60 | 10.47 | 20.71 | 20.73 | 20.47 |
547+
| 128 | 32 | 14.16 | 14.21 | 14.03 | 28.21 | 28.37 | 27.98 |
548+
| 128 | 64 | 26.77 | 26.95 | 26.66 | 54.03 | 54.33 | 53.43 |
549+
| 128 | 128 | 52.65 | 52.78 | 52.12 | 106.15 | 106.75 | 105.37 |
550+
| 384 | 1 | 3.20 | 3.21 | 3.20 | 4.19 | 4.19 | 4.17 |
551+
| 384 | 2 | 4.26 | 4.26 | 4.22 | 6.61 | 6.63 | 6.56 |
552+
| 384 | 4 | 7.56 | 7.64 | 7.55 | 12.04 | 12.05 | 11.93 |
553+
| 384 | 8 | 13.01 | 13.07 | 12.84 | 22.81 | 22.89 | 22.56 |
554+
| 384 | 12 | 18.73 | 18.82 | 18.56 | 33.47 | 33.62 | 33.43 |
555+
| 384 | 16 | 24.41 | 24.51 | 24.16 | 44.45 | 44.47 | 44.03 |
556+
| 384 | 24 | 35.83 | 36.19 | 35.53 | 65.53 | 65.79 | 64.91 |
557+
| 384 | 32 | 47.34 | 47.52 | 46.86 | 85.92 | 86.16 | 85.15 |
558+
| 384 | 64 | 92.68 | 93.00 | 91.86 | 169.51 | 170.03 | 168.46 |
559+
| 384 | 128 | 181.91 | 182.29 | 181.02 | 334.01 | 334.51 | 332.81 |
563560

564561
##### Megatron Large with Sparsity
565562

566563
| Sequence Length | Batch Size | INT8 QAT Latency (ms) | | |
567564
|-----------------|------------|-----------------|-----------------|---------|
568565
| | | 95th Percentile | 99th Percentile | Average |
569-
| 128 | 1 | 1.42 | 1.42 | 1.42 |
566+
| 128 | 1 | 1.46 | 1.47 | 1.45 |
570567
| 128 | 2 | 1.88 | 1.88 | 1.87 |
571-
| 128 | 4 | 2.71 | 2.72 | 2.7 |
572-
| 128 | 8 | 4.16 | 4.17 | 4.16 |
573-
| 128 | 12 | 5.3 | 5.34 | 5.27 |
574-
| 128 | 16 | 7.44 | 7.5 | 7.36 |
575-
| 128 | 24 | 10.01 | 10.05 | 9.91 |
576-
| 128 | 32 | 13.14 | 13.15 | 13.1 |
577-
| 128 | 64 | 24.61 | 24.73 | 24.46 |
578-
| 128 | 128 | 46.66 | 46.83 | 46.58 |
579-
| 384 | 1 | 2.37 | 2.38 | 2.37 |
580-
| 384 | 2 | 3.87 | 3.88 | 3.86 |
581-
| 384 | 4 | 6.14 | 6.17 | 6.08 |
582-
| 384 | 8 | 11.61 | 11.64 | 11.54 |
583-
| 384 | 12 | 16.04 | 16.11 | 15.95 |
584-
| 384 | 16 | 21.24 | 21.33 | 21.1 |
585-
| 384 | 24 | 30.48 | 30.61 | 30.23 |
586-
| 384 | 32 | 40.79 | 40.97 | 40.46 |
587-
| 384 | 64 | 78.04 | 78.41 | 77.51 |
588-
| 384 | 128 | 151.33 | 151.62 | 150.76 |
568+
| 128 | 4 | 2.74 | 2.74 | 2.73 |
569+
| 128 | 8 | 4.11 | 4.12 | 4.10 |
570+
| 128 | 12 | 5.29 | 5.35 | 5.25 |
571+
| 128 | 16 | 7.52 | 7.57 | 7.50 |
572+
| 128 | 24 | 10.11 | 10.19 | 10.06 |
573+
| 128 | 32 | 12.85 | 12.90 | 12.80 |
574+
| 128 | 64 | 24.50 | 24.52 | 24.26 |
575+
| 128 | 128 | 46.24 | 46.57 | 45.92 |
576+
| 384 | 1 | 2.35 | 2.36 | 2.35 |
577+
| 384 | 2 | 3.90 | 3.91 | 3.89 |
578+
| 384 | 4 | 6.14 | 6.15 | 6.08 |
579+
| 384 | 8 | 11.74 | 11.76 | 11.64 |
580+
| 384 | 12 | 15.86 | 15.88 | 15.74 |
581+
| 384 | 16 | 21.21 | 21.27 | 21.05 |
582+
| 384 | 24 | 30.03 | 30.04 | 29.89 |
583+
| 384 | 32 | 40.20 | 40.22 | 40.05 |
584+
| 384 | 64 | 76.82 | 77.11 | 76.52 |
585+
| 384 | 128 | 149.54 | 149.80 | 148.78 |
589586

demo/Diffusion/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ transformers 4.26.1
6565
```bash
6666
python3 demo_txt2img.py --help
6767
python3 demo_img2img.py --help
68-
python3 demo_inpainting.py --help
68+
python3 demo_inpaint.py --help
6969
```
7070

7171
### HuggingFace user access token

python/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ endif()
4545

4646
# -------- PATHS --------
4747
message(STATUS "EXT_PATH: ${EXT_PATH}")
48-
message(STATUS "TENSORRT_BUILD: ${TENSORRT_BUILD}")
48+
message(STATUS "TENSORRT_LIBPATH: ${TENSORRT_LIBPATH}")
4949
message(STATUS "CMAKE_BINARY_DIR: ${CMAKE_BINARY_DIR}")
5050
message(STATUS "CUDA_ROOT: ${CUDA_ROOT}")
5151
message(STATUS "CUDA_INCLUDE_DIRS: ${CUDA_INCLUDE_DIRS}")
@@ -111,7 +111,7 @@ message(STATUS "PY_CONFIG_INCLUDE: ${PY_CONFIG_INCLUDE}")
111111
# -------- GLOBAL COMPILE OPTIONS --------
112112

113113
include_directories(${TENSORRT_ROOT}/include ${PROJECT_SOURCE_DIR}/include ${CUDA_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/docstrings ${ONNX_INC_DIR} ${PYBIND11_DIR})
114-
link_directories(${TENSORRT_BUILD})
114+
link_directories(${TENSORRT_LIBPATH})
115115

116116
if (MSVC)
117117
message(STATUS "include_dirs: ${MSVC_COMPILER_DIR}/include ${MSVC_COMPILER_DIR}/../ucrt/include ${NV_WDKSDK_INC}/um ${NV_WDKSDK_INC}/shared")

python/README.md

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,28 +19,36 @@ git clone https://github.com/pybind/pybind11.git
1919
1. Get the source code from the official [python sources](https://www.python.org/downloads/source/)
2020
2. Copy the contents of the `Include/` directory into `$EXT_PATH/pythonX.Y/include/` directory.
2121

22+
Example: Python 3.9
23+
```bash
24+
wget https://www.python.org/ftp/python/3.9.16/Python-3.9.16.tgz
25+
tar -xvf Python-3.9.16.tgz
26+
mkdir -p $EXT_PATH/python3.9
27+
cp -r Python-3.9.16/Include/ $EXT_PATH/python3.9/include
28+
```
29+
2230
#### Add PyConfig.h
2331

2432
1. Download the deb package for the desired platform from [here](https://packages.debian.org/search?searchon=contents&keywords=pyconfig.h&mode=path&suite=unstable&arch=any).
25-
Typical plaforms include `x86_64` (`amd64`), `aarch64` (`arm64`), and `ppc64le` (`ppc64el`)
33+
Typical platforms include `x86_64` (`amd64`), `aarch64` (`arm64`), and `ppc64le` (`ppc64el`).
34+
For older versions of Python, you may need to select a different suite.
2635
2. Unpack the Debian package with `ar x <libpython...>.deb`
2736
3. Unpack the contained `data.tar.xz` with `tar -xvf`
28-
4. Copy the `./usr/include/<platform>/` directory into the `$$EXT_PATH/pythonX.Y/include/` directory here.
29-
It should only contain a single file - `pyconfig.h`
37+
4. Find `pyconfig.h` in the `./usr/include/<platform>/pythonX.Y/` directory and copy it into `$EXT_PATH/pythonX.Y/include/`.
3038

3139

3240
### Build Python bindings
3341

3442
Use `build.sh` to generate the installable wheels for the intended Python version and target architecture.
3543

36-
Example: for python 3.8 `x86_64` wheel,
44+
Example: for Python 3.9 `x86_64` wheel,
3745
```bash
3846
cd $TRT_OSSPATH/python
39-
PYTHON_MAJOR_VERSION=3 PYTHON_MINOR_VERSION=8 TARGET_ARCHITECTURE=x86_64 ./build.sh
47+
TENSORRT_MODULE=tensorrt PYTHON_MAJOR_VERSION=3 PYTHON_MINOR_VERSION=9 TARGET_ARCHITECTURE=x86_64 ./build.sh
4048
```
4149

4250
### Install the python wheel
4351

4452
```bash
45-
python3 -m pip install build/dist/tensorrt-*.whl
53+
python3 -m pip install ./build/bindings_wheel/dist/tensorrt-*.whl
4654
```

python/build.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ cmake .. -DCMAKE_BUILD_TYPE=Release \
3535
-DEXT_PATH=${EXT_PATH} \
3636
-DCUDA_INCLUDE_DIRS=${CUDA_ROOT}/include \
3737
-DTENSORRT_ROOT=${ROOT_PATH} \
38-
-DTENSORRT_BUILD=${ROOT_PATH}/build/
38+
-DTENSORRT_MODULE=${TENSORRT_MODULE} \
39+
-DTENSORRT_LIBPATH=${TRT_LIBPATH}
3940
make -j12
4041

4142
# Generate wheel
@@ -52,13 +53,19 @@ expand_vars_cp () {
5253
test -f ${1} || (echo "ERROR: File: ${1} does not exist!" && exit 1); \
5354
sed -e "s|\#\#TENSORRT_VERSION\#\#|${TRT_VERSION}|g" \
5455
-e "s|\#\#TENSORRT_MAJMINPATCH\#\#|${TRT_MAJMINPATCH}|g" \
56+
-e "s|\#\#TENSORRT_PYTHON_VERSION\#\#|${TRT_MAJMINPATCH}|g" \
57+
-e "s|\#\#TENSORRT_MODULE\#\#|${TENSORRT_MODULE}|g" \
5558
${1} > ${2}
5659
}
5760

5861
pushd ${ROOT_PATH}/python/packaging
5962
for dir in $(find . -type d); do mkdir -p ${WHEEL_OUTPUT_DIR}/$dir; done
6063
for file in $(find . -type f); do expand_vars_cp $file ${WHEEL_OUTPUT_DIR}/${file}; done
6164
popd
65+
cp tensorrt/tensorrt.so bindings_wheel/tensorrt/tensorrt.so
66+
67+
pushd ${WHEEL_OUTPUT_DIR}/bindings_wheel
68+
6269
python3 setup.py -q bdist_wheel --python-tag=cp${PYTHON_MAJOR_VERSION}${PYTHON_MINOR_VERSION} --plat-name=linux_${TARGET}
6370

6471
popd

0 commit comments

Comments
 (0)