@@ -664,15 +664,15 @@ func.func @mbarrier_txcount_pred() {
664664
665665// CHECK-LABEL: func @async_tma_load
666666!tensorMap1d = !nvgpu.tensormap.descriptor <tensor = memref <128 xf32 ,3 >, swizzle =none , l2promo = none , oob = nan , interleave = none >
667- !tensorMap2d = !nvgpu.tensormap.descriptor <tensor = memref <32 x 32 x f32 ,3 >, swizzle =swizzle_32b , l2promo = none , oob = zero , interleave = none >
668- !tensorMap3d = !nvgpu.tensormap.descriptor <tensor = memref <2 x 32 x 32 x f32 ,3 >, swizzle =swizzle_64b , l2promo = l2promo_64b , oob = zero , interleave = none >
667+ !tensorMap2d = !nvgpu.tensormap.descriptor <tensor = memref <32 x 8 x f32 ,3 >, swizzle =swizzle_32b , l2promo = none , oob = zero , interleave = none >
668+ !tensorMap3d = !nvgpu.tensormap.descriptor <tensor = memref <2 x 32 x 16 x f32 ,3 >, swizzle =swizzle_64b , l2promo = l2promo_64b , oob = zero , interleave = none >
669669!tensorMap4d = !nvgpu.tensormap.descriptor <tensor = memref <2 x2 x32 x32 xf32 ,3 >, swizzle =swizzle_128b ,l2promo = l2promo_128b ,oob = zero , interleave = none >
670670!tensorMap5d = !nvgpu.tensormap.descriptor <tensor = memref <2 x2 x2 x32 x32 xf32 ,3 >, swizzle =none , l2promo = none , oob = zero , interleave = none >
671671!mbarrier = !nvgpu.mbarrier.group <memorySpace = #gpu.address_space <workgroup >>
672672func.func @async_tma_load (%tensorMap1d: !tensorMap1d , %tensorMap2d: !tensorMap2d , %tensorMap3d: !tensorMap3d , %tensorMap4d: !tensorMap4d , %tensorMap5d: !tensorMap5d ,
673673 %buffer1d: memref <128 xf32 ,3 >,
674- %buffer2d: memref <32 x 32 x f32 ,3 >,
675- %buffer3d: memref <2 x 32 x 32 x f32 ,3 >,
674+ %buffer2d: memref <32 x 8 x f32 ,3 >,
675+ %buffer3d: memref <2 x 32 x 16 x f32 ,3 >,
676676 %buffer4d: memref <2 x2 x32 x32 xf32 ,3 >,
677677 %buffer5d: memref <2 x2 x2 x32 x32 xf32 ,3 >,
678678 %mbarrier: !mbarrier ) {
@@ -682,9 +682,9 @@ func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d
682682 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}]
683683 nvgpu.tma.async.load %tensorMap1d [%crd0 ], %mbarrier [%c0 ] to %buffer1d : !tensorMap1d , !mbarrier -> memref <128 xf32 ,3 >
684684 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}]
685- nvgpu.tma.async.load %tensorMap2d [%crd0 , %crd1 ], %mbarrier [%c0 ] to %buffer2d : !tensorMap2d , !mbarrier -> memref <32 x 32 x f32 ,3 >
685+ nvgpu.tma.async.load %tensorMap2d [%crd0 , %crd1 ], %mbarrier [%c0 ] to %buffer2d : !tensorMap2d , !mbarrier -> memref <32 x 8 x f32 ,3 >
686686 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}]
687- nvgpu.tma.async.load %tensorMap3d [%crd0 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer3d : !tensorMap3d , !mbarrier -> memref <2 x 32 x 32 x f32 ,3 >
687+ nvgpu.tma.async.load %tensorMap3d [%crd0 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer3d : !tensorMap3d , !mbarrier -> memref <2 x 32 x 16 x f32 ,3 >
688688 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
689689 nvgpu.tma.async.load %tensorMap4d [%crd0 , %crd1 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer4d : !tensorMap4d , !mbarrier -> memref <2 x2 x32 x32 xf32 ,3 >
690690 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
@@ -708,8 +708,8 @@ func.func @async_tma_load_gpu_address_space(%tensorMap1d: !tensorMap1dgpuspace,
708708// CHECK-LABEL: func @async_tma_load_pred
709709func.func @async_tma_load_pred (%tensorMap1d: !tensorMap1d , %tensorMap2d: !tensorMap2d , %tensorMap3d: !tensorMap3d , %tensorMap4d: !tensorMap4d , %tensorMap5d: !tensorMap5d ,
710710 %buffer1d: memref <128 xf32 ,3 >,
711- %buffer2d: memref <32 x 32 x f32 ,3 >,
712- %buffer3d: memref <2 x 32 x 32 x f32 ,3 >,
711+ %buffer2d: memref <32 x 8 x f32 ,3 >,
712+ %buffer3d: memref <2 x 32 x 16 x f32 ,3 >,
713713 %buffer4d: memref <2 x2 x32 x32 xf32 ,3 >,
714714 %buffer5d: memref <2 x2 x2 x32 x32 xf32 ,3 >,
715715 %mbarrier: !mbarrier ,
@@ -720,9 +720,9 @@ func.func @async_tma_load_pred(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensor
720720 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}] predicate = %{{.*}}
721721 nvgpu.tma.async.load %tensorMap1d [%crd0 ], %mbarrier [%c0 ] to %buffer1d , predicate = %p : !tensorMap1d , !mbarrier -> memref <128 xf32 ,3 >
722722 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}] predicate = %{{.*}}
723- nvgpu.tma.async.load %tensorMap2d [%crd0 , %crd1 ], %mbarrier [%c0 ] to %buffer2d , predicate = %p : !tensorMap2d , !mbarrier -> memref <32 x 32 x f32 ,3 >
723+ nvgpu.tma.async.load %tensorMap2d [%crd0 , %crd1 ], %mbarrier [%c0 ] to %buffer2d , predicate = %p : !tensorMap2d , !mbarrier -> memref <32 x 8 x f32 ,3 >
724724 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}}
725- nvgpu.tma.async.load %tensorMap3d [%crd0 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer3d , predicate = %p : !tensorMap3d , !mbarrier -> memref <2 x 32 x 32 x f32 ,3 >
725+ nvgpu.tma.async.load %tensorMap3d [%crd0 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer3d , predicate = %p : !tensorMap3d , !mbarrier -> memref <2 x 32 x 16 x f32 ,3 >
726726 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}}
727727 nvgpu.tma.async.load %tensorMap4d [%crd0 , %crd1 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer4d , predicate = %p : !tensorMap4d , !mbarrier -> memref <2 x2 x32 x32 xf32 ,3 >
728728 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}] predicate = %{{.*}}
@@ -734,7 +734,7 @@ func.func @async_tma_load_multicast(
734734 %tensorMap1d: !tensorMap1d , %tensorMap2d: !tensorMap2d ,
735735 %tensorMap3d: !tensorMap3d , %tensorMap4d: !tensorMap4d ,
736736 %tensorMap5d: !tensorMap5d , %buffer1d: memref <128 xf32 ,3 >,
737- %buffer2d: memref <32 x 32 x f32 ,3 >, %buffer3d: memref <2 x 32 x 32 x f32 ,3 >,
737+ %buffer2d: memref <32 x 8 x f32 ,3 >, %buffer3d: memref <2 x 32 x 16 x f32 ,3 >,
738738 %buffer4d: memref <2 x2 x32 x32 xf32 ,3 >, %buffer5d: memref <2 x2 x2 x32 x32 xf32 ,3 >,
739739 %mbarrier: !mbarrier ,
740740 %multicastMask: i16 ) {
@@ -744,9 +744,9 @@ func.func @async_tma_load_multicast(
744744 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}]
745745 nvgpu.tma.async.load %tensorMap1d [%crd0 ], %mbarrier [%c0 ] to %buffer1d multicast_mask = %multicastMask : !tensorMap1d , !mbarrier -> memref <128 xf32 ,3 >
746746 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}]
747- nvgpu.tma.async.load %tensorMap2d [%crd0 , %crd1 ], %mbarrier [%c0 ] to %buffer2d multicast_mask = %multicastMask : !tensorMap2d , !mbarrier -> memref <32 x 32 x f32 ,3 >
747+ nvgpu.tma.async.load %tensorMap2d [%crd0 , %crd1 ], %mbarrier [%c0 ] to %buffer2d multicast_mask = %multicastMask : !tensorMap2d , !mbarrier -> memref <32 x 8 x f32 ,3 >
748748 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}]
749- nvgpu.tma.async.load %tensorMap3d [%crd0 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer3d multicast_mask = %multicastMask : !tensorMap3d , !mbarrier -> memref <2 x 32 x 32 x f32 ,3 >
749+ nvgpu.tma.async.load %tensorMap3d [%crd0 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer3d multicast_mask = %multicastMask : !tensorMap3d , !mbarrier -> memref <2 x 32 x 16 x f32 ,3 >
750750 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
751751 nvgpu.tma.async.load %tensorMap4d [%crd0 , %crd1 , %crd1 , %crd0 ], %mbarrier [%c0 ] to %buffer4d multicast_mask = %multicastMask : !tensorMap4d , !mbarrier -> memref <2 x2 x32 x32 xf32 ,3 >
752752 // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}} box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
@@ -756,8 +756,8 @@ func.func @async_tma_load_multicast(
756756
757757func.func @async_tma_store (%tensorMap1d: !tensorMap1d , %tensorMap2d: !tensorMap2d , %tensorMap3d: !tensorMap3d , %tensorMap4d: !tensorMap4d , %tensorMap5d: !tensorMap5d ,
758758 %buffer1d: memref <128 xf32 ,3 >,
759- %buffer2d: memref <32 x 32 x f32 ,3 >,
760- %buffer3d: memref <2 x 32 x 32 x f32 ,3 >,
759+ %buffer2d: memref <32 x 8 x f32 ,3 >,
760+ %buffer3d: memref <2 x 32 x 16 x f32 ,3 >,
761761 %buffer4d: memref <2 x2 x32 x32 xf32 ,3 >,
762762 %buffer5d: memref <2 x2 x2 x32 x32 xf32 ,3 >) {
763763 %c0 = arith.constant 0 : index
@@ -766,9 +766,9 @@ func.func @async_tma_store(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2
766766 // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}]
767767 nvgpu.tma.async.store %buffer1d to %tensorMap1d [%crd0 ] : memref <128 xf32 ,3 > -> !tensorMap1d
768768 // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}]
769- nvgpu.tma.async.store %buffer2d to %tensorMap2d [%crd0 , %crd1 ] : memref <32 x 32 x f32 ,3 > -> !tensorMap2d
769+ nvgpu.tma.async.store %buffer2d to %tensorMap2d [%crd0 , %crd1 ] : memref <32 x 8 x f32 ,3 > -> !tensorMap2d
770770 // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}]
771- nvgpu.tma.async.store %buffer3d to %tensorMap3d [%crd0 , %crd1 , %crd0 ] : memref <2 x 32 x 32 x f32 ,3 > -> !tensorMap3d
771+ nvgpu.tma.async.store %buffer3d to %tensorMap3d [%crd0 , %crd1 , %crd0 ] : memref <2 x 32 x 16 x f32 ,3 > -> !tensorMap3d
772772 // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
773773 nvgpu.tma.async.store %buffer4d to %tensorMap4d [%crd0 , %crd1 , %crd1 , %crd0 ] : memref <2 x2 x32 x32 xf32 ,3 > -> !tensorMap4d
774774 // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
@@ -779,8 +779,8 @@ func.func @async_tma_store(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2
779779
780780func.func @async_tma_store_predicate (%tensorMap1d: !tensorMap1d , %tensorMap2d: !tensorMap2d , %tensorMap3d: !tensorMap3d , %tensorMap4d: !tensorMap4d , %tensorMap5d: !tensorMap5d ,
781781 %buffer1d: memref <128 xf32 ,3 >,
782- %buffer2d: memref <32 x 32 x f32 ,3 >,
783- %buffer3d: memref <2 x 32 x 32 x f32 ,3 >,
782+ %buffer2d: memref <32 x 8 x f32 ,3 >,
783+ %buffer3d: memref <2 x 32 x 16 x f32 ,3 >,
784784 %buffer4d: memref <2 x2 x32 x32 xf32 ,3 >,
785785 %buffer5d: memref <2 x2 x2 x32 x32 xf32 ,3 >,
786786 %p: i1 ) {
@@ -790,9 +790,9 @@ func.func @async_tma_store_predicate(%tensorMap1d: !tensorMap1d, %tensorMap2d: !
790790 // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}], predicate = %{{.*}}
791791 nvgpu.tma.async.store %buffer1d to %tensorMap1d [%crd0 ], predicate = %p : memref <128 xf32 ,3 > -> !tensorMap1d
792792 // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}], predicate = %{{.*}}
793- nvgpu.tma.async.store %buffer2d to %tensorMap2d [%crd0 , %crd1 ], predicate = %p : memref <32 x 32 x f32 ,3 > -> !tensorMap2d
793+ nvgpu.tma.async.store %buffer2d to %tensorMap2d [%crd0 , %crd1 ], predicate = %p : memref <32 x 8 x f32 ,3 > -> !tensorMap2d
794794 // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}}
795- nvgpu.tma.async.store %buffer3d to %tensorMap3d [%crd0 , %crd1 , %crd0 ], predicate = %p : memref <2 x 32 x 32 x f32 ,3 > -> !tensorMap3d
795+ nvgpu.tma.async.store %buffer3d to %tensorMap3d [%crd0 , %crd1 , %crd0 ], predicate = %p : memref <2 x 32 x 16 x f32 ,3 > -> !tensorMap3d
796796 // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}}
797797 nvgpu.tma.async.store %buffer4d to %tensorMap4d [%crd0 , %crd1 , %crd1 , %crd0 ], predicate = %p : memref <2 x2 x32 x32 xf32 ,3 > -> !tensorMap4d
798798 // CHECK: nvvm.cp.async.bulk.tensor.global.shared.cta %{{.*}} %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}], predicate = %{{.*}}
0 commit comments