@@ -32,8 +32,17 @@ def NVGPU_Dialect : Dialect {
3232 representing PTX specific operations while using MLIR high level concepts
3333 like memref and 2-D vector.
3434 }];
35+ let useDefaultAttributePrinterParser = 1;
3536}
3637
38+ /// Token type returned by device-side async ops and consumed to synchronize on them.
39+ def NVGPU_DeviceAsyncToken :
40+ DialectType<NVGPU_Dialect,
41+ CPred<"$_self.isa<::mlir::nvgpu::DeviceAsyncTokenType>()">,
42+ "device async token type">,
43+ BuildableType<"mlir::nvgpu::DeviceAsyncTokenType::get($_builder.getContext())">;
44+
45+
3746//===----------------------------------------------------------------------===//
3847// NVGPU Op definitions
3948//===----------------------------------------------------------------------===//
@@ -73,24 +82,24 @@ def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [NoSideEffect]> {
7382 let description = [{
7483 The `nvgpu.mma.sync` op represents the distributed form of a collective
7584 matrix-multiply-and-accumulate (mma) operation that is compatible with
76- `nvvm.mma.sync`. The operands and results are fragments of the full matrix
85+ `nvvm.mma.sync`. The operands and results are fragments of the full matrix
7786 operands. The full shape of the distributed mma operation is given by the
78- `mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.
87+ `mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.
7988
8089 This operation is meant to be lowered to the `nvvm.mma.sync` instruction, and
8190 is an intermediate point between lowering from `vector.contract` to
8291 `nvvm.mma.sync`.
83-
92+
8493 This operation is meant to follow the semantic of described here:
8594 https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
86-
95+
8796 Example:
88-
97+
8998 ```mlir
9099 nvgpu.mma.sync (%a, %b, %c) :
91100 (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
92101 ```
93- }];
102+ }];
94103 let arguments = (ins AnyVector:$matrixA, AnyVector:$matrixB,
95104 AnyVector:$matrixC, I64ArrayAttr:$mmaShape);
96105
@@ -102,4 +111,110 @@ def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [NoSideEffect]> {
102111 }];
103112}
104113
114+
115+ def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy",
116+ [AttrSizedOperandSegments]> {
117+ let summary = "device-side asynchronous copy";
118+ let description = [{
119+ The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
120+ `numElements` elements from source to the destination without blocking the
121+ thread. The destination has to be in shared memory.
122+
123+ This memory access will be pending, waiting to be added to a group.
124+
125+ This op is meant to be used with `nvgpu.device_async_create_group` and
126+ `nvgpu.device_async_wait` to synchronize copies as explained in those ops'
127+ descriptions.
128+ The `bypassL1` attribute is a hint to the backend and hardware that
129+ the copy should bypass the L1 cache; this may be dropped by the backend or
130+ hardware.
131+
132+ In order to do a copy and wait for the result we need the following
133+ combination:
134+ ```mlir
135+ // copy 1.
136+ %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
137+ // copy 2.
138+ %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
139+ // group 1 contains copy 1 and copy 2.
140+ %token1 = nvgpu.device_async_create_group %cp1, %cp2
141+ // copy 3.
142+ %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
143+ // group 2 contains copy 3.
144+ %token2 = nvgpu.device_async_create_group %cp3
145+ // after the wait copy 1 and copy 2 are complete.
146+ nvgpu.device_async_wait %token1
147+ // after the wait copy 3 is complete.
148+ nvgpu.device_async_wait %token2
149+ ```
150+
151+ Example:
152+
153+ ```mlir
154+ %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
155+ memref<4x5xf32> to memref<2x7x5xf32, 3>
156+ ```
157+ }];
158+ let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
159+ let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,
160+ Variadic<Index>:$dstIndices,
161+ Arg<AnyMemRef, "", [MemRead]>:$src,
162+ Variadic<Index>:$srcIndices,
163+ IndexAttr:$numElements,
164+ OptionalAttr<UnitAttr>:$bypassL1);
165+ let assemblyFormat = [{
166+ $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $numElements
167+ attr-dict `:` type($src) `to` type($dst)
168+ }];
169+ let hasVerifier = 1;
170+ }
171+
172+ def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
173+ let summary = "device-side asynchronous create group operation";
174+ let description = [{
175+ The `nvgpu.device_async_create_group` op creates a group of memory accesses
176+ containing all the pending `nvgpu.device_async_copy` operations associated
177+ with the argument tokens. Each token can only be part of one group.
178+
179+ It returns a token that can be used to wait until the group fully completes.
180+
181+ This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
182+ as explained in that op's description.
183+
184+ Groups are executed in the order they are created.
185+
186+ Example:
187+
188+ ```mlir
189+ %0 = nvgpu.device_async_create_group
190+ ```
191+ }];
192+ let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
193+ let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
194+ let assemblyFormat = [{
195+ $inputTokens attr-dict
196+ }];
197+ }
198+
199+ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
200+ let summary = "wait for async gpu ops to complete";
201+ let description = [{
202+ The `nvgpu.device_async_wait` op will block the execution thread until the
203+ group associated with the source token is fully completed.
204+
205+ The optional `numGroups` attribute gives an upper bound on the number of
206+ groups that may still be uncompleted when the wait unblocks the thread.
207+
208+ Example:
209+ ```mlir
210+ nvgpu.device_async_wait %0
211+ ```
212+ }];
213+ let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
214+ OptionalAttr<I32Attr>:$numGroups);
215+ let assemblyFormat = [{
216+ $asyncDependencies attr-dict
217+ }];
218+ }
219+
105220#endif // NVGPU
0 commit comments