draperlaboratory
diff --git a/‎mlir/include/mlir/Dialect/GPU/GPUBase.td‎
Lines changed: 0 additions & 7 deletions b/‎mlir/include/mlir/Dialect/GPU/GPUBase.td‎
Lines changed: 0 additions & 7 deletions
diff --git a/‎mlir/include/mlir/Dialect/GPU/GPUDialect.h‎
Lines changed: 0 additions & 8 deletions b/‎mlir/include/mlir/Dialect/GPU/GPUDialect.h‎
Lines changed: 0 additions & 8 deletions
diff --git a/‎mlir/include/mlir/Dialect/GPU/GPUOps.td‎
Lines changed: 0 additions & 101 deletions b/‎mlir/include/mlir/Dialect/GPU/GPUOps.td‎
Lines changed: 0 additions & 101 deletions
diff --git a/‎mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h‎
Lines changed: 9 additions & 0 deletions b/‎mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎mlir/include/mlir/Dialect/NVGPU/NVGPU.td‎
Lines changed: 121 additions & 6 deletions b/‎mlir/include/mlir/Dialect/NVGPU/NVGPU.td‎
Lines changed: 121 additions & 6 deletions
diff --git a/‎mlir/include/mlir/Dialect/NVGPU/NVGPUDialect.h‎
Lines changed: 14 additions & 0 deletions b/‎mlir/include/mlir/Dialect/NVGPU/NVGPUDialect.h‎
Lines changed: 14 additions & 0 deletions
@@ -60,13 +60,6 @@ def GPU_AsyncToken : DialectType<
   GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::AsyncTokenType>()">, "async token type">,
              BuildableType<"mlir::gpu::AsyncTokenType::get($_builder.getContext())">;
 
-/// Device-side synchronization token.
-def GPU_DeviceAsyncToken : DialectType<
-  GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::DeviceAsyncTokenType>()">,
-   "device async token type">,
-   BuildableType<
-     "mlir::gpu::DeviceAsyncTokenType::get($_builder.getContext())">;
-
 // Predicat to check if type is gpu::MMAMatrixType.
 def IsMMAMatrixTypePred : CPred<"$_self.isa<::mlir::gpu::MMAMatrixType>()">;
 
 
@@ -43,14 +43,6 @@ class AsyncTokenType
   using Base::Base;
 };
 
-/// Device-side token storage type. There is only one type of device-side token.
-class DeviceAsyncTokenType
-    : public Type::TypeBase<DeviceAsyncTokenType, Type, TypeStorage> {
-public:
-  // Used for generic hooks in TypeBase.
-  using Base::Base;
-};
-
 /// MMAMatrixType storage and uniquing. Array is uniqued based on its shape
 /// and type.
 struct MMAMatrixStorageType : public TypeStorage {
 
@@ -1280,105 +1280,4 @@ def GPU_SubgroupMmaElementwiseOp : GPU_Op<"subgroup_mma_elementwise",
   }];
 }
 
-def GPU_DeviceAsyncCopyOp : GPU_Op<"device_async_copy",
-  [AttrSizedOperandSegments]> {
-  let summary = "device-side asynchronous copy";
-  let description = [{
-    The `gpu.device_async_copy` op initiates an asynchronous copy operation of
-    `$size` elements from source to the destination without blocking the thread.
-    The destination has to be in shared memory.
-
-    This is memory access will be pending to be added to a group.
-
-    This op is meant to be used with `gpu.device_async_create_group` and
-    `gpu.device_async_wait` to synchronize copies as explained in those ops
-    descriptions.
-
-    In order to do a copy and wait for the result we need the following
-    combination:
-    ```
-    // copy 1.
-    %cp1 = gpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
-    // copy 2.
-    %cp2 = gpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
-    // group 1 contains copy 1 and copy 2.
-    %token1 = gpu.device_async_create_group %cp1, %cp2
-    // copy 3.
-    %cp3 = gpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
-    // group 2 contains copy 3.
-    %token2 = gpu.device_async_create_group %cp3
-    // after the wait copy 1 and copy 2 are complete.
-    gpu.device_async_wait %token1
-    // after the wait copy 3 is complete.
-    gpu.device_async_wait %token2
-    ```
-
-    Example:
-
-    ```mlir
-    %0 = gpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
-      memref<4x5xf32> to memref<2x7x5xf32, 3>
-    ```
-  }];
-  let results = (outs GPU_DeviceAsyncToken:$asyncToken);
-  let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,
-                       Variadic<Index>:$dstIndices,
-                       Arg<AnyMemRef, "", [MemRead]>:$src,
-                       Variadic<Index>:$srcIndices,
-                       IndexAttr:$numElements);
-  let assemblyFormat = [{
-    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $numElements
-      attr-dict `:` type($src) `to` type($dst)
-  }];
-  let hasVerifier = 1;
-}
-
-def GPU_DeviceAsyncCreateGroupOp : GPU_Op<"device_async_create_group", []> {
-  let summary = "device side asynchronous create group operation";
-  let description = [{
-  The `gpu.device_async_create_group` op creates a group of memory accesses
-  containing all the pending `device_async_copy` operations associated with
-  argument tokens. Each token can only be part of one group.
-
-  It returns a token that can be use to wait until the group fully completes.
-
-  This is meant to be used with `gpu.device_async_wait` to synchronize copies
-  as explained in those ops descriptions.
-
-  Groups are executed in the order they are created.
-
-  Example:
-
-  ```mlir
-  %0 = gpu.device_async_create_group
-  ```
-  }];
-  let results = (outs GPU_DeviceAsyncToken:$asyncToken);
-  let arguments = (ins Variadic<GPU_DeviceAsyncToken>:$inputTokens);
-  let assemblyFormat = [{
-    $inputTokens attr-dict
-  }];
-}
-
-def GPU_DeviceAsyncWaitOp : GPU_Op<"device_async_wait", []> {
-  let summary = "Wait for async gpu ops to complete.";
-  let description = [{
-  The `gpu.device_async_wait` op will block the execution thread until the group
-  associated with the source token is fully completed.
-
-    The optional `$numGroup` attribute gives a lower bound of the number of
-    groups uncompleted when the wait can unblock the thread.
-  Example:
-
-  ```mlir
-  gpu.device_async_wait %0
-  ```
-  }];
-  let arguments = (ins GPU_DeviceAsyncToken:$asyncDependencies,
-                       OptionalAttr<I32Attr>:$numGroups);
-  let assemblyFormat = [{
-    $asyncDependencies attr-dict
-  }];
-}
-
 #endif // GPU_OPS
@@ -25,6 +25,15 @@
 
 namespace mlir {
 namespace NVVM {
+
+/// Numeric identifiers of the NVVM memory spaces.
+enum NVVMMemorySpace {
+  /// Identifier of the global memory space.
+  kGlobalMemorySpace = 1,
+  /// Identifier of the shared memory space.
+  kSharedMemorySpace = 3
+};
+
 /// Return the element type and number of elements associated with a wmma matrix
 /// of given chracteristics. This matches the logic in IntrinsicsNVVM.td
 /// WMMA_REGS structure.
 
@@ -32,8 +32,17 @@ def NVGPU_Dialect : Dialect {
     representing PTX specific operations while using MLIR high level concepts
     like memref and 2-D vector.
   }];
+  let useDefaultAttributePrinterParser = 1;
 }
 
+/// Device-side synchronization token (`::mlir::nvgpu::DeviceAsyncTokenType`).
+def NVGPU_DeviceAsyncToken : DialectType<
+  NVGPU_Dialect, CPred<"$_self.isa<::mlir::nvgpu::DeviceAsyncTokenType>()">,
+   "device async token type">,
+   BuildableType<
+     "mlir::nvgpu::DeviceAsyncTokenType::get($_builder.getContext())">;
+
+
 //===----------------------------------------------------------------------===//
 // NVGPU Op definitions
 //===----------------------------------------------------------------------===//
@@ -73,24 +82,24 @@ def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [NoSideEffect]> {
   let description = [{
   The `nvgpu.mma.sync` op represents the distributed form of a collective
   matrix-multiply-and-accumulate (mma) operation that is compatible with
-  `nvvm.mma.sync`. The operands and results are fragments of the full matrix 
+  `nvvm.mma.sync`. The operands and results are fragments of the full matrix
   operands. The full shape of the distributed mma operation is given by the
-  `mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.  
+  `mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.
 
   This operation is meant to be lowered to the `nvvm.mma.sync` instruction, and
   is an intermediate point between lowering from `vector.contract` to
   `nvvm.mma.sync`.
-  
+
   This operation is meant to follow the semantic of described here:
     https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
-  
+
   Example:
-  
+
   ```mlir
   nvgpu.mma.sync (%a, %b, %c) :
     (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
   ```
-  }];   
+  }];
   let arguments = (ins AnyVector:$matrixA, AnyVector:$matrixB,
                        AnyVector:$matrixC, I64ArrayAttr:$mmaShape);
 
@@ -102,4 +111,110 @@ def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [NoSideEffect]> {
   }];
 }
 
+
+def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy",
+  [AttrSizedOperandSegments]> {
+  let summary = "device-side asynchronous copy";
+  let description = [{
+    The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
+    `$numElements` elements from source to the destination without blocking the
+    thread. The destination has to be in shared memory.
+
+    This memory access will be pending to be added to a group.
+
+    This op is meant to be used with `nvgpu.device_async_create_group` and
+    `nvgpu.device_async_wait` to synchronize copies as explained in those ops
+    descriptions.
+    The `bypassL1` attribute is a hint to the backend and hardware that
+    the copy should bypass the L1 cache; this may be dropped by the backend or
+    hardware.
+
+    In order to do a copy and wait for the result we need the following
+    combination:
+    ```mlir
+    // copy 1.
+    %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+    // copy 2.
+    %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+    // group 1 contains copy 1 and copy 2.
+    %token1 = nvgpu.device_async_create_group %cp1, %cp2
+    // copy 3.
+    %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+    // group 2 contains copy 3.
+    %token2 = nvgpu.device_async_create_group %cp3
+    // after the wait copy 1 and copy 2 are complete.
+    nvgpu.device_async_wait %token1
+    // after the wait copy 3 is complete.
+    nvgpu.device_async_wait %token2
+    ```
+
+    Example:
+
+    ```mlir
+    %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
+      memref<4x5xf32> to memref<2x7x5xf32, 3>
+    ```
+  }];
+  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
+  let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,
+                       Variadic<Index>:$dstIndices,
+                       Arg<AnyMemRef, "", [MemRead]>:$src,
+                       Variadic<Index>:$srcIndices,
+                       IndexAttr:$numElements,
+                       OptionalAttr<UnitAttr>:$bypassL1);
+  let assemblyFormat = [{
+    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $numElements
+      attr-dict `:` type($src) `to` type($dst)
+  }];
+  let hasVerifier = 1;
+}
+
+def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
+  let summary = "device side asynchronous create group operation";
+  let description = [{
+  The `nvgpu.device_async_create_group` op creates a group of memory accesses
+  containing all the pending `nvgpu.device_async_copy` operations associated
+  with argument tokens. Each token can only be part of one group.
+
+  It returns a token that can be used to wait until the group fully completes.
+
+  This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
+  as explained in those ops descriptions.
+
+  Groups are executed in the order they are created.
+
+  Example:
+
+  ```mlir
+  %0 = nvgpu.device_async_create_group
+  ```
+  }];
+  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
+  let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
+  let assemblyFormat = [{
+    $inputTokens attr-dict
+  }];
+}
+
+def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
+  let summary = "Wait for async gpu ops to complete.";
+  let description = [{
+  The `nvgpu.device_async_wait` op will block the execution thread until the
+  group associated with the source token is fully completed.
+
+  The optional `$numGroups` attribute gives a lower bound of the number of
+  groups uncompleted when the wait can unblock the thread.
+  Example:
+
+  ```mlir
+  nvgpu.device_async_wait %0
+  ```
+  }];
+  let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
+                       OptionalAttr<I32Attr>:$numGroups);
+  let assemblyFormat = [{
+    $asyncDependencies attr-dict
+  }];
+}
+
 #endif // NVGPU
@@ -18,6 +18,20 @@
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
+namespace mlir {
+namespace nvgpu {
+
+/// Device-side async token type. It is built on the default `TypeStorage`.
+class DeviceAsyncTokenType
+    : public Type::TypeBase<DeviceAsyncTokenType, Type, TypeStorage> {
+public:
+  // Inherit the constructors of TypeBase (used by its generic hooks).
+  using Base::Base;
+};
+
+} // namespace nvgpu
+} // namespace mlir
+
 #include "mlir/Dialect/NVGPU/NVGPUDialect.h.inc"
 
 #define GET_OP_CLASSES