
Commit 182fb7f

Merge commit 'b155d8a8d47f391f43c0ad93d65104e3dbfa6e69'
2 parents: 0e47c27 + b155d8a

24 files changed: +876 -303 lines

.github/workflows/integration-tests.yml

Lines changed: 1 addition & 1 deletion

@@ -427,7 +427,7 @@ jobs:
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
           pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
           cd python/test/unit
-          pytest --capture=tee-sys -rfs -n 16 language runtime \
+          pytest --capture=tee-sys -rfs -n 12 language runtime \
             --ignore=language/test_line_info.py \
             --ignore=test_debug.py
           # TODO: uncomment

.github/workflows/integration-tests.yml.in

Lines changed: 1 addition & 1 deletion

@@ -414,7 +414,7 @@ jobs:
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
           pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
           cd python/test/unit
-          pytest --capture=tee-sys -rfs -n 16 language runtime \
+          pytest --capture=tee-sys -rfs -n 12 language runtime \
             --ignore=language/test_line_info.py \
             --ignore=test_debug.py
           # TODO: uncomment

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 20 additions & 7 deletions

@@ -273,12 +273,25 @@ struct SharedMemoryObject {
                      ArrayRef<Value> offsets)
       : base(base), baseElemType(baseElemType),
         strides(strides.begin(), strides.end()),
-        offsets(offsets.begin(), offsets.end()) {}
+        offsets(offsets.begin(), offsets.end()) {
+    assert(strides.size() == offsets.size());
+  }
 
   SharedMemoryObject(Value base, Type baseElemType, ArrayRef<int64_t> shape,
-                     ArrayRef<unsigned> order, Location loc,
+                     triton::gpu::SharedEncodingAttr layout, Location loc,
                      RewriterBase &rewriter)
       : base(base), baseElemType(baseElemType) {
+    SmallVector<unsigned> order(shape.size());
+    // Default minor-to-major order
+    std::iota(order.rbegin(), order.rend(), 0);
+    if (layout) {
+      auto layoutOrder = convertType<int>(layout.getOrder());
+      int rankDiff = layoutOrder.size() - shape.size();
+      auto minRank = std::min(shape.size(), layoutOrder.size());
+      for (size_t i = 0; i < minRank; ++i)
+        order[i] = layoutOrder[i] - rankDiff;
+    }
+    assert(isPermutationOfIota(order) && "Invalid order");
     strides = getStridesFromShapeAndOrder(shape, order, loc, rewriter);
     offsets.append(order.size(), i32_val(0));
   }

@@ -304,14 +317,14 @@ struct SharedMemoryObject {
     return types;
   }
 
-  Value getCSwizzleOffset(int order) const {
-    assert(order >= 0 && order < strides.size());
-    return offsets[order];
+  Value getCSwizzleOffset(int dim) const {
+    assert(dim >= 0 && dim < strides.size());
+    return offsets[dim];
   }
 
-  Value getBaseBeforeSlice(int order, Location loc,
+  Value getBaseBeforeSlice(int dim, Location loc,
                            RewriterBase &rewriter) const {
-    Value cSwizzleOffset = getCSwizzleOffset(order);
+    Value cSwizzleOffset = getCSwizzleOffset(dim);
     Value offset = sub(i32_val(0), cSwizzleOffset);
     Type type = base.getType();
     return gep(type, baseElemType, base, offset);
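
Note on the new constructor: when no shared encoding is supplied it falls back to minor-to-major order, and otherwise it overlays the layout's order shifted by the rank difference between layout and shape. A minimal standalone sketch of that derivation on plain std types (deriveOrder is a hypothetical name, and the null-layout case is modeled as an empty order; the real code operates on MLIR attributes):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <numeric>
#include <vector>

// Hypothetical standalone analogue of the constructor's order derivation.
std::vector<unsigned> deriveOrder(std::size_t shapeRank,
                                  const std::vector<int> &layoutOrder) {
  std::vector<unsigned> order(shapeRank);
  // Default minor-to-major order: [rank-1, ..., 1, 0].
  std::iota(order.rbegin(), order.rend(), 0);
  if (!layoutOrder.empty()) { // mirrors `if (layout)`
    int rankDiff =
        static_cast<int>(layoutOrder.size()) - static_cast<int>(shapeRank);
    std::size_t minRank = std::min(shapeRank, layoutOrder.size());
    // Overlay the layout's order, shifted by the rank difference.
    for (std::size_t i = 0; i < minRank; ++i)
      order[i] = static_cast<unsigned>(layoutOrder[i] - rankDiff);
  }
  return order;
}

int main() {
  // No layout: pure minor-to-major default.
  assert((deriveOrder(3, {}) == std::vector<unsigned>{2, 1, 0}));
  // Matching ranks: the layout order is taken verbatim (rankDiff == 0).
  assert((deriveOrder(2, {0, 1}) == std::vector<unsigned>{0, 1}));
  // Layout rank exceeds shape rank: indices shift down by rankDiff == 1.
  assert((deriveOrder(2, {2, 1, 0}) == std::vector<unsigned>{1, 0}));
  return 0;
}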

include/triton/Dialect/Triton/IR/Utility.h

Lines changed: 1 addition & 1 deletion

@@ -148,7 +148,7 @@ template <typename T> bool isPermutationOfIota(ArrayRef<T> vals) {
   return isIota(sorted);
 }
 
-template <typename VecT> bool IsPermutationOfIota(const VecT &vec) {
+template <typename VecT> bool isPermutationOfIota(const VecT &vec) {
   return isPermutationOfIota(ArrayRef(vec));
 }
 
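
Presumably the reason for the rename: the new SharedMemoryObject constructor above asserts isPermutationOfIota(order) on a SmallVector, which can only resolve to this VecT overload (template deduction does not convert SmallVector to ArrayRef), so the capitalized IsPermutationOfIota spelling would not have compiled. A standalone sketch of the contract, with semantics read off the ArrayRef overload:

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

// True iff vals contains every value in {0, 1, ..., n-1} exactly once,
// in any order -- i.e. it is a valid dimension permutation.
bool isPermutationOfIota(std::vector<int> vals) {
  std::vector<int> expected(vals.size());
  std::iota(expected.begin(), expected.end(), 0);
  std::sort(vals.begin(), vals.end());
  return vals == expected;
}

int main() {
  assert(isPermutationOfIota({2, 0, 1}));  // a valid dimension order
  assert(!isPermutationOfIota({0, 0, 1})); // duplicate entry
  assert(!isPermutationOfIota({1, 2, 3})); // missing 0
  return 0;
}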

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 1 addition & 3 deletions

@@ -18,7 +18,6 @@ namespace {
 
 using ::mlir::LLVM::getMultiDimOffset;
 using ::mlir::LLVM::getSharedMemoryObjectFromStruct;
-using ::mlir::LLVM::getStridesFromShapeAndOrder;
 using ::mlir::LLVM::getWrappedMultiDimOffset;
 using ::mlir::LLVM::linearize;
 
@@ -380,8 +379,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
       return !useLegacyMMAConversion;
     }
     if (auto dotOperand = dyn_cast<DotOperandEncodingAttr>(layout)) {
-      if (isa<NvidiaMmaEncodingAttr, AMDMfmaEncodingAttr>(
-              dotOperand.getParent())) {
+      if (isa<MmaEncodingTrait>(dotOperand.getParent())) {
         return !useLegacyMMAConversion;
       }
       return false;
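
Folding the concrete-type list into one interface check also widens it: AMDWmmaEncodingAttr presumably implements MmaEncodingTrait as well, so WMMA parents now take this path too, consistent with the WMMA enablement elsewhere in this commit. A toy analogue of the pattern (plain C++ classes, not the MLIR attribute/interface machinery):

#include <cassert>

// Toy stand-ins for the encoding attributes; in Triton these are MLIR
// attributes and MmaEncodingTrait is an attribute interface.
struct Encoding { virtual ~Encoding() = default; };
struct MmaEncodingTrait {};
struct NvidiaMmaEncoding : Encoding, MmaEncodingTrait {};
struct AMDMfmaEncoding : Encoding, MmaEncodingTrait {};
struct AMDWmmaEncoding : Encoding, MmaEncodingTrait {};
struct BlockedEncoding : Encoding {};

// One interface check replaces the explicit list of concrete encodings.
bool isMmaParent(const Encoding &e) {
  return dynamic_cast<const MmaEncodingTrait *>(&e) != nullptr;
}

int main() {
  assert(isMmaParent(NvidiaMmaEncoding{}));
  assert(isMmaParent(AMDWmmaEncoding{})); // newly covered vs. the old list
  assert(!isMmaParent(BlockedEncoding{}));
  return 0;
}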

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp

Lines changed: 0 additions & 1 deletion

@@ -4,7 +4,6 @@
 using ValueTable = std::map<std::pair<int, int>, Value>;
 using ::mlir::LLVM::delinearize;
 using ::mlir::LLVM::getSharedMemoryObjectFromStruct;
-using ::mlir::LLVM::getStridesFromShapeAndOrder;
 using ::mlir::LLVM::linearize;
 using ::mlir::triton::gpu::DotOperandEncodingAttr;
 using ::mlir::triton::gpu::getContigPerThread;

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 2 additions & 14 deletions

@@ -78,23 +78,11 @@ struct LocalAllocOpConversion
     auto typeConverter = getTypeConverter();
     auto sharedLayout =
         cast<triton::gpu::SharedEncodingAttr>(resultTy.getEncoding());
-    auto order = sharedLayout.getOrder();
-    // Workaround for 3D tensors
-    // TODO: we need to modify the pipeline pass to give a proper shared
-    // encoding to 3D tensors
-    SmallVector<unsigned> newOrder;
-    if (resultTy.getShape().size() != order.size()) {
-      for (auto i = 0; i < order.size(); ++i)
-        newOrder.push_back(order[i] + 1);
-      newOrder.push_back(0);
-    } else {
-      newOrder = SmallVector<unsigned>(order.begin(), order.end());
-    }
 
     auto llvmElemTy = typeConverter->convertType(resultTy.getElementType());
     auto shapePerCTA = getShapePerCTA(sharedLayout, resultTy.getShape());
     auto smemObj = SharedMemoryObject(smemBase, llvmElemTy, shapePerCTA,
-                                      newOrder, loc, rewriter);
+                                      sharedLayout, loc, rewriter);
     // If there is an initial tensor, store it into the shared memory.
     if (op.getSrc()) {
       lowerDistributedToShared(loc, op.getSrc(), op.getResult(),

@@ -159,7 +147,7 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
           srcTy.getShape()[1] >= 4 * kWidth & dstTy.getRank() <= 2;
       return !canUseLdmatrix;
     }
-    if (isa<AMDMfmaEncodingAttr>(dot.getParent()))
+    if (isa<AMDMfmaEncodingAttr, AMDWmmaEncodingAttr>(dot.getParent()))
       return true;
   }
   return false;
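
The deleted 3-D workaround is not lost; it is subsumed by the rank-difference logic in the new SharedMemoryObject constructor (see the Utility.h hunk above). A quick sanity check of that equivalence, with the deleted branch transcribed into a standalone helper (hypothetical name oldWorkaround3D):

#include <cassert>
#include <vector>

// The deleted workaround, same logic: shift each 2-D order index up by one
// and append dim 0 as the new slowest-varying dimension.
std::vector<unsigned> oldWorkaround3D(const std::vector<unsigned> &order2D) {
  std::vector<unsigned> newOrder;
  for (unsigned o : order2D)
    newOrder.push_back(o + 1);
  newOrder.push_back(0);
  return newOrder;
}

int main() {
  // For a 3-D shape with a 2-D layout order, the new constructor computes
  // rankDiff == 2 - 3 == -1, so order[i] = layoutOrder[i] + 1 for i < 2,
  // and the minor-to-major default leaves order[2] == 0 -- the same result:
  assert((oldWorkaround3D({1, 0}) == std::vector<unsigned>{2, 1, 0}));
  assert((oldWorkaround3D({0, 1}) == std::vector<unsigned>{1, 2, 0}));
  return 0;
}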

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 5 additions & 2 deletions

@@ -189,9 +189,9 @@ Value getSmemVecAddr(RankedTensorType registerTy,
       dyn_cast<triton::gpu::SharedEncodingAttr>(sharedTy.getEncoding());
 
   auto smemBase = smemObj.getBase();
-  auto sharedOrder = triton::gpu::getOrder(sharedTy.getEncoding());
   auto smemOffsets = smemObj.getOffsets();
   auto smemStrides = smemObj.getStrides();
+  auto smemOrder = sharedEnc.getOrder();
   Value smemOffset;
   // When loading or storing to shared memory, we consider two cases for
   // performance reasons:

@@ -239,9 +239,11 @@ Value getSmemVecAddr(RankedTensorType registerTy,
     // Reorder strides according to `order`. This way they match the
     // multi-dimensional offsets in regToSharedLayout.
     smemOffset = dot(rewriter, loc, smemOffsets,
-                     applyPermutation(smemStrides, sharedOrder));
+                     applyPermutation(smemStrides, smemOrder));
   } else { // Case 2 -> rank-reduced swizzling
     assert(rank >= 2 && "Swizzling only applies to tensors with rank >= 2");
+    assert(!sharedEnc.getHasLeadingOffset() &&
+           "Leading offsets are not supported for sliced tensors");
     // We define both tensor offsets and shared memory offsets:
     //
     // - Tensor offsets: Relative offsets within a given tensor.

@@ -572,6 +574,7 @@ SmallVector<Value> getStridesFromShapeAndOrder(ArrayRef<int64_t> shape,
                                                ArrayRef<unsigned> order,
                                                Location loc,
                                                RewriterBase &rewriter) {
+  assert(order.size() == shape.size() && "shape and order must have same size");
   auto rank = shape.size();
   SmallVector<Value> strides(rank);
   int64_t stride = 1;
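
The new assert guards getStridesFromShapeAndOrder, which writes strides[order[i]]; a mismatched order would index out of bounds. A scalar sketch of the computation it performs with IR values (plain int64 arithmetic, hypothetical standalone helper; order[0] is the fastest-varying dimension, following Triton's convention):

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t>
stridesFromShapeAndOrder(const std::vector<int64_t> &shape,
                         const std::vector<unsigned> &order) {
  // The assert added in the diff: a mismatched order would index OOB below.
  assert(order.size() == shape.size() && "shape and order must have same size");
  std::vector<int64_t> strides(shape.size());
  int64_t stride = 1;
  // Walk dimensions from fastest to slowest, accumulating the running product.
  for (unsigned dim : order) {
    strides[dim] = stride;
    stride *= shape[dim];
  }
  return strides;
}

int main() {
  // 16x64 tile, dim 1 fastest (row-major): strides {64, 1}.
  assert((stridesFromShapeAndOrder({16, 64}, {1, 0}) ==
          std::vector<int64_t>{64, 1}));
  // Same tile, dim 0 fastest (column-major): strides {1, 16}.
  assert((stridesFromShapeAndOrder({16, 64}, {0, 1}) ==
          std::vector<int64_t>{1, 16}));
  return 0;
}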

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 3 additions & 5 deletions

@@ -1152,11 +1152,9 @@ SmallVector<unsigned> DotOperandEncodingAttr::getWarpsPerCTA() const {
 }
 SmallVector<unsigned> DotOperandEncodingAttr::getWarpOrder() const {
   // FIXME(Lezcano): Preexisting. Do we want to have this path at all?
-  if (mlir::isa<AMDMfmaEncodingAttr>(getParent())) {
+  if (mlir::isa<AMDMfmaEncodingAttr, AMDWmmaEncodingAttr>(getParent())) {
     return ::getWarpOrder(getParent());
   }
-  // It's quite weird to talk about warp order when that the warps
-  // are broadcasted along the K dimension
   llvm::report_fatal_error("DotOperandEncoding::getWarpOrder not implemented");
   return {};
 }

@@ -1201,9 +1199,9 @@ LogicalResult DotOperandEncodingAttr::verify(
 
   if (auto parentAttr = mlir::dyn_cast<AMDWmmaEncodingAttr>(parent)) {
     if (kWidth != 16 && parentAttr.getVersion() == 1 ||
-        kWidth != 8 && parentAttr.getVersion() == 2)
+        kWidth != 8 && kWidth != 16 && parentAttr.getVersion() == 2)
       return emitError() << "ttg.dot_op kWidth parameter must be 16 for "
-                            "gfx11 and 8 for gfx12";
+                            "gfx11 and 8/16 for gfx12";
     return success();
   }
 
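
The widened verifier condition reads correctly only because && binds tighter than ||. Spelled out with explicit parentheses in a standalone predicate (hypothetical helper name; version 1 = gfx11, version 2 = gfx12, per the error message itself):

#include <cassert>

// Rejects a kWidth that is invalid for the given WMMA version.
bool isInvalidKWidth(unsigned kWidth, unsigned version) {
  return (kWidth != 16 && version == 1) ||
         (kWidth != 8 && kWidth != 16 && version == 2);
}

int main() {
  assert(!isInvalidKWidth(16, 1)); // gfx11: only 16 is valid
  assert(isInvalidKWidth(8, 1));
  assert(!isInvalidKWidth(8, 2));  // gfx12: 8 and 16 are both valid now
  assert(!isInvalidKWidth(16, 2));
  assert(isInvalidKWidth(4, 2));
  return 0;
}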
