From e5d61eef2403c66c58a5e0bd6c8d24a7994020a2 Mon Sep 17 00:00:00 2001
From: Alex Baden
Date: Wed, 28 May 2025 15:05:22 +0000
Subject: [PATCH 1/4] Add Tensor Layout verifier for DPAS layout

---
 .../tritonintelgpu-invalid.mlir               | 49 +++++++++++++++
 .../lib/Dialect/TritonIntelGPU/IR/Dialect.cpp | 62 +++++++++++++++++++
 2 files changed, 111 insertions(+)

diff --git a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
index 75d391460b..b4ef568991 100644
--- a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
+++ b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
@@ -158,3 +158,52 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
     tt.return %res : tensor<8x16xf16>
   }
 }
+
+
+// -----
+
+#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
+#dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  // CHECK-LABEL: matmul_tf32dot
+  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
+                          %a:!ttg.memdesc<32x16xf32, #shared, #smem>, %b:!ttg.memdesc<16x32xf32, #shared, #smem>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas>
+    %a_mat = ttg.local_load %a : !ttg.memdesc<32x16xf32, #shared, #smem> -> tensor<32x16xf32, #dot_operand_a>
+    %b_mat = ttg.local_load %b : !ttg.memdesc<16x32xf32, #shared, #smem> -> tensor<16x32xf32, #dot_operand_b>
+
+    // expected-error @+1 {{Layout has opsPerChannel = 2 but tensor element type is 'f32'. Expected 16 bit type.}}
+    %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = tf32 : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #dpas>
+    %38 = ttg.convert_layout %28 : tensor<32x32xf32, #dpas> -> tensor<32x32xf32, #blocked>
+
+    tt.return
+  }
+}
+
+// -----
+
+#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
+// expected-error @below {{ttg.dot_op kWidth parameter must match the parent's opsPerChannel}}
+#dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  // CHECK-LABEL: matmul_tf32dot
+  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
+                          %a:!ttg.memdesc<32x16xf32, #shared, #smem>, %b:!ttg.memdesc<16x32xf32, #shared, #smem>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas>
+    %a_mat = ttg.local_load %a : !ttg.memdesc<32x16xf32, #shared, #smem> -> tensor<32x16xf32, #dot_operand_a>
+    %b_mat = ttg.local_load %b : !ttg.memdesc<16x32xf32, #shared, #smem> -> tensor<16x32xf32, #dot_operand_b>
+
+    %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = tf32 : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #dpas>
+    %38 = ttg.convert_layout %28 : tensor<32x32xf32, #dpas> -> tensor<32x32xf32, #blocked>
+
+    tt.return
+  }
+}
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
index 138fccf6c0..cde9a6c46c 100644
--- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
+++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -1203,6 +1203,67 @@ struct TritonIntelGPUInferLayoutInterface
   }
 };
 
+struct TritonIntelGPUVerifyTensorLayoutInterface
+    : public triton::DialectVerifyTensorLayoutInterface {
+  using DialectVerifyTensorLayoutInterface::DialectVerifyTensorLayoutInterface;
+
+  LogicalResult verifyTensorLayout(
+      Attribute layout, RankedTensorType rankedTy, Operation *op,
+      function_ref<InFlightDiagnostic()> makeErr) const override {
+
+    // Verify that the DPAS layout opsPerChannel param matches the A and B
+    // operand types. Because the DotOperand layout is not part of the Triton
+    // Intel GPU dialect, we need to first check for a TT.Dot operation. Then,
+    // we can compare the type of each operand to the Dot operation with the
+    // DPAS layout attached to the Dot operation.
+    if (auto dpasEncoding = dyn_cast<DpasEncodingAttr>(layout)) {
+
+      auto validateDotDpasLayout = [&](Type elemTy) -> LogicalResult {
+        if (auto ptrTy = dyn_cast<PointerType>(elemTy)) {
+          elemTy = ptrTy.getPointeeType();
+        }
+        const auto elemTyBitWidth = elemTy.getIntOrFloatBitWidth();
+
+        // We know opsPerChannel is either 1, 4, or 8 because of the DPAS
+        // verifier when the DPAS attribute is created. Here we verify that
+        // opsPerChannel matches the tensor type.
+        if (dpasEncoding.getOpsPerChannel() == 4 && elemTyBitWidth != 8) {
+          return makeErr() << layout << ".\nLayout has opsPerChannel = "
+                           << dpasEncoding.getOpsPerChannel()
+                           << " but tensor element type is " << elemTy
+                           << ". Expected 8 bit type.";
+        } else if (dpasEncoding.getOpsPerChannel() == 2 &&
+                   elemTyBitWidth != 16) {
+          return makeErr() << layout << ".\nLayout has opsPerChannel = "
+                           << dpasEncoding.getOpsPerChannel()
+                           << " but tensor element type is " << elemTy
+                           << ". Expected 16 bit type.";
+        } else if (dpasEncoding.getOpsPerChannel() == 1 &&
+                   elemTyBitWidth != 32) {
+          return makeErr() << layout << ".\nLayout has opsPerChannel = "
+                           << dpasEncoding.getOpsPerChannel()
+                           << " but tensor element type is " << elemTy
+                           << ". Expected 32 bit type.";
+        }
+        return success();
+      };
+
+      if (isa<DotOp>(op)) {
+        auto dotOp = cast<DotOp>(op);
+        auto aElemTy = dotOp.getA().getType().getElementType();
+        auto result = validateDotDpasLayout(aElemTy);
+        if (result.failed())
+          return result;
+
+        auto bElemTy = dotOp.getB().getType().getElementType();
+        return validateDotDpasLayout(bElemTy);
+      }
+    }
+
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 
 void TritonIntelGPUDialect::initialize() {
@@ -1212,6 +1273,7 @@ void TritonIntelGPUDialect::initialize() {
   >();
 
   addInterfaces<TritonIntelGPUInferLayoutInterface>();
+  addInterfaces<TritonIntelGPUVerifyTensorLayoutInterface>();
 
   addOperations<
 #define GET_OP_LIST

From cf664ef8d2fcf2d83bcf672eadc4dd12e54d29da Mon Sep 17 00:00:00 2001
From: Alex Baden
Date: Sun, 22 Jun 2025 18:25:26 +0000
Subject: [PATCH 2/4] address review comments

---
 .../tritonintelgpu-invalid.mlir               | 19 +++--------
 .../lib/Dialect/TritonIntelGPU/IR/Dialect.cpp | 32 +++++++------------
 2 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
index b4ef568991..f32f67f968 100644
--- a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
+++ b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
@@ -163,22 +163,17 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
 // -----
 
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
-#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: matmul_tf32dot
-  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-                          %a:!ttg.memdesc<32x16xf32, #shared, #smem>, %b:!ttg.memdesc<16x32xf32, #shared, #smem>) {
+  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32>,
+                          %a_mat:tensor<32x16xf32, #dot_operand_a>, %b_mat:tensor<16x32xf32, #dot_operand_b>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas>
-    %a_mat = ttg.local_load %a : !ttg.memdesc<32x16xf32, #shared, #smem> -> tensor<32x16xf32, #dot_operand_a>
-    %b_mat = ttg.local_load %b : !ttg.memdesc<16x32xf32, #shared, #smem> -> tensor<16x32xf32, #dot_operand_b>
 
     // expected-error @+1 {{Layout has opsPerChannel = 2 but tensor element type is 'f32'. Expected 16 bit type.}}
     %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = tf32 : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #dpas>
-    %38 = ttg.convert_layout %28 : tensor<32x32xf32, #dpas> -> tensor<32x32xf32, #blocked>
 
     tt.return
   }
@@ -187,22 +182,16 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // -----
 
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
-#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 // expected-error @below {{ttg.dot_op kWidth parameter must match the parent's opsPerChannel}}
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: matmul_tf32dot
-  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-                          %a:!ttg.memdesc<32x16xf32, #shared, #smem>, %b:!ttg.memdesc<16x32xf32, #shared, #smem>) {
+  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32>,
+                          %a_mat:tensor<32x16xf32, #dot_operand_a>, %b_mat:tensor<16x32xf32, #dot_operand_b>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas>
-    %a_mat = ttg.local_load %a : !ttg.memdesc<32x16xf32, #shared, #smem> -> tensor<32x16xf32, #dot_operand_a>
-    %b_mat = ttg.local_load %b : !ttg.memdesc<16x32xf32, #shared, #smem> -> tensor<16x32xf32, #dot_operand_b>
-
     %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = tf32 : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #dpas>
-    %38 = ttg.convert_layout %28 : tensor<32x32xf32, #dpas> -> tensor<32x32xf32, #blocked>
 
     tt.return
   }
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
index cde9a6c46c..5a030b2454 100644
--- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
+++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -1222,40 +1222,30 @@ struct TritonIntelGPUVerifyTensorLayoutInterface
         if (auto ptrTy = dyn_cast<PointerType>(elemTy)) {
           elemTy = ptrTy.getPointeeType();
         }
-        const auto elemTyBitWidth = elemTy.getIntOrFloatBitWidth();
+        const unsigned elemTyBitWidth = elemTy.getIntOrFloatBitWidth();
 
         // We know opsPerChannel is either 1, 4, or 8 because of the DPAS
         // verifier when the DPAS attribute is created. Here we verify that
         // opsPerChannel matches the tensor type.
-        if (dpasEncoding.getOpsPerChannel() == 4 && elemTyBitWidth != 8) {
+        if (dpasEncoding.getOpsPerChannel() * elemTyBitWidth != 32) {
           return makeErr() << layout << ".\nLayout has opsPerChannel = "
                            << dpasEncoding.getOpsPerChannel()
                            << " but tensor element type is " << elemTy
-                           << ". Expected 8 bit type.";
-        } else if (dpasEncoding.getOpsPerChannel() == 2 &&
-                   elemTyBitWidth != 16) {
-          return makeErr() << layout << ".\nLayout has opsPerChannel = "
-                           << dpasEncoding.getOpsPerChannel()
-                           << " but tensor element type is " << elemTy
-                           << ". Expected 16 bit type.";
Expected 16 bit type."; - } else if (dpasEncoding.getOpsPerChannel() == 1 && - elemTyBitWidth != 32) { - return makeErr() << layout << ".\nLayout has opsPerChannel = " - << dpasEncoding.getOpsPerChannel() - << " but tensor element type is " << elemTy - << ". Expected 32 bit type."; + << ". Expected " + << 32 / dpasEncoding.getOpsPerChannel() + << " bit type."; } return success(); }; - if (isa(op)) { - auto dotOp = cast(op); + if (auto dotOp = dyn_cast(op)) { auto aElemTy = dotOp.getA().getType().getElementType(); - auto result = validateDotDpasLayout(aElemTy); - if (result.failed()) - return result; - auto bElemTy = dotOp.getB().getType().getElementType(); + + auto aResult = validateDotDpasLayout(aElemTy); + if (aResult.failed()) + return aResult; + return validateDotDpasLayout(bElemTy); } } From c38da38db6ef3d93cd0309864ddf6b8c1e285944 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Sun, 22 Jun 2025 18:37:52 +0000 Subject: [PATCH 3/4] add B operand mismatch test --- .../tritonintelgpu-invalid.mlir | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir index f32f67f968..c28a4baf18 100644 --- a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir +++ b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir @@ -162,6 +162,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr // ----- +// COM: A operand mismatch #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}> #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}> #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}> @@ -181,6 +182,26 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { // ----- +// COM: B operand mismatch +#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}> +#dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}> +#dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}> +#smem = #ttg.shared_memory +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // CHECK-LABEL: matmul_tf32dot + tt.func @matmul_tf32dot(%ptr:!tt.ptr, + %a_mat:tensor<32x16xf16, #dot_operand_a>, %b_mat:tensor<16x32xf32, #dot_operand_b>) { + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas> + + // expected-error @+1 {{Layout has opsPerChannel = 2 but tensor element type is 'f32'. 
+    %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = tf32 : tensor<32x16xf16, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #dpas>
+
+    tt.return
+  }
+}
+
+// -----
+
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 // expected-error @below {{ttg.dot_op kWidth parameter must match the parent's opsPerChannel}}

From d947c54ee25df6a46fcf20905aa7ace22afbf2a4 Mon Sep 17 00:00:00 2001
From: Alex Baden
Date: Mon, 23 Jun 2025 18:39:32 +0000
Subject: [PATCH 4/4] review comments

---
 test/TritonIntelGPU/tritonintelgpu-invalid.mlir              | 3 ---
 third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
index c28a4baf18..f95ebd9384 100644
--- a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
+++ b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
@@ -166,7 +166,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
-#smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: matmul_tf32dot
   tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32>,
@@ -186,7 +185,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
-#smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: matmul_tf32dot
   tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32>,
@@ -206,7 +204,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 // expected-error @below {{ttg.dot_op kWidth parameter must match the parent's opsPerChannel}}
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
-#smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: matmul_tf32dot
   tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32>,
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
index 5a030b2454..a5d120ccc6 100644
--- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
+++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -1213,7 +1213,7 @@ struct TritonIntelGPUVerifyTensorLayoutInterface
 
     // Verify that the DPAS layout opsPerChannel param matches the A and B
    // operand types. Because the DotOperand layout is not part of the Triton
-    // Intel GPU dialect, we need to first check for a TT.Dot operation. Then,
+    // Intel GPU dialect, we need to first check for a tt.dot operation. Then,
     // we can compare the type of each operand to the Dot operation with the
     // DPAS layout attached to the Dot operation.
     if (auto dpasEncoding = dyn_cast<DpasEncodingAttr>(layout)) {