From e5d61eef2403c66c58a5e0bd6c8d24a7994020a2 Mon Sep 17 00:00:00 2001
From: Alex Baden
Date: Wed, 28 May 2025 15:05:22 +0000
Subject: [PATCH 1/4] Add Tensor Layout verifier for DPAS layout

---
 .../tritonintelgpu-invalid.mlir               | 49 +++++++++++++++
 .../lib/Dialect/TritonIntelGPU/IR/Dialect.cpp | 62 +++++++++++++++++++
 2 files changed, 111 insertions(+)

diff --git a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
index 75d391460b..b4ef568991 100644
--- a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
+++ b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
@@ -158,3 +158,52 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
     tt.return %res : tensor<8x16xf16>
   }
 }
+
+
+// -----
+
+#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
+#dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  // CHECK-LABEL: matmul_tf32dot
+  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
+                          %a:!ttg.memdesc<32x16xf32, #shared, #smem>, %b:!ttg.memdesc<16x32xf32, #shared, #smem>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas>
+    %a_mat = ttg.local_load %a : !ttg.memdesc<32x16xf32, #shared, #smem> -> tensor<32x16xf32, #dot_operand_a>
+    %b_mat = ttg.local_load %b : !ttg.memdesc<16x32xf32, #shared, #smem> -> tensor<16x32xf32, #dot_operand_b>
+
+    // expected-error @+1 {{Layout has opsPerChannel = 2 but tensor element type is 'f32'. Expected 16 bit type.}}
+    %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = tf32 : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #dpas>
+    %38 = ttg.convert_layout %28 : tensor<32x32xf32, #dpas> -> tensor<32x32xf32, #blocked>
+
+    tt.return
+  }
+}
+
+// -----
+
+#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
+// expected-error @below {{ttg.dot_op kWidth parameter must match the parent's opsPerChannel}}
+#dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  // CHECK-LABEL: matmul_tf32dot
+  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
+                          %a:!ttg.memdesc<32x16xf32, #shared, #smem>, %b:!ttg.memdesc<16x32xf32, #shared, #smem>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas>
+    %a_mat = ttg.local_load %a : !ttg.memdesc<32x16xf32, #shared, #smem> -> tensor<32x16xf32, #dot_operand_a>
+    %b_mat = ttg.local_load %b : !ttg.memdesc<16x32xf32, #shared, #smem> -> tensor<16x32xf32, #dot_operand_b>
+
+    %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = tf32 : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #dpas>
+    %38 = ttg.convert_layout %28 : tensor<32x32xf32, #dpas> -> tensor<32x32xf32, #blocked>
+
+    tt.return
+  }
+}
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
index 138fccf6c0..cde9a6c46c 100644
--- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
+++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -1203,6 +1203,67 @@ struct TritonIntelGPUInferLayoutInterface
   }
 };
 
+struct TritonIntelGPUVerifyTensorLayoutInterface
+    : public triton::DialectVerifyTensorLayoutInterface {
+  using DialectVerifyTensorLayoutInterface::DialectVerifyTensorLayoutInterface;
+
+  LogicalResult verifyTensorLayout(
+      Attribute layout, RankedTensorType rankedTy, Operation *op,
+      function_ref<InFlightDiagnostic()> makeErr) const override {
+
+    // Verify that the DPAS layout opsPerChannel param matches the A and B
+    // operand types. Because the DotOperand layout is not part of the Triton
+    // Intel GPU dialect, we need to first check for a TT.Dot operation. Then,
+    // we can compare the type of each operand to the Dot operation with the
+    // DPAS layout attached to the Dot operation.
+    if (auto dpasEncoding = dyn_cast<DpasEncodingAttr>(layout)) {
+
+      auto validateDotDpasLayout = [&](Type elemTy) -> LogicalResult {
+        if (auto ptrTy = dyn_cast<PointerType>(elemTy)) {
+          elemTy = ptrTy.getPointeeType();
+        }
+        const auto elemTyBitWidth = elemTy.getIntOrFloatBitWidth();
+
+        // We know opsPerChannel is either 1, 4, or 8 because of the DPAS
+        // verifier when the DPAS attribute is created. Here we verify that
+        // opsPerChannel matches the tensor type.
+        if (dpasEncoding.getOpsPerChannel() == 4 && elemTyBitWidth != 8) {
+          return makeErr() << layout << ".\nLayout has opsPerChannel = "
+                           << dpasEncoding.getOpsPerChannel()
+                           << " but tensor element type is " << elemTy
+                           << ". Expected 8 bit type.";
+        } else if (dpasEncoding.getOpsPerChannel() == 2 &&
+                   elemTyBitWidth != 16) {
+          return makeErr() << layout << ".\nLayout has opsPerChannel = "
+                           << dpasEncoding.getOpsPerChannel()
+                           << " but tensor element type is " << elemTy
+                           << ". Expected 16 bit type.";
+        } else if (dpasEncoding.getOpsPerChannel() == 1 &&
+                   elemTyBitWidth != 32) {
+          return makeErr() << layout << ".\nLayout has opsPerChannel = "
+                           << dpasEncoding.getOpsPerChannel()
+                           << " but tensor element type is " << elemTy
+                           << ". Expected 32 bit type.";
+        }
+        return success();
+      };
+
+      if (isa<DotOp>(op)) {
+        auto dotOp = cast<DotOp>(op);
+        auto aElemTy = dotOp.getA().getType().getElementType();
+        auto result = validateDotDpasLayout(aElemTy);
+        if (result.failed())
+          return result;
+
+        auto bElemTy = dotOp.getB().getType().getElementType();
+        return validateDotDpasLayout(bElemTy);
+      }
+    }
+
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 
 void TritonIntelGPUDialect::initialize() {
@@ -1212,6 +1273,7 @@ void TritonIntelGPUDialect::initialize() {
   >();
 
   addInterfaces<TritonIntelGPUInferLayoutInterface>();
+  addInterfaces<TritonIntelGPUVerifyTensorLayoutInterface>();
 
   addOperations<
 #define GET_OP_LIST

From cf664ef8d2fcf2d83bcf672eadc4dd12e54d29da Mon Sep 17 00:00:00 2001
From: Alex Baden
Date: Sun, 22 Jun 2025 18:25:26 +0000
Subject: [PATCH 2/4] address review comments

---
 .../tritonintelgpu-invalid.mlir               | 19 +++--------
 .../lib/Dialect/TritonIntelGPU/IR/Dialect.cpp | 32 +++++++------------
 2 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
index b4ef568991..f32f67f968 100644
--- a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
+++ b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
@@ -163,22 +163,17 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
 // -----
 
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
-#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: matmul_tf32dot
-  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-                          %a:!ttg.memdesc<32x16xf32, #shared, #smem>, %b:!ttg.memdesc<16x32xf32, #shared, #smem>) {
+  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32>,
+                          %a_mat:tensor<32x16xf32, #dot_operand_a>, %b_mat:tensor<16x32xf32, #dot_operand_b>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas>
-    %a_mat = ttg.local_load %a : !ttg.memdesc<32x16xf32, #shared, #smem> -> tensor<32x16xf32, #dot_operand_a>
-    %b_mat = ttg.local_load %b : !ttg.memdesc<16x32xf32, #shared, #smem> -> tensor<16x32xf32, #dot_operand_b>
 
     // expected-error @+1 {{Layout has opsPerChannel = 2 but tensor element type is 'f32'. Expected 16 bit type.}}
     %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = tf32 : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #dpas>
-    %38 = ttg.convert_layout %28 : tensor<32x32xf32, #dpas> -> tensor<32x32xf32, #blocked>
 
     tt.return
   }
@@ -187,22 +182,16 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 // -----
 
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
-#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 // expected-error @below {{ttg.dot_op kWidth parameter must match the parent's opsPerChannel}}
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: matmul_tf32dot
-  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32> {tt.divisibility = 16 : i32},
-                          %a:!ttg.memdesc<32x16xf32, #shared, #smem>, %b:!ttg.memdesc<16x32xf32, #shared, #smem>) {
+  tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32>,
+                          %a_mat:tensor<32x16xf32, #dot_operand_a>, %b_mat:tensor<16x32xf32, #dot_operand_b>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas>
-    %a_mat = ttg.local_load %a : !ttg.memdesc<32x16xf32, #shared, #smem> -> tensor<32x16xf32, #dot_operand_a>
-    %b_mat = ttg.local_load %b : !ttg.memdesc<16x32xf32, #shared, #smem> -> tensor<16x32xf32, #dot_operand_b>
-
     %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = tf32 : tensor<32x16xf32, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #dpas>
-    %38 = ttg.convert_layout %28 : tensor<32x32xf32, #dpas> -> tensor<32x32xf32, #blocked>
 
     tt.return
   }
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
index cde9a6c46c..5a030b2454 100644
--- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
+++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -1222,40 +1222,30 @@ struct TritonIntelGPUVerifyTensorLayoutInterface
         if (auto ptrTy = dyn_cast<PointerType>(elemTy)) {
           elemTy = ptrTy.getPointeeType();
         }
-        const auto elemTyBitWidth = elemTy.getIntOrFloatBitWidth();
+        const unsigned elemTyBitWidth = elemTy.getIntOrFloatBitWidth();
 
         // We know opsPerChannel is either 1, 4, or 8 because of the DPAS
         // verifier when the DPAS attribute is created. Here we verify that
         // opsPerChannel matches the tensor type.
-        if (dpasEncoding.getOpsPerChannel() == 4 && elemTyBitWidth != 8) {
+        if (dpasEncoding.getOpsPerChannel() * elemTyBitWidth != 32) {
           return makeErr() << layout << ".\nLayout has opsPerChannel = "
                            << dpasEncoding.getOpsPerChannel()
                            << " but tensor element type is " << elemTy
-                           << ". Expected 8 bit type.";
-        } else if (dpasEncoding.getOpsPerChannel() == 2 &&
-                   elemTyBitWidth != 16) {
-          return makeErr() << layout << ".\nLayout has opsPerChannel = "
-                           << dpasEncoding.getOpsPerChannel()
-                           << " but tensor element type is " << elemTy
-                           << ". Expected 16 bit type.";
Expected 16 bit type."; - } else if (dpasEncoding.getOpsPerChannel() == 1 && - elemTyBitWidth != 32) { - return makeErr() << layout << ".\nLayout has opsPerChannel = " - << dpasEncoding.getOpsPerChannel() - << " but tensor element type is " << elemTy - << ". Expected 32 bit type."; + << ". Expected " + << 32 / dpasEncoding.getOpsPerChannel() + << " bit type."; } return success(); }; - if (isa(op)) { - auto dotOp = cast(op); + if (auto dotOp = dyn_cast(op)) { auto aElemTy = dotOp.getA().getType().getElementType(); - auto result = validateDotDpasLayout(aElemTy); - if (result.failed()) - return result; - auto bElemTy = dotOp.getB().getType().getElementType(); + + auto aResult = validateDotDpasLayout(aElemTy); + if (aResult.failed()) + return aResult; + return validateDotDpasLayout(bElemTy); } } From c38da38db6ef3d93cd0309864ddf6b8c1e285944 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Sun, 22 Jun 2025 18:37:52 +0000 Subject: [PATCH 3/4] add B operand mismatch test --- .../tritonintelgpu-invalid.mlir | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir index f32f67f968..c28a4baf18 100644 --- a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir +++ b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir @@ -162,6 +162,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr // ----- +// COM: A operand mismatch #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}> #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}> #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}> @@ -181,6 +182,26 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { // ----- +// COM: B operand mismatch +#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}> +#dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}> +#dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}> +#smem = #ttg.shared_memory +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // CHECK-LABEL: matmul_tf32dot + tt.func @matmul_tf32dot(%ptr:!tt.ptr, + %a_mat:tensor<32x16xf16, #dot_operand_a>, %b_mat:tensor<16x32xf32, #dot_operand_b>) { + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas> + + // expected-error @+1 {{Layout has opsPerChannel = 2 but tensor element type is 'f32'. 
+    %28 = tt.dot %a_mat, %b_mat, %cst, inputPrecision = tf32 : tensor<32x16xf16, #dot_operand_a> * tensor<16x32xf32, #dot_operand_b> -> tensor<32x32xf32, #dpas>
+
+    tt.return
+  }
+}
+
+// -----
+
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 // expected-error @below {{ttg.dot_op kWidth parameter must match the parent's opsPerChannel}}

From d947c54ee25df6a46fcf20905aa7ace22afbf2a4 Mon Sep 17 00:00:00 2001
From: Alex Baden
Date: Mon, 23 Jun 2025 18:39:32 +0000
Subject: [PATCH 4/4] review comments

---
 test/TritonIntelGPU/tritonintelgpu-invalid.mlir              | 3 ---
 third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
index c28a4baf18..f95ebd9384 100644
--- a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
+++ b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
@@ -166,7 +166,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
-#smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: matmul_tf32dot
   tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32>,
@@ -186,7 +185,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
-#smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: matmul_tf32dot
   tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32>,
@@ -206,7 +204,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 // expected-error @below {{ttg.dot_op kWidth parameter must match the parent's opsPerChannel}}
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
-#smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: matmul_tf32dot
   tt.func @matmul_tf32dot(%ptr:!tt.ptr<f32>,
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
index 5a030b2454..a5d120ccc6 100644
--- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
+++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp
@@ -1213,7 +1213,7 @@ struct TritonIntelGPUVerifyTensorLayoutInterface
 
     // Verify that the DPAS layout opsPerChannel param matches the A and B
    // operand types. Because the DotOperand layout is not part of the Triton
-    // Intel GPU dialect, we need to first check for a TT.Dot operation. Then,
+    // Intel GPU dialect, we need to first check for a tt.dot operation. Then,
     // we can compare the type of each operand to the Dot operation with the
     // DPAS layout attached to the Dot operation.
     if (auto dpasEncoding = dyn_cast<DpasEncodingAttr>(layout)) {