@@ -50,7 +50,10 @@ def matmul_kernel( #
     b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
     accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=output_ptr.dtype.element_ty)
     for k in tl.range(0, tl.cdiv(K, BLOCK_K), num_stages=NUM_STAGES):
-        mask_a = (offs_am[:, None] < M) & (offs_k[None, :] + k * BLOCK_K < K)
+        if not A_TRANS:
+            mask_a = (offs_am[:, None] < M) & (offs_k[None, :] + k * BLOCK_K < K)
+        else:
+            mask_a = (offs_k[:, None] + k * BLOCK_K < K) & (offs_am[None, :] < M)
         a = tl.load(a_ptrs, mask=mask_a, other=0.0)
         if SCALE_A is not None:
             a = a * SCALE_A
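The new branch makes the A-operand bounds mask layout-aware: for a (BLOCK_M, BLOCK_K) tile the M offsets bound the rows and the K offsets bound the columns, and when A is stored transposed the two comparisons swap axes. Below is a minimal NumPy sketch of the same broadcasting, with made-up sizes and simplified offsets (the real kernel derives offs_am from the program id):

import numpy as np

M, K, BLOCK_M, BLOCK_K, k = 20, 60, 32, 16, 3   # hypothetical sizes
offs_am = np.arange(BLOCK_M)                    # simplified row offsets into M
offs_k = np.arange(BLOCK_K)                     # offsets into the current K tile

# A stored as (M, K): rows bounded by M, columns by K
mask_a = (offs_am[:, None] < M) & (offs_k[None, :] + k * BLOCK_K < K)

# A stored transposed as (K, M): rows bounded by K, columns by M
mask_a_t = (offs_k[:, None] + k * BLOCK_K < K) & (offs_am[None, :] < M)

assert mask_a.shape == (BLOCK_M, BLOCK_K)
assert mask_a_t.shape == (BLOCK_K, BLOCK_M)
assert np.array_equal(mask_a_t, mask_a.T)       # same mask, transposed layout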
@@ -551,11 +554,6 @@ def test_lhs_in_tmem(BLOCK_M, BLOCK_N, BLOCK_K, a_trans, dtype_src_str, device,
     N = 512
     K = 256
     _knob_promote_lhs_to_tmem(monkeypatch)
-    if is_xpu() and (M != BLOCK_M or N != BLOCK_N or K != BLOCK_K):
-        # TODO: Make LHS TMEM promotion work for all problem sizes regardless of block dims
-        pytest.skip(
-            "LHS TMEM promotion produces incorrect results when the workload dimensions are not equal to the block dims"
-        )
     torch.manual_seed(42)
     if dtype_src_str == "float8e5":
         a = torch.randint(20, 40, (M, K), dtype=torch.int8, device=device).view(torch.float8_e5m2)
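For context, the float8e5 line above builds its input by drawing int8 bit patterns and reinterpreting them as float8_e5m2, since torch's random generators do not emit float8 tensors directly. A minimal sketch of that bit-reinterpretation trick, with assumed shapes and requiring a PyTorch build that has the float8 dtypes:

import torch

M, K = 64, 32                                              # assumed shapes
a_bits = torch.randint(20, 40, (M, K), dtype=torch.int8)   # narrow range of bit patterns
a = a_bits.view(torch.float8_e5m2)                         # same storage, reinterpreted as float8
a_ref = a.to(torch.float32)                                # upcast for a reference computation
print(a.dtype, a_ref.dtype)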
@@ -581,6 +579,8 @@ def test_lhs_in_tmem(BLOCK_M, BLOCK_N, BLOCK_K, a_trans, dtype_src_str, device,
     atol = 0.03
     rtol = 0.03
     torch.testing.assert_close(ref_out, output, atol=atol, rtol=rtol)
+    if not is_cuda():
+        return
     pattern = r"%\w+\s*=\s*ttng\.tmem_alloc[\s\S]*?tng\.tc_gen5_mma\s+%\w+,"
     ttgir = k.asm["ttgir"]
     assert re.search(pattern, ttgir)
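The added early return skips the TTGIR check on non-CUDA backends, where the ttng ops it looks for are not produced. The pattern matches a ttng.tmem_alloc followed, lazily, by a tc_gen5_mma whose first operand is an SSA value, which is how the test detects that the LHS tile was placed in TMEM. A quick illustration of what the regex accepts, run against a hand-written, simplified IR fragment (not real compiler output):

import re

pattern = r"%\w+\s*=\s*ttng\.tmem_alloc[\s\S]*?tng\.tc_gen5_mma\s+%\w+,"
fake_ttgir = """
  %0 = ttng.tmem_alloc %a : (...) -> (...)
  ttng.tc_gen5_mma %0, %1, %acc
"""
assert re.search(pattern, fake_ttgir)  # tmem_alloc precedes an MMA taking an SSA value first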