add xmx colmajor

DDEle · DDEle · commit 8407143953f5 · 2024-08-28T07:48:55.000Z
fix
diff --git a/include/experimental/group/gemm/impl/int4_dequantize_xe.hpp b/include/experimental/group/gemm/impl/int4_dequantize_xe.hpp
@@ -156,7 +156,12 @@ class gemm_t<
       : is_vnni_tiled_a ? reg_layout::vnni_tiled
                         : reg_layout::tiled;
 
+  // reg_layout of the load result
   static constexpr reg_layout reg_layout_b =
+      is_col_major_b ? reg_layout::transpose_tiled : reg_layout::tiled;
+
+  // reg_layout required by mma
+  static constexpr reg_layout reg_layout_b_acc =
       // fpu
       compute_policy::mma_engine == mma_engine::fpu
       ? (is_gemv ? reg_layout::transpose_tiled : reg_layout::tiled)
@@ -214,7 +219,7 @@ class gemm_t<
       tile_size_y_b,
       block_size_x_b,
       block_size_y_b,
-      reg_layout_b>;
+      reg_layout_b_acc>;
   using matB_acc_t = subgroup::tile_t<dtype_mma_b, matB_acc_tile_desc_t>;
 
  public:
@@ -629,9 +634,10 @@ class gemm_t<
             matA_acc,
             i == args.inner_loop_count - 1);
       } else {
-        if constexpr (is_col_major_b) {
-          tile_transpose(matB_acc);
-        }
+        // Dequantize result should be (plain) tiled; convert if mma needs VNNI
+        if constexpr (
+            matB_acc_tile_desc_t::register_layout == reg_layout::vnni_tiled)
+          subgroup::vnni_convert(matB_acc);
         tile_mma::mma(matC, matC, matB_acc, matA_acc);
       }
       if constexpr (enable_periodic_sync) {
@@ -696,9 +702,10 @@ class gemm_t<
         tile_mma::mma(
             matAcc, matAcc, matC, matB_acc, matA_acc, i == compute_stages - 1);
       } else {
-        if constexpr (is_col_major_b) {
-          tile_transpose(matB_acc);
-        }
+        // Dequantize result should be (plain) tiled; convert if mma needs VNNI
+        if constexpr (
+            matB_acc_tile_desc_t::register_layout == reg_layout::vnni_tiled)
+          subgroup::vnni_convert(matB_acc);
         tile_mma::mma(matC, matC, matB_acc, matA_acc);
       }
       if constexpr (enable_periodic_sync) {
diff --git a/include/subgroup/tile/impl/op_function.hpp b/include/subgroup/tile/impl/op_function.hpp
@@ -704,36 +704,37 @@ layout_convert(T_dst& dst, T_src& src) {
   }
 }
 
-template <typename T>
-void dump_mat(
-    T mat,
-    size_t tile_x = T::reg_transpose ? T::tile_size_y : T::tile_size_x,
-    size_t tile_y = T::reg_transpose ? T::tile_size_x : T::tile_size_y) {
-#pragma unroll
-  for (size_t row = 0; row < tile_y; row++) {
-#pragma unroll
-    for (size_t col = 0; col < tile_x; col++) {
-      sycl::ext::oneapi::experimental::printf(
-          "%x(%d) ",
-          int(native_type_t<typename T::dtype>(mat.reg[row * tile_x + col])),
-          int(native_type_t<typename T::dtype>(mat.reg[row * tile_x + col])));
-    }
-    sycl::ext::oneapi::experimental::printf("\n");
-  }
-  sycl::ext::oneapi::experimental::printf("\n ");
-}
 template <typename T>
 void dump_mat_reg(T mat, size_t tile_x, size_t tile_y) {
 #pragma unroll
   for (size_t row = 0; row < tile_y; row++) {
 #pragma unroll
     for (size_t col = 0; col < tile_x; col++) {
-      sycl::ext::oneapi::experimental::printf(
-          "%d ", (int)(sycl::half)mat[row * tile_x + col]);
+      const auto&& v = int64_t(
+          native_type_t<typename T::element_type>(mat[row * tile_x + col]));
+      constexpr bool is_int32 =
+          (std::is_same<typename T::element_type, int4x2>::value ||
+           std::is_same<typename T::element_type, int4x8>::value ||
+           std::is_same<typename T::element_type, uint32_t>::value ||
+           std::is_same<typename T::element_type, int32_t>::value);
+      constexpr bool is_int64 =
+          (std::is_same<typename T::element_type, uint64_t>::value ||
+           std::is_same<typename T::element_type, int64_t>::value);
+      is_int32 ? sycl::ext::oneapi::experimental::printf(
+                    "%08x(%10u)  ", int(v), int(v))
+          : is_int64
+          ? sycl::ext::oneapi::experimental::printf("%016llx(%20llu)  ", v, v)
+          : sycl::ext::oneapi::experimental::printf("%3lld ", v);
     }
     sycl::ext::oneapi::experimental::printf("\n");
   }
   sycl::ext::oneapi::experimental::printf("\n");
 }
-
+template <typename T>
+void dump_mat(
+    T mat,
+    size_t tile_x = T::reg_transpose ? T::tile_size_y : T::tile_size_x,
+    size_t tile_y = T::reg_transpose ? T::tile_size_x : T::tile_size_y) {
+  dump_mat_reg(mat.reg, tile_x, tile_y);
+}
 } // namespace gpu::xetla::subgroup
diff --git a/include/subgroup/tile/impl/tile_op_functor.hpp b/include/subgroup/tile/impl/tile_op_functor.hpp
@@ -83,6 +83,12 @@ struct dequant_int4_weight_t {
     constexpr uint32_t block_size_y_b = matB_acc_t::block_size_y;
     static constexpr uint32_t pack_ratio = sizeof(typename matB_t::dtype) * 2;
 
+    // Whether the dequant result must be transposed before storing to matB_acc
+    constexpr bool trans_acc =
+        matB_t::register_layout == reg_layout::transpose_tiled &&
+        (matB_acc_t::register_layout == reg_layout::tiled ||
+         matB_acc_t::register_layout == reg_layout::vnni_tiled);
+
     constexpr uint32_t num_block_x = tile_size_x_b / block_size_x_b;
     constexpr uint32_t num_block_y = tile_size_y_b / block_size_y_b;
 #pragma unroll
@@ -149,9 +155,18 @@ struct dequant_int4_weight_t {
                   cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) -
                   int8_t(8);
             }
-            dst_blk.xetla_select<step, 1>(jj * block_size_y_b + ii) =
-                cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) *
-                scale.reg[scale_idx];
+            // Scale and write back to matB_acc
+            if constexpr (trans_acc) {
+              dst_blk.xetla_select<step, block_size_x_b>(
+                  ii * block_size_x_b + jj) =
+                  cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) *
+                  scale.reg[scale_idx];
+
+            } else {
+              dst_blk.xetla_select<step, 1>(jj * block_size_y_b + ii) =
+                  cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) *
+                  scale.reg[scale_idx];
+            }
 
             // sycl::ext::oneapi::experimental::printf(
             //     "scale[%d] %f \n",
diff --git a/tests/integration/gemv/int4/main.cpp b/tests/integration/gemv/int4/main.cpp
@@ -31,14 +31,14 @@ template <typename scalar_t>
 class test_col_major_1 {
  public:
   // Extract the parameters required by different test cases
-  static constexpr size_t mat_m = 1;
+  static constexpr size_t mat_m = 4096;
   static constexpr size_t mat_n = 4096;
   static constexpr size_t mat_k = 4096;
-  static constexpr size_t wg_m = 1;
-  static constexpr size_t wg_n = 1;
-  static constexpr size_t sg_m = 1;
-  static constexpr size_t sg_n = 1;
-  static constexpr size_t sg_k = 512 / sg_m;
+  static constexpr size_t wg_m = 64;
+  static constexpr size_t wg_n = 32;
+  static constexpr size_t sg_m = 16;
+  static constexpr size_t sg_n = 8;
+  static constexpr size_t sg_k = 32;
   static constexpr size_t dequant_s = 128;
   static constexpr quant_mode quant_mode = quant_mode::I4_SYM;
 
@@ -109,14 +109,17 @@ int gemm_result_validate(
   bool result = buff_cmp::xetla_buff_cmp(data, other, "gemv validation");
 
 #ifdef UT_DEBUG
-  // for (uint32_t i = 0; i < m; i++) {
-  //   for (uint32_t j = 0; j < n; j++) {
-  //     std::cout << float(sycl::half(C[i * n + j])) << " ";
-  //   }
-  //   std::cout << std::endl;
-  // }
+  if (m * n <= 4096) {
+    std::cout << "result:\n";
+    for (uint32_t i = 0; i < m; i++) {
+      for (uint32_t j = 0; j < n; j++) {
+        std::cout << float(sycl::half(C[i * n + j])) << " ";
+      }
+      std::cout << "\n";
+    }
+  }
 #endif
-  std::cout << (!result ? "FAILED\n" : "PASSED\n");
+  std::cout << (!result ? "FAILED" : "PASSED") << std::endl;
   return result ? 0 : 1;
 }
 
@@ -186,12 +189,15 @@ std::vector<data_type_acc_in> dequantize_weight(
     }
   }
 #ifdef UT_DEBUG
-  // for (uint32_t i = 0; i < matrix_n; i++) {
-  //   for (uint32_t j = 0; j < matrix_k; j++) {
-  //     std::cout << float(sycl::half(b_out[i * matrix_k + j])) << " ";
-  //   }
-  //   std::cout << std::endl;
-  // }
+  if (matrix_n * matrix_k <= 4096) {
+    std::cout << "dequantize_weight:\n";
+    for (uint32_t i = 0; i < matrix_n; i++) {
+      for (uint32_t j = 0; j < matrix_k; j++) {
+        std::cout << float(sycl::half(b_out[i * matrix_k + j])) << " ";
+      }
+      std::cout << std::endl;
+    }
+  }
 #endif
   return b_out;
 }
@@ -386,12 +392,14 @@ void dequantize_gemv_run(int iter) {
     if constexpr (std::is_same_v<int4x2, data_type_b>) {
       B_h[i] = random_uint8();
 #ifdef UT_DEBUG
-      B_h[i] = 0x77;
+      B_h[i] = ((7 + i) % 15 + 1) * 0x11;
+      if (i >= size_b)
+        B_h[i] = -1;
 #endif
     } else if constexpr (std::is_same_v<int4x8, data_type_b>) {
       B_h[i] = random_uint32();
 #ifdef UT_DEBUG
-      B_h[i] = 0x77777777;
+      B_h[i] = ((7 + i) % 15 + 1) * 0x11111111;
 #endif
     }
   }