@@ -658,212 +658,6 @@ class gemm_t<
   }
 
  private:
-  // inline void dequantize(
-  //     matB_acc_t& matB_acc,
-  //     matB_t& matB,
-  //     scale_t& scale,
-  //     zero_pt_t& zero_pt) {
-  //   // no tail, because this is matB
-  //   constexpr uint32_t num_block_x = tile_size_x_b / block_size_x_b;
-  //   constexpr uint32_t num_block_y = tile_size_y_b / block_size_y_b;
-  // #pragma unroll
-  //   for (uint32_t i = 0; i < num_block_y; ++i) {
-  // #pragma unroll
-  //     for (uint32_t j = 0; j < num_block_x; ++j) {
-  //       int block_id = (i * num_block_x + j);
-  //       // Must be little-endian
-  //       auto matB_blk = matB.reg.xetla_format<uint8_t>()
-  //           .xetla_select<matB_acc_t::block_elems / 2, 1>(
-  //           block_id * matB_acc_t::block_elems / 2);
-
-  //       auto dst_blk = matB_acc.reg.xetla_select<matB_acc_t::block_elems,
-  //       1>(
-  //           block_id * matB_acc_t::block_elems);
-
-  //       // int8 includes 2 4bits data.
-  //       xetla_vector<int8_t, matB_acc_t::block_elems> cvt_blk_i8;
-
-  //       // lowest 4 bit
-  //       {
-  //         cvt_blk_i8.xetla_select<matB_acc_t::block_elems / 2, 2>(0) =
-  //             matB_blk & 0xf;
-  //       }
-  //       // highest 4 bit
-  //       {
-  //         cvt_blk_i8.xetla_select<matB_acc_t::block_elems / 2, 2>(1) =
-  //             matB_blk >> 4;
-  //       }
-
-  //       // (b_i8 - zero_pt_i8) x scale = fp16
-  //       constexpr uint32_t step = std::min(block_size_y_b, dequant_s);
-  // #pragma unroll
-  //       for (uint32_t jj = 0; jj < block_size_x_b; jj++) {
-  // #pragma unroll
-  //         for (uint32_t ii = 0; ii < block_size_y_b; ii += step) {
-  //           uint32_t offset_y_in_tile = i * block_size_y_b + ii;
-  //           uint32_t offset_x_in_tile = j * block_size_x_b + jj;
-
-  //           uint32_t scale_idx =
-  //               (offset_y_in_tile) / dequant_s * scale_t::block_size_x +
-  //               offset_x_in_tile;
-
-  //           if constexpr (compute_policy::quant_mode ==
-  //               quant_mode::S4_ASYM) {
-  //             uint32_t zero_pt_idx =
-  //                 offset_y_in_tile / dequant_s * zero_pt_t::block_size_x +
-  //                 offset_x_in_tile / pack_ratio;
-  //             native_type_t<dtype_b> zero_pt_pack =
-  //                 zero_pt.reg[zero_pt_idx];
-
-  //             int8_t zero_pt_i8 =
-  //                 (zero_pt_pack >>
-  //                 (4 * ((wg_start_n + offset_x_in_tile) % pack_ratio))) &
-  //                 0xf;
-  //             // sycl::ext::oneapi::experimental::printf(
-  //             //     "zero_pt.reg[%d} %x zero_pt_i8 %x
-  //             //     offset_x_in_tile:%d
-  //             //     \n", zero_pt_idx, zero_pt_pack, (int32_t)zero_pt_i8 ,
-  //             //     offset_x_in_tile);
-
-  //             cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) =
-  //                 cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b +
-  //                 ii) - zero_pt_i8;
-  //           } else if constexpr (
-  //               compute_policy::quant_mode ==
-  //               quant_mode::S4_FULLRANGE_NO_ZP) {
-  //             cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii) =
-  //                 cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b +
-  //                 ii) - int8_t(8);
-  //           }
-  //           dst_blk.xetla_select<step, 1>(jj * block_size_y_b + ii) =
-  //               cvt_blk_i8.xetla_select<step, 1>(jj * block_size_y_b + ii)
-  //               * scale.reg[scale_idx];
-
-  //           // sycl::ext::oneapi::experimental::printf(
-  //           //     "scale[%d] %f \n",
-  //           //     scale_idx,
-  //           //     float(sycl::half(scale.reg.xetla_select<1,
-  //           //     1>(scale_idx))));
-  //         }
-  //       }
-  //     }
-  //   }
-  // }
-
-  /*
-  inline void dequantize(
-      matB_acc_t& matB_acc,
-      matB_t& matB,
-      scale_t& scale,
-      zero_pt_t& zero_pt) {
-    // no tail, because this is matB
-    constexpr uint32_t num_block_x = tile_size_x_b / block_size_x_b;
-    constexpr uint32_t num_block_y = tile_size_y_b / block_size_y_b;
-
-    constexpr uint32_t block_b_y_per_scale = dequant_s / block_size_y_b;
-    constexpr uint32_t block_b_x_per_scale = dequant_s / block_size_x_b;
-    #pragma unroll
-    for (uint32_t i = 0; i < num_block_y; ++i) {
-      #pragma unroll
-      for (uint32_t j = 0; j < num_block_x; ++j) {
-        int block_id = (i * num_block_x + j);
-        auto matB_blk = matB.reg
-            .xetla_select<matB_t::block_elems, 1>(
-                block_id * matB_t::block_elems)
-            .xetla_format<int8_t>();
-        int scale_block_id = (i / block_b_y_per_scale * num_block_x + j);
-        auto scale_vec = scale.reg.xetla_select<scale_t::block_size_x, 1>(
-            scale_block_id * scale_t::block_size_x);
-        auto dst_blk = matB_acc.reg.xetla_select<matB_acc_t::block_elems, 1>(
-            block_id * matB_acc_t::block_elems);
-
-        // 2: int8 includes 2 4bits data.
-        xetla_vector<uint8_t, block_size_x_b * block_size_y_b> cvt_blk;
-
-        xetla_vector<int32_t, block_size_x_b * block_size_y_b> cvt_blk_i32;
-        if constexpr (compute_policy::quant_mode == quant_mode::S4_ASYM) {
-          auto zero_pt_vec = zero_pt.reg
-              .xetla_select<zero_pt_t::block_size_x, 1>(
-                  scale_block_id * zero_pt_t::block_size_x)
-              .xetla_format<uint8_t>();
-          cvt_blk.xetla_select<matB_t::block_elems, 2>(0) = matB_blk & 0x0f;
-          cvt_blk.xetla_select<matB_t::block_elems, 2>(1) = matB_blk >> 4;
-          xetla_vector<uint8_t, block_size_x_b> zero_pt_sub;
-          zero_pt_sub.xetla_select<block_size_x_b / 2, 2>(0) =
-              zero_pt_vec & 0x0f;
-          zero_pt_sub.xetla_select<block_size_x_b / 2, 2>(1) =
-              zero_pt_vec >> 4;
-          xetla_vector<uint8_t, block_size_x_b * block_size_y_b> zero_pt_blk;
-          #pragma unroll
-          for (uint32_t row = 0; row < block_size_y_b; row++) {
-            zero_pt_blk.xetla_select<block_size_x_b, 1>(row * block_size_x_b)
-                .xetla_format<int8_t>() =
-                zero_pt_sub.xetla_format<int8_t>() + int8_t(1);
-          }
-          cvt_blk_i32 =
-              (cvt_blk.xetla_format<int8_t>() -
-               zero_pt_blk.xetla_format<int8_t>());
-        }
-        if constexpr (
-            compute_policy::quant_mode == quant_mode::S4_FULLRANGE_NO_ZP) {
-          xetla_vector<int8_t, block_size_x_b * block_size_y_b> cvt_blk_i8;
-          cvt_blk_i8.xetla_select<matB_t::block_elems, 2>(0) =
-              matB_blk & 0x0f;
-          cvt_blk_i8.xetla_select<matB_t::block_elems, 2>(0) =
-              cvt_blk_i8.xetla_select<matB_t::block_elems, 2>(0) << 4;
-          cvt_blk_i8.xetla_select<matB_t::block_elems, 2>(0) =
-              cvt_blk_i8.xetla_select<matB_t::block_elems, 2>(0) >> 4;
-          cvt_blk_i8.xetla_select<matB_t::block_elems, 2>(1) =
-              matB_blk.xetla_format<int8_t>() >> 4;
-          cvt_blk_i32 = (cvt_blk_i8.xetla_format<int8_t>());
-        }
-        if constexpr (compute_policy::mma_engine == mma_engine::xmx) {
-          constexpr uint32_t vnni_rows =
-              sizeof(uint32_t) / sizeof(dtype_mma_b);
-          xetla_vector<dtype_mma_b, matB_acc_t::block_elems * vnni_rows>
-              temp_blk;
-          temp_blk.xetla_select<matB_acc_t::block_elems, vnni_rows>(0) =
-              cvt_blk_i32;
-
-          #pragma unroll
-          for (uint32_t k = 0; k < block_size_y_b; k += vnni_rows) {
-            #pragma unroll
-            for (uint32_t row = 0; row < vnni_rows; row++) {
-              temp_blk.xetla_select<block_size_x_b, vnni_rows>(
-                  row + block_size_x_b * k * vnni_rows) =
-                  temp_blk.xetla_select<block_size_x_b, vnni_rows>(
-                      (k + row) * block_size_x_b * vnni_rows);
-            }
-          }
-
-          xetla_vector<dtype_scale, block_size_x_b * vnni_rows> scale_blk;
-          #pragma unroll
-          for (uint32_t row = 0; row < vnni_rows; row++) {
-            scale_blk.xetla_select<block_size_x_b, vnni_rows>(row) =
-                scale_vec;
-          }
-
-          #pragma unroll
-          for (uint32_t k = 0; k < block_size_y_b; k += vnni_rows) {
-            dst_blk.xetla_select<block_size_x_b * vnni_rows, 1>(
-                k * block_size_x_b) =
-                temp_blk.xetla_select<block_size_x_b * vnni_rows, 1>(
-                    k * block_size_x_b * vnni_rows) *
-                scale_blk;
-          }
-        } else {
-          #pragma unroll
-          for (uint32_t k = 0; k < block_size_y_b; k++) {
-            dst_blk.xetla_select<block_size_x_b, 1>(k * block_size_x_b) =
-                cvt_blk_i32.xetla_select<block_size_x_b, 1>(
-                    k * block_size_x_b) *
-                scale_vec;
-          }
-        }
-      }
-    }
-  } */
-
   /// @brief Updates tile base descriptor based on the tid.
   __XETLA_API static void update_sg_tile_tdesc(
       arguments_t& args,
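
Note: the dead code removed above implemented the int4 weight dequantization, i.e. unpacking two 4-bit values from each byte (low nibble first) and computing (w - zero_pt) * scale, or (w - 8) * scale in the full-range, no-zero-point mode. The standalone C++ sketch below illustrates only that arithmetic; it deliberately uses plain std::vector and float instead of the kernel's xetla_vector and sycl::half types, and the function name dequant_s4_ref is made up for illustration, not part of the library.

#include <cstdint>
#include <cstdio>
#include <vector>

// Reference dequantization for int4 weights packed two per byte
// (low nibble first, matching the little-endian unpack in the removed code).
// asymmetric == true  : out = (w - zero_pt) * scale   (S4_ASYM-style)
// asymmetric == false : out = (w - 8) * scale         (full-range, no zero-point)
std::vector<float> dequant_s4_ref(
    const std::vector<uint8_t>& packed,
    float scale,
    uint8_t zero_pt,
    bool asymmetric) {
  std::vector<float> out;
  out.reserve(packed.size() * 2);
  for (uint8_t byte : packed) {
    int lo = byte & 0xf;         // lowest 4 bits
    int hi = (byte >> 4) & 0xf;  // highest 4 bits
    int zp = asymmetric ? int(zero_pt & 0xf) : 8;
    out.push_back(float(lo - zp) * scale);
    out.push_back(float(hi - zp) * scale);
  }
  return out;
}

int main() {
  // One packed byte 0x2a holds the weights 10 (low nibble) and 2 (high nibble).
  std::vector<uint8_t> packed = {0x2a};
  for (float v : dequant_s4_ref(packed, 0.5f, 0, /*asymmetric=*/false))
    std::printf("%f\n", v);  // prints 1.0 and -3.0
  return 0;
}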