File tree Expand file tree Collapse file tree 1 file changed +2
-2
lines changed
csrc/fused_moe/cutlass_backend Expand file tree Collapse file tree 1 file changed +2
-2
lines changed Original file line number Diff line number Diff line change @@ -1457,7 +1457,7 @@ __host__ __device__ constexpr static U arrayConvert(T const& input) {
14571457// (k-1)*rows_in_input all map to row 0 in the original matrix. Thus, to know where to read in the
14581458// source matrix, we simply take the modulus of the expanded index.
14591459
1460- constexpr static int EXPAND_THREADS_PER_BLOCK = 256 ;
1460+ constexpr static int EXPAND_THREADS_PER_BLOCK = 128 ;
14611461
14621462template <class InputActivationsType , class ExpandedActivationsType ,
14631463 TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType BlockScalingType,
@@ -1697,7 +1697,7 @@ void expandInputRowsKernelLauncher(
16971697
16981698 static int64_t const smCount = tensorrt_llm::common::getMultiProcessorCount ();
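16991699 // Note: Launching 8 blocks of 256 threads per SM was found to fully leverage the memory bandwidth (tested on B200). With EXPAND_THREADS_PER_BLOCK = 128, the launcher uses 16 blocks per SM, i.e. the same 2048 threads per SM.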
1700- int64_t const blocks = std::min (smCount * 8 , std::max (num_rows * k, num_padding_tokens));
1700+ int64_t const blocks = std::min (smCount * 16 , std::max (num_rows * k, num_padding_tokens));
17011701 int64_t const threads = EXPAND_THREADS_PER_BLOCK;
17021702
17031703 auto func = [&]() {
You can’t perform that action at this time.
0 commit comments