@@ -47,9 +47,9 @@ struct MaxOp<half> {
     }
 };

-template <typename T, template <typename> class ReduceOp, int thread_group_width = 32>
+template <typename T, template <typename> class ReduceOp, int THREAD_GROUP_WIDTH = 32>
 __device__ __forceinline__ T warpReduce(T value) {
-    for (int mask = thread_group_width / 2; mask > 0; mask /= 2) {
+    for (int mask = THREAD_GROUP_WIDTH / 2; mask > 0; mask /= 2) {
         value = ReduceOp<T>()(value, __shfl_xor_sync(0xffffffff, value, mask));
     }
     return value;
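The XOR-shuffle loop above is a butterfly reduction: each round halves the exchange distance, and after log2(width) rounds every lane in the group holds the reduced value. A minimal usage sketch, assuming the `MaxOp` functor template defined earlier in this file (the `demo_warp_max` kernel itself is illustrative, not part of this PR):

```cuda
// Illustrative only: one warp of 32 lanes reduces 32 floats to their maximum.
__global__ void demo_warp_max(const float *in, float *out) {
    float v = in[threadIdx.x];        // one value per lane
    v = warpReduce<float, MaxOp>(v);  // after the butterfly, every lane holds the max
    if (threadIdx.x == 0) {
        *out = v;                     // any lane could write; lane 0 by convention
    }
}
```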
@@ -73,23 +73,23 @@ __device__ __forceinline__ T warpReduce(T value) {
 That is, we also need i and j:
 i is (blockIdx.x * blockDim.y + threadIdx.y) / stride
 j is (blockIdx.x * blockDim.y + threadIdx.y) % stride
-Converting i to a linear offset then gives i * stride * dimsize,
+Converting i to a linear offset then gives i * stride * dim_size,
 and j is simply added on top.
 */
-template <int elemPerThread, int BLOCK_DIM_Y, int BLOCK_DIM_X, typename T>
-__global__ void Softmax_warp_impl(const T *x, T *y, int stride, int dimsize, int otherdim_size) {
-    float dataPerThread[elemPerThread];
+template <int ELEM_PER_THREAD, int BLOCK_DIM_Y, int BLOCK_DIM_X, typename T>
+__global__ void Softmax_warp_impl(const T *x, T *y, int stride, int dim_size, int other_size) {
+    float dataPerThread[ELEM_PER_THREAD];
     int global_warp_id = blockIdx.x * blockDim.y + threadIdx.y;
-    int group_offset = global_warp_id % stride + (global_warp_id - global_warp_id % stride) * dimsize;
+    int group_offset = global_warp_id % stride + (global_warp_id - global_warp_id % stride) * dim_size;
     int tid = threadIdx.x;
-    if (global_warp_id >= otherdim_size) {
+    if (global_warp_id >= other_size) {
         return;
     }
     __shared__ float group_max[BLOCK_DIM_X];
     __shared__ float group_sum[BLOCK_DIM_X];
     float thread_max = -INFINITY;
     float thread_sum = 0.0f;
-    for (int i = 0; tid + i * BLOCK_DIM_X < dimsize; i++) {
+    for (int i = 0; tid + i * BLOCK_DIM_X < dim_size; i++) {
         dataPerThread[i] = static_cast<float>(x[(tid + i * BLOCK_DIM_X) * stride + group_offset]);
         thread_max = max(thread_max, dataPerThread[i]);
     }
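The index math in the comment above is easiest to verify with concrete numbers. A hedged host-side check, assuming an illustrative shape of [2, 4, 3] with softmax over the middle axis (so dim_size = 4, stride = 3, other_size = 6):

```cpp
#include <cassert>

int main() {
    int stride = 3, dim_size = 4;  // shape [2, 4, 3], softmax over axis 1
    int gid = 5;                   // example global_warp_id; i = 5/3 = 1, j = 5%3 = 2
    int group_offset = gid % stride + (gid - gid % stride) * dim_size;
    // Same value as the comment's i * stride * dim_size + j: 1*3*4 + 2 = 14.
    assert(group_offset == (gid / stride) * stride * dim_size + gid % stride);
    // Element k of this row sits at k * stride + group_offset: 14, 17, 20, 23,
    // matching the row-major linear index (i * dim_size + k) * stride + j.
    for (int k = 0; k < dim_size; ++k) {
        assert(k * stride + group_offset
               == ((gid / stride) * dim_size + k) * stride + gid % stride);
    }
    return 0;
}
```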
@@ -99,7 +99,7 @@ __global__ void Softmax_warp_impl(const T *x, T *y, int stride, int dimsize, int
         group_max[threadIdx.y] = thread_max;
     }

-    for (int i = 0; tid + i * BLOCK_DIM_X < dimsize; i++) {
+    for (int i = 0; tid + i * BLOCK_DIM_X < dim_size; i++) {
         dataPerThread[i] = __expf(dataPerThread[i] - group_max[threadIdx.y]);
         thread_sum += dataPerThread[i];
     }
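Subtracting the row max before exponentiating is what makes this a safe softmax: softmax(x) == softmax(x - c) for any constant c, and choosing c = max(x) keeps every `__expf` argument at or below zero, so each term stays in (0, 1] instead of overflowing (expf of a logit around 89 is already inf in float32). A minimal sketch of the shifted term:

```cuda
// Sketch only: v - row_max <= 0 for every v in the row, so the
// exponential below is bounded by 1 and can never overflow.
__device__ __forceinline__ float safe_exp_term(float v, float row_max) {
    return __expf(v - row_max); // in (0, 1]
}
```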
@@ -109,18 +109,18 @@ __global__ void Softmax_warp_impl(const T *x, T *y, int stride, int dimsize, int
         group_sum[threadIdx.y] = thread_sum;
     }

-    for (int i = 0; tid + i * BLOCK_DIM_X < dimsize; i++) {
+    for (int i = 0; tid + i * BLOCK_DIM_X < dim_size; i++) {
         y[(tid + i * BLOCK_DIM_X) * stride + group_offset] = static_cast<T>(dataPerThread[i] * __fdividef(1.0f, group_sum[threadIdx.y]));
     }
 }

-template <int elemPerThread, int BLOCK_DIM, typename T>
+template <int ELEM_PER_THREAD, int BLOCK_DIM, typename T>
 __launch_bounds__(BLOCK_DIM)
-__global__ void Softmax_block_impl(const T *x, T *y, int stride, int dimsize, int otherdim_size) {
-    // remain = dimsize - BLOCK_DIM * elemPerThread
+__global__ void Softmax_block_impl(const T *x, T *y, int stride, int dim_size, int other_size) {
+    // remain = dim_size - (BLOCK_DIM - 1) * ELEM_PER_THREAD
     int tid = threadIdx.x;
-    int block_offset = (blockIdx.x - blockIdx.x % stride) * dimsize + blockIdx.x % stride;
-    int remain = dimsize - (BLOCK_DIM - 1) * elemPerThread; // 🔧 fix: number of elements the last thread handles
+    int block_offset = (blockIdx.x - blockIdx.x % stride) * dim_size + blockIdx.x % stride;
+    int remain = dim_size - (BLOCK_DIM - 1) * ELEM_PER_THREAD;

     MD md_partial;
     md_partial.max = -INFINITY;
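`MD` carries a running (max, sum) pair, the online-softmax trick: when two partials merge, each sum is rescaled to the joint maximum so the invariant sum == Σ exp(x_k − max) is preserved across the `reduce_for_md` calls below. `MD` and `reduce_for_md` are defined elsewhere in this file; the sketch here shows the usual formulation and is an assumption about their exact shape:

```cuda
// Hedged sketch of the merge step; names are illustrative stand-ins
// for the real MD / reduce_for_md defined elsewhere in this file.
struct MD_sketch {
    float max;
    float sum;
};

__device__ __forceinline__ MD_sketch merge_md(MD_sketch a, MD_sketch b) {
    MD_sketch out;
    out.max = fmaxf(a.max, b.max);
    // Rescale each partial sum to the joint max before adding.
    out.sum = a.sum * __expf(a.max - out.max) + b.sum * __expf(b.max - out.max);
    return out;
}
```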
@@ -129,16 +129,16 @@ __launch_bounds__(BLOCK_DIM)
     // tid = [0, BLOCK_DIM - 1], so the last thread handles the remainder
     if (tid < BLOCK_DIM - 1) {
 #pragma unroll
-        for (int i = 0; i < elemPerThread; i++) {
-            int index = (tid * elemPerThread + i) * stride + block_offset;
+        for (int i = 0; i < ELEM_PER_THREAD; i++) {
+            int index = (tid * ELEM_PER_THREAD + i) * stride + block_offset;
             input.max = static_cast<float>(x[index]);
             input.sum = 1.0f;
             md_partial = reduce_for_md(md_partial, input);
         }
     } else {
 #pragma unroll
         for (int i = 0; i < remain; i++) {
-            int index = ((BLOCK_DIM - 1) * elemPerThread + i) * stride + block_offset;
+            int index = ((BLOCK_DIM - 1) * ELEM_PER_THREAD + i) * stride + block_offset;
             input.max = static_cast<float>(x[index]);
             input.sum = 1.0f;
             md_partial = reduce_for_md(md_partial, input);
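The split gives threads 0 through BLOCK_DIM − 2 exactly ELEM_PER_THREAD elements each and leaves the tail to the last thread. A compile-time sanity check with illustrative numbers (not part of the PR):

```cpp
// Illustrative: BLOCK_DIM = 1024, ELEM_PER_THREAD = 2, dim_size = 2047.
// Threads 0..1022 cover 1023 * 2 = 2046 elements; the last thread takes the rest.
static_assert(2047 - (1024 - 1) * 2 == 1, "remain: the last thread covers the tail");
```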
@@ -153,85 +153,94 @@ __launch_bounds__(BLOCK_DIM)
     }
     __syncthreads();
     if (tid < BLOCK_DIM - 1) {
-        for (int i = 0; i < elemPerThread; i++) {
-            int index = (tid * elemPerThread + i) * stride + block_offset;
+        for (int i = 0; i < ELEM_PER_THREAD; i++) {
+            int index = (tid * ELEM_PER_THREAD + i) * stride + block_offset;
             y[index] = static_cast<T>(__expf(static_cast<float>(x[index]) - md_total.max) * __fdividef(1.0f, md_total.sum));
         }
     } else {
         for (int i = 0; i < remain; i++) {
-            int index = ((BLOCK_DIM - 1) * elemPerThread + i) * stride + block_offset;
+            int index = ((BLOCK_DIM - 1) * ELEM_PER_THREAD + i) * stride + block_offset;
             y[index] = static_cast<T>(__expf(static_cast<float>(x[index]) - md_total.max) * __fdividef(1.0f, md_total.sum));
         }
     }
 }

 template <typename T>
-infiniStatus_t softmax_dispatch(const op::softmax::SoftmaxInfo &info, void *y, const void *x, void *stream) {
-    int dimsize = info.dimsize;
-    int stride = info.stride;
-    int otherdim_size = info.otherdim_size;
-    if (dimsize <= 1024) {
-        dim3 block(32, 32); // BLOCK_DIM_X=32, BLOCK_DIM_Y=4
-        int num_blocks = (otherdim_size + block.y - 1) / block.y;
-        dim3 grid(num_blocks, 1, 1);
-        int elemPerThread = (dimsize + 31) / 32; // number of elements each thread processes
-        elemPerThread = min(elemPerThread, 32);  // cap the maximum
+void dispatchSoftmaxKernel(
+    const void *x, void *y,
+    int stride, int dim_size, int other_size,
+    void *stream, bool use_warp_impl) {
+
+    int elemPerThread;
+    dim3 grid, block;
+
+    if (use_warp_impl) {
+        block = dim3(32, 32);
+        grid = dim3((other_size + block.y - 1) / block.y, 1, 1);
+        elemPerThread = min((dim_size + 31) / 32, 32);
+
+#define LAUNCH_WARP_KERNEL(ELEM_PER_THREAD)                           \
+    Softmax_warp_impl<ELEM_PER_THREAD, 32, 32, T>                     \
+        <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>( \
+            reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), \
+            stride, dim_size, other_size)
+
         if (elemPerThread <= 1) {
-            Softmax_warp_impl<1, 32, 32, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_WARP_KERNEL(1);
         } else if (elemPerThread <= 2) {
-            Softmax_warp_impl<2, 32, 32, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_WARP_KERNEL(2);
         } else if (elemPerThread <= 4) {
-            Softmax_warp_impl<4, 32, 32, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_WARP_KERNEL(4);
         } else if (elemPerThread <= 8) {
-            Softmax_warp_impl<8, 32, 32, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_WARP_KERNEL(8);
         } else if (elemPerThread <= 16) {
-            Softmax_warp_impl<16, 32, 32, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_WARP_KERNEL(16);
         } else {
-            Softmax_warp_impl<32, 32, 32, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_WARP_KERNEL(32);
         }
-    } else if (dimsize > 1024) {
-        int block_size = 1024;
-        int elemPerThread = (dimsize + block_size - 1) / block_size; // number of elements each thread processes
-        elemPerThread = min(elemPerThread, 32); // cap at 32
-        dim3 block(block_size);
-        dim3 grid(otherdim_size);
+
+#undef LAUNCH_WARP_KERNEL
+
+    } else {
+        // Block implementation for dim_size > 1024
+        constexpr int BLOCK_SIZE = 1024;
+        block = dim3(BLOCK_SIZE);
+        grid = dim3(other_size);
+        elemPerThread = min((dim_size + BLOCK_SIZE - 1) / BLOCK_SIZE, 32);
+
+#define LAUNCH_BLOCK_KERNEL(ELEM_PER_THREAD)                          \
+    Softmax_block_impl<ELEM_PER_THREAD, BLOCK_SIZE, T>                \
+        <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>( \
+            reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), \
+            stride, dim_size, other_size)
+
         if (elemPerThread <= 1) {
-            Softmax_block_impl<1, 1024, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_BLOCK_KERNEL(1);
         } else if (elemPerThread <= 2) {
-            Softmax_block_impl<2, 1024, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_BLOCK_KERNEL(2);
         } else if (elemPerThread <= 4) {
-            Softmax_block_impl<4, 1024, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_BLOCK_KERNEL(4);
         } else if (elemPerThread <= 8) {
-            Softmax_block_impl<8, 1024, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_BLOCK_KERNEL(8);
         } else if (elemPerThread <= 16) {
-            Softmax_block_impl<16, 1024, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_BLOCK_KERNEL(16);
         } else {
-            Softmax_block_impl<32, 1024, T>
-                <<<grid, block, 0, reinterpret_cast<cudaStream_t>(stream)>>>(
-                    reinterpret_cast<const T *>(x), reinterpret_cast<T *>(y), stride, dimsize, otherdim_size);
+            LAUNCH_BLOCK_KERNEL(32);
         }
+
+#undef LAUNCH_BLOCK_KERNEL
+    }
+}
+
+template <typename T>
+infiniStatus_t softmax_dispatch(const op::softmax::SoftmaxInfo &info, void *y, const void *x, void *stream) {
+    int dim_size = info.dim_size;
+    int stride = info.stride;
+    int other_size = info.other_size;
+    if (dim_size <= 1024) {
+        dispatchSoftmaxKernel<T>(x, y, stride, dim_size, other_size, stream, true);
+    } else if (dim_size > 1024) {
+        dispatchSoftmaxKernel<T>(x, y, stride, dim_size, other_size, stream, false);
     }
     return INFINI_STATUS_SUCCESS;
 }
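For quick validation of either kernel, a plain CPU reference that walks the same strided layout is handy. A hedged sketch, assuming float data; the helper name and signature are illustrative, not part of this PR:

```cpp
#include <cmath>

// Reference softmax over the strided layout used above: element k of row r
// lives at k * stride + offset, with offset derived exactly as in the kernels.
void softmax_ref(const float *x, float *y, int stride, int dim_size, int other_size) {
    for (int r = 0; r < other_size; ++r) {
        int offset = r % stride + (r - r % stride) * dim_size;
        float m = -INFINITY;
        for (int k = 0; k < dim_size; ++k) {
            m = std::fmax(m, x[k * stride + offset]);
        }
        float s = 0.0f;
        for (int k = 0; k < dim_size; ++k) {
            s += std::exp(x[k * stride + offset] - m);
        }
        for (int k = 0; k < dim_size; ++k) {
            y[k * stride + offset] = std::exp(x[k * stride + offset] - m) / s;
        }
    }
}
```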