@@ -696,7 +696,41 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl(
696696 const auto &sg_sizes = d.get_info <sycl::info::device::sub_group_sizes>();
697697 size_t wg = choose_workgroup_size<4 >(reduction_nelems, sg_sizes);
698698
699- {
699+ if (reduction_nelems < wg) {
700+ sycl::event comp_ev = exec_q.submit ([&](sycl::handler &cgh) {
701+ cgh.depends_on (depends);
702+
703+ using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
704+ using InputOutputIterIndexerT =
705+ dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
706+ NoOpIndexerT, NoOpIndexerT>;
707+ using ReductionIndexerT =
708+ dpctl::tensor::offset_utils::Strided1DIndexer;
709+
710+ InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
711+ NoOpIndexerT{}};
712+ ReductionIndexerT reduction_indexer{
713+ 0 , static_cast <py::ssize_t >(reduction_nelems),
714+ static_cast <py::ssize_t >(iter_nelems)};
715+
716+ using KernelName =
717+ class reduction_seq_contig_krn <argTy, resTy, ReductionOpT,
718+ InputOutputIterIndexerT,
719+ ReductionIndexerT>;
720+
721+ sycl::range<1 > iter_range{iter_nelems};
722+
723+ cgh.parallel_for <KernelName>(
724+ iter_range,
725+ SequentialReduction<argTy, resTy, ReductionOpT,
726+ InputOutputIterIndexerT, ReductionIndexerT>(
727+ arg_tp, res_tp, ReductionOpT (), identity_val,
728+ in_out_iter_indexer, reduction_indexer, reduction_nelems));
729+ });
730+
731+ return comp_ev;
732+ }
733+ else {
700734 sycl::event res_init_ev = exec_q.fill <resTy>(
701735 res_tp, resTy (identity_val), iter_nelems, depends);
702736
@@ -1849,6 +1883,41 @@ sycl::event reduction_axis0_over_group_temps_contig_impl(
18491883 const auto &sg_sizes = d.get_info <sycl::info::device::sub_group_sizes>();
18501884 size_t wg = choose_workgroup_size<4 >(reduction_nelems, sg_sizes);
18511885
1886+ if (reduction_nelems < wg) {
1887+ sycl::event comp_ev = exec_q.submit ([&](sycl::handler &cgh) {
1888+ cgh.depends_on (depends);
1889+
1890+ using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
1891+ using InputOutputIterIndexerT =
1892+ dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
1893+ NoOpIndexerT, NoOpIndexerT>;
1894+ using ReductionIndexerT =
1895+ dpctl::tensor::offset_utils::Strided1DIndexer;
1896+
1897+ InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
1898+ NoOpIndexerT{}};
1899+ ReductionIndexerT reduction_indexer{
1900+ 0 , static_cast <py::ssize_t >(reduction_nelems),
1901+ static_cast <py::ssize_t >(iter_nelems)};
1902+
1903+ using KernelName =
1904+ class reduction_seq_contig_krn <argTy, resTy, ReductionOpT,
1905+ InputOutputIterIndexerT,
1906+ ReductionIndexerT>;
1907+
1908+ sycl::range<1 > iter_range{iter_nelems};
1909+
1910+ cgh.parallel_for <KernelName>(
1911+ iter_range,
1912+ SequentialReduction<argTy, resTy, ReductionOpT,
1913+ InputOutputIterIndexerT, ReductionIndexerT>(
1914+ arg_tp, res_tp, ReductionOpT (), identity_val,
1915+ in_out_iter_indexer, reduction_indexer, reduction_nelems));
1916+ });
1917+
1918+ return comp_ev;
1919+ }
1920+
18521921 constexpr size_t preferred_reductions_per_wi = 8 ;
18531922 // max_max_wg prevents running out of resources on CPU
18541923 constexpr size_t max_max_wg = 2048 ;
0 commit comments