@@ -777,7 +777,10 @@ accumulate_1d_contig_impl(sycl::queue &q,
777777 }
778778 else {
779779 constexpr nwiT n_wi_for_gpu = 4 ;
780- const std::size_t wg_size = 256 ;
780+ // base_scan_striped algorithm does not execute correctly
781+ // on HIP device with wg_size > 64
782+ const std::size_t wg_size =
783+ (q.get_backend () == sycl::backend::ext_oneapi_hip) ? 64 : 256 ;
781784 comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_gpu, NoOpIndexerT,
782785 transformerT, AccumulateOpT,
783786 include_initial>(
@@ -1181,7 +1184,10 @@ accumulate_strided_impl(sycl::queue &q,
11811184 }
11821185 else {
11831186 constexpr nwiT n_wi_for_gpu = 4 ;
1184- const std::size_t wg_size = 256 ;
1187+ // base_scan_striped algorithm does not execute correctly
1188+ // on HIP device with wg_size > 64
1189+ const std::size_t wg_size =
1190+ (q.get_backend () == sycl::backend::ext_oneapi_hip) ? 64 : 256 ;
11851191 comp_ev =
11861192 inclusive_scan_iter<srcT, dstT, n_wi_for_gpu, InpIndexerT,
11871193 OutIndexerT, InpIndexerT, OutIndexerT,
@@ -1235,7 +1241,10 @@ std::size_t cumsum_val_contig_impl(sycl::queue &q,
12351241 }
12361242 else {
12371243 constexpr nwiT n_wi_for_gpu = 4 ;
1238- const std::size_t wg_size = 256 ;
1244+ // base_scan_striped algorithm does not execute correctly
1245+ // on HIP device with wg_size > 64
1246+ const std::size_t wg_size =
1247+ (q.get_backend () == sycl::backend::ext_oneapi_hip) ? 64 : 256 ;
12391248 comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
12401249 NoOpIndexerT, transformerT,
12411250 AccumulateOpT, include_initial>(
@@ -1346,7 +1355,10 @@ cumsum_val_strided_impl(sycl::queue &q,
13461355 }
13471356 else {
13481357 constexpr nwiT n_wi_for_gpu = 4 ;
1349- const std::size_t wg_size = 256 ;
1358+ // base_scan_striped algorithm does not execute correctly
1359+ // on HIP device with wg_size > 64
1360+ const std::size_t wg_size =
1361+ (q.get_backend () == sycl::backend::ext_oneapi_hip) ? 64 : 256 ;
13501362 comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
13511363 StridedIndexerT, transformerT,
13521364 AccumulateOpT, include_initial>(
0 commit comments