@@ -376,26 +376,24 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
376376 CallInst *const SetInactive =
377377 B.CreateIntrinsic (Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
378378
379- CallInst * const FirstDPP =
379+ ExclScan =
380380 B.CreateIntrinsic (Intrinsic::amdgcn_update_dpp, Ty,
381381 {Identity, SetInactive, B.getInt32 (DPP_WF_SR1),
382382 B.getInt32 (0xf ), B.getInt32 (0xf ), B.getFalse ()});
383- ExclScan = FirstDPP;
384383
385- const unsigned Iters = 7 ;
386- const unsigned DPPCtrl[Iters] = {
387- DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,
388- DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
389- const unsigned RowMask[Iters] = {0xf , 0xf , 0xf , 0xf , 0xf , 0xa , 0xc };
390- const unsigned BankMask[Iters] = {0xf , 0xf , 0xf , 0xe , 0xc , 0xf , 0xf };
384+ const unsigned Iters = 6 ;
385+ const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2,
386+ DPP_ROW_SR4, DPP_ROW_SR8 ,
387+ DPP_ROW_BCAST15, DPP_ROW_BCAST31};
388+ const unsigned RowMask[Iters] = {0xf , 0xf , 0xf , 0xf , 0xa , 0xc };
389+ const unsigned BankMask[Iters] = {0xf , 0xf , 0xe , 0xc , 0xf , 0xf };
391390
392391 // This loop performs an exclusive scan across the wavefront, with all lanes
393392 // active (by using the WWM intrinsic).
394393 for (unsigned Idx = 0 ; Idx < Iters; Idx++) {
395- Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan;
396394 CallInst *const DPP = B.CreateIntrinsic (
397395 Intrinsic::amdgcn_update_dpp, Ty,
398- {Identity, UpdateValue , B.getInt32 (DPPCtrl[Idx]),
396+ {Identity, ExclScan , B.getInt32 (DPPCtrl[Idx]),
399397 B.getInt32 (RowMask[Idx]), B.getInt32 (BankMask[Idx]), B.getFalse ()});
400398
401399 ExclScan = buildNonAtomicBinOp (B, Op, ExclScan, DPP);
0 commit comments