@@ -245,8 +245,8 @@ class WaitcntBrackets {
245245 const SIRegisterInfo *TRI, unsigned OpNo) const ;
246246
247247 bool counterOutOfOrder (InstCounterType T) const ;
248- bool simplifyWaitcnt (AMDGPU::Waitcnt &Wait) const ;
249- bool simplifyWaitcnt (InstCounterType T, unsigned &Count) const ;
248+ void simplifyWaitcnt (AMDGPU::Waitcnt &Wait) const ;
249+ void simplifyWaitcnt (InstCounterType T, unsigned &Count) const ;
250250 void determineWait (InstCounterType T, unsigned ScoreToWait,
251251 AMDGPU::Waitcnt &Wait) const ;
252252 void applyWaitcnt (const AMDGPU::Waitcnt &Wait);
@@ -418,7 +418,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
418418 }
419419
420420 if (DebugCounter::isCounterSet (ForceLgkmCounter) &&
421- DebugCounter::shouldExecute (ForceLgkmCounter)) {
421+ DebugCounter::shouldExecute (ForceLgkmCounter)) {
422422 ForceEmitWaitcnt[LGKM_CNT] = true ;
423423 } else {
424424 ForceEmitWaitcnt[LGKM_CNT] = false ;
@@ -442,6 +442,9 @@ class SIInsertWaitcnts : public MachineFunctionPass {
442442 WaitcntBrackets *ScoreBrackets);
443443 bool insertWaitcntInBlock (MachineFunction &MF, MachineBasicBlock &Block,
444444 WaitcntBrackets &ScoreBrackets);
445+ bool applyPreexistingWaitcnt (WaitcntBrackets &ScoreBrackets,
446+ MachineInstr &OldWaitcntInstr,
447+ AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
445448};
446449
447450} // end anonymous namespace
@@ -708,22 +711,23 @@ void WaitcntBrackets::print(raw_ostream &OS) {
708711
709712// / Simplify the waitcnt, in the sense of removing redundant counts: a count that
710713// / is already satisfied is reset to ~0u, meaning no wait on that counter.
711- bool WaitcntBrackets::simplifyWaitcnt (AMDGPU::Waitcnt &Wait) const {
712- return simplifyWaitcnt (VM_CNT, Wait.VmCnt ) |
713- simplifyWaitcnt (EXP_CNT, Wait.ExpCnt ) |
714- simplifyWaitcnt (LGKM_CNT, Wait.LgkmCnt ) |
715- simplifyWaitcnt (VS_CNT, Wait.VsCnt );
714+ void WaitcntBrackets::simplifyWaitcnt (AMDGPU::Waitcnt &Wait) const {
715+ simplifyWaitcnt (VM_CNT, Wait.VmCnt );
716+ simplifyWaitcnt (EXP_CNT, Wait.ExpCnt );
717+ simplifyWaitcnt (LGKM_CNT, Wait.LgkmCnt );
718+ simplifyWaitcnt (VS_CNT, Wait.VsCnt );
716719}
717720
718- bool WaitcntBrackets::simplifyWaitcnt (InstCounterType T,
721+ void WaitcntBrackets::simplifyWaitcnt (InstCounterType T,
719722 unsigned &Count) const {
720723 const unsigned LB = getScoreLB (T);
721724 const unsigned UB = getScoreUB (T);
722- if (Count < UB && UB - Count > LB)
723- return true ;
724725
725- Count = ~0u ;
726- return false ;
726+ // The number of outstanding events for this type, T, can be calculated
727+ // as (UB - LB). If the current Count is greater than or equal to the number
728+ // of outstanding events, then the wait for this counter is redundant.
729+ if (Count >= UB - LB)
730+ Count = ~0u ;
727731}
728732
729733void WaitcntBrackets::determineWait (InstCounterType T, unsigned ScoreToWait,
@@ -798,6 +802,107 @@ FunctionPass *llvm::createSIInsertWaitcntsPass() {
798802 return new SIInsertWaitcnts ();
799803}
800804
805+ // / Combine consecutive waitcnt instructions that precede \p MI and follow
806+ // / \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
807+ // / by previous passes. Currently this pass conservatively assumes that these
808+ // / preexisting waitcnt are required for correctness.
809+ bool SIInsertWaitcnts::applyPreexistingWaitcnt (WaitcntBrackets &ScoreBrackets,
810+ MachineInstr &OldWaitcntInstr,
811+ AMDGPU::Waitcnt &Wait,
812+ const MachineInstr *MI) {
813+ bool Modified = false ;
814+ MachineInstr *WaitcntInstr = nullptr ;
815+ MachineInstr *WaitcntVsCntInstr = nullptr ;
816+ for (auto II = OldWaitcntInstr.getIterator (), NextI = std::next (II);
817+ &*II != MI; II = NextI, ++NextI) {
818+ if (II->isMetaInstruction ())
819+ continue ;
820+
821+ if (II->getOpcode () == AMDGPU::S_WAITCNT) {
822+ // Conservatively update required wait if this waitcnt was added in an
823+ // earlier pass. In this case it will not exist in the tracked waitcnt
824+ // set.
825+ if (!TrackedWaitcntSet.count (&*II)) {
826+ unsigned IEnc = II->getOperand (0 ).getImm ();
827+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt (IV, IEnc);
828+ Wait = Wait.combined (OldWait);
829+ }
830+
831+ // Merge consecutive waitcnt of the same type by erasing multiples.
832+ if (!WaitcntInstr) {
833+ WaitcntInstr = &*II;
834+ } else {
835+ II->eraseFromParent ();
836+ Modified = true ;
837+ }
838+
839+ } else {
840+ assert (II->getOpcode () == AMDGPU::S_WAITCNT_VSCNT);
841+ assert (II->getOperand (0 ).getReg () == AMDGPU::SGPR_NULL);
842+ if (!TrackedWaitcntSet.count (&*II)) {
843+ unsigned OldVSCnt =
844+ TII->getNamedOperand (*II, AMDGPU::OpName::simm16)->getImm ();
845+ Wait.VsCnt = std::min (Wait.VsCnt , OldVSCnt);
846+ }
847+
848+ if (!WaitcntVsCntInstr) {
849+ WaitcntVsCntInstr = &*II;
850+ } else {
851+ II->eraseFromParent ();
852+ Modified = true ;
853+ }
854+ }
855+ }
856+
857+ // Updated encoding of merged waitcnt with the required wait.
858+ if (WaitcntInstr) {
859+ if (Wait.hasWaitExceptVsCnt ()) {
860+ unsigned NewEnc = AMDGPU::encodeWaitcnt (IV, Wait);
861+ unsigned OldEnc = WaitcntInstr->getOperand (0 ).getImm ();
862+ if (OldEnc != NewEnc) {
863+ WaitcntInstr->getOperand (0 ).setImm (NewEnc);
864+ Modified = true ;
865+ }
866+ ScoreBrackets.applyWaitcnt (Wait);
867+ Wait.VmCnt = ~0u ;
868+ Wait.LgkmCnt = ~0u ;
869+ Wait.ExpCnt = ~0u ;
870+
871+ LLVM_DEBUG (dbgs () << " generateWaitcntInstBefore\n "
872+ << " Old Instr: " << MI << " New Instr: " << *WaitcntInstr
873+ << ' \n ' );
874+ } else {
875+ WaitcntInstr->eraseFromParent ();
876+ Modified = true ;
877+ }
878+ }
879+
880+ if (WaitcntVsCntInstr) {
881+ if (Wait.hasWaitVsCnt ()) {
882+ assert (ST->hasVscnt ());
883+ unsigned OldVSCnt =
884+ TII->getNamedOperand (*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
885+ ->getImm ();
886+ if (Wait.VsCnt != OldVSCnt) {
887+ TII->getNamedOperand (*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
888+ ->setImm (Wait.VsCnt );
889+ Modified = true ;
890+ }
891+ ScoreBrackets.applyWaitcnt (Wait);
892+ Wait.VsCnt = ~0u ;
893+
894+ LLVM_DEBUG (dbgs () << " generateWaitcntInstBefore\n "
895+ << " Old Instr: " << MI
896+ << " New Instr: " << *WaitcntVsCntInstr << ' \n ' );
897+ } else {
898+ WaitcntVsCntInstr->eraseFromParent ();
899+ Modified = true ;
900+ }
901+ }
902+
903+ return Modified;
904+ }
905+
801906static bool readsVCCZ (const MachineInstr &MI) {
802907 unsigned Opc = MI.getOpcode ();
803908 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
@@ -833,12 +938,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
833938 MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
834939 MachineInstr *OldWaitcntInstr) {
835940 setForceEmitWaitcnt ();
836- bool IsForceEmitWaitcnt = isForceEmitWaitcnt ();
837941
838942 if (MI.isMetaInstruction ())
839943 return false ;
840944
841945 AMDGPU::Waitcnt Wait;
946+ bool Modified = false ;
842947
843948 // See if this instruction has a forced S_WAITCNT VM.
844949 // TODO: Handle other cases of NeedsWaitcntVmBefore()
@@ -1053,32 +1158,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
10531158 }
10541159 }
10551160
1056- // Early-out if no wait is indicated.
1057- if (!ScoreBrackets.simplifyWaitcnt (Wait) && !IsForceEmitWaitcnt) {
1058- bool Modified = false ;
1059- if (OldWaitcntInstr) {
1060- for (auto II = OldWaitcntInstr->getIterator (), NextI = std::next (II);
1061- &*II != &MI; II = NextI, ++NextI) {
1062- if (II->isDebugInstr ())
1063- continue ;
1064-
1065- if (TrackedWaitcntSet.count (&*II)) {
1066- TrackedWaitcntSet.erase (&*II);
1067- II->eraseFromParent ();
1068- Modified = true ;
1069- } else if (II->getOpcode () == AMDGPU::S_WAITCNT) {
1070- int64_t Imm = II->getOperand (0 ).getImm ();
1071- ScoreBrackets.applyWaitcnt (AMDGPU::decodeWaitcnt (IV, Imm));
1072- } else {
1073- assert (II->getOpcode () == AMDGPU::S_WAITCNT_VSCNT);
1074- assert (II->getOperand (0 ).getReg () == AMDGPU::SGPR_NULL);
1075- auto W = TII->getNamedOperand (*II, AMDGPU::OpName::simm16)->getImm ();
1076- ScoreBrackets.applyWaitcnt (AMDGPU::Waitcnt (~0u , ~0u , ~0u , W));
1077- }
1078- }
1079- }
1080- return Modified;
1081- }
1161+ // Verify that the wait is actually needed.
1162+ ScoreBrackets.simplifyWaitcnt (Wait);
10821163
10831164 if (ForceEmitZeroWaitcnts)
10841165 Wait = AMDGPU::Waitcnt::allZero (ST->hasVscnt ());
@@ -1092,57 +1173,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
10921173 if (ForceEmitWaitcnt[VS_CNT])
10931174 Wait.VsCnt = 0 ;
10941175
1095- ScoreBrackets.applyWaitcnt (Wait);
1096-
1097- AMDGPU::Waitcnt OldWait;
1098- bool Modified = false ;
1099-
11001176 if (OldWaitcntInstr) {
1101- for (auto II = OldWaitcntInstr->getIterator (), NextI = std::next (II);
1102- &*II != &MI; II = NextI, NextI++) {
1103- if (II->isDebugInstr ())
1104- continue ;
1105-
1106- if (II->getOpcode () == AMDGPU::S_WAITCNT) {
1107- unsigned IEnc = II->getOperand (0 ).getImm ();
1108- AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt (IV, IEnc);
1109- OldWait = OldWait.combined (IWait);
1110- if (!TrackedWaitcntSet.count (&*II))
1111- Wait = Wait.combined (IWait);
1112- unsigned NewEnc = AMDGPU::encodeWaitcnt (IV, Wait);
1113- if (IEnc != NewEnc) {
1114- II->getOperand (0 ).setImm (NewEnc);
1115- Modified = true ;
1116- }
1117- Wait.VmCnt = ~0u ;
1118- Wait.LgkmCnt = ~0u ;
1119- Wait.ExpCnt = ~0u ;
1120- } else {
1121- assert (II->getOpcode () == AMDGPU::S_WAITCNT_VSCNT);
1122- assert (II->getOperand (0 ).getReg () == AMDGPU::SGPR_NULL);
1123-
1124- unsigned ICnt = TII->getNamedOperand (*II, AMDGPU::OpName::simm16)
1125- ->getImm ();
1126- OldWait.VsCnt = std::min (OldWait.VsCnt , ICnt);
1127- if (!TrackedWaitcntSet.count (&*II))
1128- Wait.VsCnt = std::min (Wait.VsCnt , ICnt);
1129- if (Wait.VsCnt != ICnt) {
1130- TII->getNamedOperand (*II, AMDGPU::OpName::simm16)->setImm (Wait.VsCnt );
1131- Modified = true ;
1132- }
1133- Wait.VsCnt = ~0u ;
1134- }
1135-
1136- LLVM_DEBUG (dbgs () << " generateWaitcntInstBefore\n "
1137- << " Old Instr: " << MI
1138- << " New Instr: " << *II << ' \n ' );
1139-
1140- if (!Wait.hasWait ())
1141- return Modified;
1142- }
1177+ // Try to merge the required wait with preexisting waitcnt instructions.
1178+ // Also erase redundant waitcnt.
1179+ Modified =
1180+ applyPreexistingWaitcnt (ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
1181+ } else {
1182+ // Update waitcnt brackets after determining the required wait.
1183+ ScoreBrackets.applyWaitcnt (Wait);
11431184 }
11441185
1145- if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u ) {
1186+ // Build new waitcnt instructions unless no wait is needed or the old waitcnt
1187+ // instruction was modified to handle the required wait.
1188+ if (Wait.hasWaitExceptVsCnt ()) {
11461189 unsigned Enc = AMDGPU::encodeWaitcnt (IV, Wait);
11471190 auto SWaitInst = BuildMI (*MI.getParent (), MI.getIterator (),
11481191 MI.getDebugLoc (), TII->get (AMDGPU::S_WAITCNT))
@@ -1155,7 +1198,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
11551198 << " New Instr: " << *SWaitInst << ' \n ' );
11561199 }
11571200
1158- if (Wait.VsCnt != ~ 0u ) {
1201+ if (Wait.hasWaitVsCnt () ) {
11591202 assert (ST->hasVscnt ());
11601203
11611204 auto SWaitInst =
@@ -1430,7 +1473,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
14301473 Iter != E;) {
14311474 MachineInstr &Inst = *Iter;
14321475
1433- // Track pre-existing waitcnts from earlier iterations.
1476+ // Track pre-existing waitcnts that were added in earlier iterations or by
1477+ // the memory legalizer.
14341478 if (Inst.getOpcode () == AMDGPU::S_WAITCNT ||
14351479 (Inst.getOpcode () == AMDGPU::S_WAITCNT_VSCNT &&
14361480 Inst.getOperand (0 ).isReg () &&
0 commit comments