77// ===----------------------------------------------------------------------===//
88//
99// / \file
10- // / This pass tries to remove unnecessary VGPR live range in divergent if-else
11- // / structure .
10+ // / This pass tries to remove unnecessary VGPR live ranges in divergent if-else
11+ // / structures and waterfall loops .
1212// /
13- // / When we do structurization, we usually transform a if-else into two
13+ // / When we do structurization, we usually transform an if-else into two
1414// / successive if-then (with a flow block to do predicate inversion). Consider a
1515// / simple case after structurization: A divergent value %a was defined before
1616// / if-else and used in both THEN (use in THEN is optional) and ELSE part:
2929// /
3030// / As register allocator has no idea of the thread-control-flow, it will just
3131// / assume %a would be alive in the whole range of bb.then because of a later
32- // / use in bb.else. On AMDGPU architecture, the VGPR was accessed with respect
32+ // / use in bb.else. On AMDGPU architecture, the VGPR is accessed with respect
3333// / to exec mask. For this if-else case, the lanes active in bb.then will be
34- // / inactive in bb.else, and vice-verse . So we are safe to say that %a was dead
35- // / after the last use in bb.then untill the end of the block. The reason is
34+ // / inactive in bb.else, and vice-versa . So we are safe to say that %a was dead
35+ // / after the last use in bb.then until the end of the block. The reason is
3636// / the instructions in bb.then will only overwrite lanes that will never be
3737// / accessed in bb.else.
3838// /
4646// / sure the second loop iteration still gets correct data.
4747// / 2.) There should be no further uses after the IF-ELSE region.
4848// /
49+ // /
50+ // / Waterfall loops get inserted around instructions that use divergent values
51+ // / but can only be executed with a uniform value. For example an indirect call
52+ // / to a divergent address:
53+ // / bb.start:
54+ // / %a = ...
55+ // / %fun = ...
56+ // / ...
57+ // / bb.loop:
58+ // / call %fun (%a)
59+ // / ... // %a can be dead here
60+ // / loop %bb.loop
61+ // /
62+ // / The loop block is executed multiple times, but it is run exactly once for
63+ // / each active lane. Similar to the if-else case, the register allocator
64+ // / assumes that %a is live throughout the loop as it is used again in the next
65+ // / iteration. If %a is a VGPR that is unused after the loop, it does not need
66+ // / to be live after its last use in the loop block. By inserting a phi-node at
67+ // / the start of bb.loop that is undef when coming from bb.loop, the register
68+ // / allocator knows that the value of %a does not need to be preserved through
69+ // / iterations of the loop.
70+ // /
4971//
5072// ===----------------------------------------------------------------------===//
5173
@@ -89,6 +111,10 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
89111 SmallSetVector<MachineBasicBlock *, 16 > &ElseBlocks,
90112 SmallVectorImpl<Register> &CandidateRegs) const ;
91113
114+ void collectWaterfallCandidateRegisters (
115+ MachineBasicBlock *Loop,
116+ SmallSetVector<Register, 16 > &CandidateRegs) const ;
117+
92118 void findNonPHIUsesInBlock (Register Reg, MachineBasicBlock *MBB,
93119 SmallVectorImpl<MachineInstr *> &Uses) const ;
94120
@@ -105,6 +131,8 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
105131 MachineBasicBlock *Flow, MachineBasicBlock *Endif,
106132 SmallSetVector<MachineBasicBlock *, 16 > &ElseBlocks) const ;
107133
134+ void optimizeWaterfallLiveRange (Register Reg, MachineBasicBlock *If) const ;
135+
108136 SIOptimizeVGPRLiveRange () : MachineFunctionPass(ID) {}
109137
110138 bool runOnMachineFunction (MachineFunction &MF) override ;
@@ -278,6 +306,54 @@ void SIOptimizeVGPRLiveRange::collectCandidateRegisters(
278306 }
279307}
280308
309+ // / Collect the registers used in the waterfall loop block that are defined
310+ // / before.
311+ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters (
312+ MachineBasicBlock *Loop,
313+ SmallSetVector<Register, 16 > &CandidateRegs) const {
314+
315+ for (auto &MI : Loop->instrs ()) {
316+ if (MI.isDebugInstr ())
317+ continue ;
318+
319+ for (auto &MO : MI.operands ()) {
320+ if (!MO.isReg () || !MO.getReg () || MO.isDef ())
321+ continue ;
322+
323+ Register MOReg = MO.getReg ();
324+ // We can only optimize AGPR/VGPR virtual register
325+ if (MOReg.isPhysical () || !TRI->isVectorRegister (*MRI, MOReg))
326+ continue ;
327+
328+ if (MO.readsReg ()) {
329+ const MachineBasicBlock *DefMBB = MRI->getVRegDef (MOReg)->getParent ();
330+ // Make sure the value is defined before the LOOP block
331+ if (DefMBB != Loop && !CandidateRegs.contains (MOReg)) {
332+ // If the variable is used after the loop, the register coalescer will
333+ // merge the newly created register and remove the phi node again.
334+ // Just do nothing in that case.
335+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo (MOReg);
336+ bool IsUsed = false ;
337+ for (auto *Succ : Loop->successors ()) {
338+ if (Succ != Loop && OldVarInfo.isLiveIn (*Succ, MOReg, *MRI)) {
339+ IsUsed = true ;
340+ break ;
341+ }
342+ }
343+ if (!IsUsed) {
344+ LLVM_DEBUG (dbgs () << " Found candidate reg: "
345+ << printReg (MOReg, TRI, 0 , MRI) << ' \n ' );
346+ CandidateRegs.insert (MOReg);
347+ } else {
348+ LLVM_DEBUG (dbgs () << " Reg is used after loop, ignoring: "
349+ << printReg (MOReg, TRI, 0 , MRI) << ' \n ' );
350+ }
351+ }
352+ }
353+ }
354+ }
355+ }
356+
281357// Re-calculate the liveness of \p Reg in the THEN-region
282358void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion (
283359 Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const {
@@ -403,12 +479,8 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange(
403479 }
404480
405481 // Replace all uses in the ELSE region or the PHIs in ENDIF block
406- for (auto I = MRI->use_begin (Reg), E = MRI->use_end (); I != E;) {
407- MachineOperand &O = *I;
408- // This is a little bit tricky, the setReg() will update the linked list,
409- // so we have to increment the iterator before setReg() to avoid skipping
410- // some uses.
411- ++I;
482+ // Use early increment range because setReg() will update the linked list.
483+ for (auto &O : make_early_inc_range (MRI->use_operands (Reg))) {
412484 auto *UseMI = O.getParent ();
413485 auto *UseBlock = UseMI->getParent ();
414486 // Replace uses in Endif block
@@ -431,6 +503,53 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange(
431503 updateLiveRangeInThenRegion (Reg, If, Flow);
432504}
433505
506+ void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange (
507+ Register Reg, MachineBasicBlock *Loop) const {
508+ // Insert a new PHI, marking the value from the last loop iteration undef.
509+ LLVM_DEBUG (dbgs () << " Optimizing " << printReg (Reg, TRI) << ' \n ' );
510+ const auto *RC = MRI->getRegClass (Reg);
511+ Register NewReg = MRI->createVirtualRegister (RC);
512+ Register UndefReg = MRI->createVirtualRegister (RC);
513+
514+ // Replace all uses in the LOOP region
515+ // Use early increment range because setReg() will update the linked list.
516+ for (auto &O : make_early_inc_range (MRI->use_operands (Reg))) {
517+ auto *UseMI = O.getParent ();
518+ auto *UseBlock = UseMI->getParent ();
519+ // Replace uses in Loop block
520+ if (UseBlock == Loop)
521+ O.setReg (NewReg);
522+ }
523+
524+ MachineInstrBuilder PHI = BuildMI (*Loop, Loop->getFirstNonPHI (), DebugLoc (),
525+ TII->get (TargetOpcode::PHI), NewReg);
526+ for (auto *Pred : Loop->predecessors ()) {
527+ if (Pred == Loop)
528+ PHI.addReg (UndefReg, RegState::Undef).addMBB (Pred);
529+ else
530+ PHI.addReg (Reg).addMBB (Pred);
531+ }
532+
533+ LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo (NewReg);
534+ LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo (Reg);
535+
536+ // collectWaterfallCandidateRegisters only collects registers that are dead
537+ // after the loop. So we know that the old reg is not live throughout the
538+ // whole block anymore.
539+ OldVarInfo.AliveBlocks .reset (Loop->getNumber ());
540+
541+ // Mark the last use as kill
542+ for (auto &MI : reverse (Loop->instrs ())) {
543+ if (MI.readsRegister (NewReg, TRI)) {
544+ MI.addRegisterKilled (NewReg, TRI);
545+ NewVarInfo.Kills .push_back (&MI);
546+ break ;
547+ }
548+ }
549+ assert (!NewVarInfo.Kills .empty () &&
550+ " Failed to find last usage of register in loop" );
551+ }
552+
434553char SIOptimizeVGPRLiveRange::ID = 0 ;
435554
436555INITIALIZE_PASS_BEGIN (SIOptimizeVGPRLiveRange, DEBUG_TYPE,
@@ -491,6 +610,16 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
491610 // Now we are safe to optimize.
492611 for (auto Reg : CandidateRegs)
493612 optimizeLiveRange (Reg, &MBB, IfTarget, Endif, ElseBlocks);
613+ } else if (MI.getOpcode () == AMDGPU::SI_WATERFALL_LOOP) {
614+ LLVM_DEBUG (dbgs () << " Checking Waterfall loop: "
615+ << printMBBReference (MBB) << ' \n ' );
616+
617+ SmallSetVector<Register, 16 > CandidateRegs;
618+ collectWaterfallCandidateRegisters (&MBB, CandidateRegs);
619+ MadeChange |= !CandidateRegs.empty ();
620+ // Now we are safe to optimize.
621+ for (auto Reg : CandidateRegs)
622+ optimizeWaterfallLiveRange (Reg, &MBB);
494623 }
495624 }
496625 }
0 commit comments