|
21 | 21 | #include "SIMachineFunctionInfo.h" |
22 | 22 | #include "llvm/ADT/BitmaskEnum.h" |
23 | 23 | #include "llvm/ADT/DenseMap.h" |
| 24 | +#include "llvm/ADT/STLExtras.h" |
24 | 25 | #include "llvm/CodeGen/MachineScheduler.h" |
25 | 26 | #include "llvm/CodeGen/TargetOpcodes.h" |
26 | 27 |
|
@@ -60,6 +61,10 @@ static cl::opt<bool> UseCostHeur( |
60 | 61 | "Experimentally, results are mixed, so this should be set on a " |
61 | 62 | "case-by-case basis.")); |
62 | 63 |
|
| 64 | +static cl::opt<bool> DisableMfmaChainOrderingDeps( |
| 65 | + "amdgpu-disable-mfma-chain-order-deps", cl::init(false), cl::Hidden, |
| 66 | + cl::desc("Enable artificial false dependencies between MFMA chains")); |
| 67 | + |
63 | 68 | // Components of the mask that determines which instruction types may be |
64 | 69 | // classified into a SchedGroup. |
65 | 70 | enum class SchedGroupMask { |
@@ -2342,6 +2347,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { |
2342 | 2347 | // Add DAG edges that enforce SCHED_BARRIER ordering. |
2343 | 2348 | void addSchedBarrierEdges(SUnit &SU); |
2344 | 2349 |
|
  // Add artificial edges that order MFMA/WMMA data-dependency chains relative
  // to one another: for each pair of consecutively discovered chains that both
  // contain at least two instructions, the leader of the later chain is made
  // to depend on the tail of the earlier chain.
  void addMfmaFalseDeps();
| 2353 | + |
2345 | 2354 | // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should |
2346 | 2355 | // not be reordered across the SCHED_BARRIER. This is used for the base |
2347 | 2356 | // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that |
@@ -2585,6 +2594,9 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { |
2585 | 2594 | } |
2586 | 2595 | } |
2587 | 2596 |
|
| 2597 | + if (!DisableMfmaChainOrderingDeps && ST.hasMAIInsts()) |
| 2598 | + addMfmaFalseDeps(); |
| 2599 | + |
2588 | 2600 | if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) { |
2589 | 2601 | PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); |
2590 | 2602 | // PipelineSolver performs the mutation by adding the edges it |
@@ -2681,6 +2693,86 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { |
2681 | 2693 |
|
2682 | 2694 | } // namespace |
2683 | 2695 |
|
| 2696 | +void IGroupLPDAGMutation::addMfmaFalseDeps() { |
| 2697 | + DenseMap<SUnit *, SUnit *> MFMAAncestor; |
| 2698 | + SmallVector<SUnit *, 10> MFMAChainLeaders; |
| 2699 | + DenseMap<SUnit *, SUnit *> MFMAChainNext; |
| 2700 | + for (auto &SU : DAG->SUnits) { |
| 2701 | + if (!TII->isMFMAorWMMA(*SU.getInstr())) |
| 2702 | + continue; |
| 2703 | + |
| 2704 | + if (MFMAAncestor.contains(&SU)) |
| 2705 | + continue; |
| 2706 | + |
| 2707 | + SUnit *CurrMFMA = &SU; |
| 2708 | + MFMAAncestor[CurrMFMA] = CurrMFMA; |
| 2709 | + MFMAChainLeaders.push_back(&SU); |
| 2710 | + while (!CurrMFMA->Succs.empty()) { |
| 2711 | + // Count the number of successor MFMA/WMMA instructions of |
| 2712 | + // the current MFMA instruction. |
| 2713 | + SUnit *NextMFMA = nullptr; |
| 2714 | + unsigned MFMADataDepSuccCount = 0; |
| 2715 | + for (const auto &Succ : CurrMFMA->Succs) { |
| 2716 | + SUnit *SuccSU = Succ.getSUnit(); |
| 2717 | + if (!SuccSU->isInstr() || !TII->isMFMAorWMMA(*SuccSU->getInstr())) |
| 2718 | + continue; |
| 2719 | + |
| 2720 | + // Check if the successor is MFMA/WMMA and the edge is a data dependency |
| 2721 | + if (Succ.getKind() == SDep::Data) { |
| 2722 | + NextMFMA = SuccSU; |
| 2723 | + MFMADataDepSuccCount++; |
| 2724 | + } |
| 2725 | + } |
| 2726 | + |
| 2727 | + // If the current MFMA instruction has more than one successor MFMA/WMMA instruction, |
| 2728 | + // we need to break the chain. |
| 2729 | + if (MFMADataDepSuccCount != 1) { |
| 2730 | + MFMAChainNext[CurrMFMA] = nullptr; |
| 2731 | + break; |
| 2732 | + } |
| 2733 | + |
| 2734 | + // Add the current MFMA instruction to the MFMAAncestor map. |
| 2735 | + MFMAAncestor[CurrMFMA] = &SU; |
| 2736 | + MFMAChainNext[CurrMFMA] = NextMFMA; |
| 2737 | + CurrMFMA = NextMFMA; |
| 2738 | + } |
| 2739 | + } |
| 2740 | + |
| 2741 | + // Compute the tail and length of each chain in a single loop. |
| 2742 | + auto GetTailAndLength = [&](SUnit *Leader) -> std::pair<SUnit *, unsigned> { |
| 2743 | + unsigned Length = 1; |
| 2744 | + SUnit *Curr = Leader; |
| 2745 | + while (MFMAChainNext.count(Curr)) { |
| 2746 | + if (!MFMAChainNext[Curr]) |
| 2747 | + break; |
| 2748 | + Curr = MFMAChainNext[Curr]; |
| 2749 | + ++Length; |
| 2750 | + } |
| 2751 | + return {Curr, Length}; |
| 2752 | + }; |
| 2753 | + |
| 2754 | + // Assert that all MFMA chains are ordered by NodeNum |
| 2755 | + // Add artificial false dependencies between MFMA chains if two given |
| 2756 | + // chains are at least 2 SUs long. |
| 2757 | + // Iterate over all pairs of contiguous MFMA chains and add artificial edges if chains are at least 2 SUs long. |
| 2758 | + for (size_t I = 0; I + 1 < MFMAChainLeaders.size(); ++I) { |
| 2759 | + SUnit *ChainLeaderA = MFMAChainLeaders[I]; |
| 2760 | + SUnit *ChainLeaderB = MFMAChainLeaders[I + 1]; |
| 2761 | + |
| 2762 | + auto [TailA, LengthA] = GetTailAndLength(ChainLeaderA); |
| 2763 | + auto [TailB, LengthB] = GetTailAndLength(ChainLeaderB); |
| 2764 | + |
| 2765 | + // Only add if both chains are at least two SUs long. |
| 2766 | + if (LengthA >= 2 && LengthB >= 2) { |
| 2767 | + // Add an artificial dependency edge from the tail of chain A to the |
| 2768 | + // leader of chain B. |
| 2769 | + LLVM_DEBUG(dbgs() << "Adding artificial dependency edge from " << TailA->NodeNum |
| 2770 | + << " to " << ChainLeaderB->NodeNum << "\n"); |
| 2771 | + DAG->addEdge(ChainLeaderB, SDep(TailA, SDep::Artificial)); |
| 2772 | + } |
| 2773 | + } |
| 2774 | +} |
| 2775 | + |
2684 | 2776 | /// \p Phase specifies whether or not this is a reentry into the |
2685 | 2777 | /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the |
2686 | 2778 | /// same scheduling region (e.g. pre and post-RA scheduling / multiple |
|
0 commit comments