
Commit fd2f7ca

[AMDGPU] Serialize disjoint MFMA chains to hide DS_READ latency
This patch identifies disjoint chains of dependent MFMA instructions (with length >= 2) and stitches them together into a single execution sequence by adding artificial dependencies from the tail of one chain to the head of the next. Currently, the scheduler may schedule disjoint MFMA chains too early or interleave them, which can expose high latencies from their associated DS_READ operands. By strictly serializing these MFMA chains, we force subsequent chains to execute later. This artificial delay increases the distance between the DS_READ issuance and the consuming MFMA instruction, effectively hiding the load latency.
1 parent 2ff6322 commit fd2f7ca
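
To make the chain-stitching idea concrete, the following is a minimal standalone sketch (not the patch code): it models already-formed MFMA chains as plain vectors and serializes consecutive chains by recording a tail-to-head edge, mirroring the tail(A) -> leader(B) artificial dependency described above. The Node and Chain types, ArtSuccs member, and serializeChains helper are illustrative assumptions, not LLVM's SUnit/SDep API.

// Standalone sketch of serializing consecutive chains; Node/Chain/ArtSuccs
// are hypothetical stand-ins for LLVM's SUnit and SDep edge machinery.
#include <cstddef>
#include <cstdio>
#include <vector>

struct Node {
  unsigned NodeNum;               // Position in the original DAG order.
  std::vector<unsigned> ArtSuccs; // NodeNums of artificial successors.
};

// A chain is a sequence of nodes already linked by data dependencies
// (e.g. MFMA N feeds MFMA N+1 through its accumulator operand).
using Chain = std::vector<Node *>;

// Serialize consecutive chains: add an artificial edge from the tail of
// chain I to the head of chain I+1, but only when both chains have at
// least two nodes, mirroring the patch's LengthA >= 2 && LengthB >= 2 check.
static void serializeChains(std::vector<Chain> &Chains) {
  for (std::size_t I = 0; I + 1 < Chains.size(); ++I) {
    Chain &A = Chains[I];
    Chain &B = Chains[I + 1];
    if (A.size() < 2 || B.size() < 2)
      continue;
    Node *TailA = A.back();
    Node *HeadB = B.front();
    TailA->ArtSuccs.push_back(HeadB->NodeNum);
    std::printf("artificial edge: %u -> %u\n", TailA->NodeNum, HeadB->NodeNum);
  }
}

int main() {
  // Two disjoint two-node chains; serialization forces the second chain to
  // start only after the first finishes, pushing its DS_READ issuance
  // further ahead of the consuming MFMAs.
  std::vector<Node> Nodes = {{0, {}}, {1, {}}, {2, {}}, {3, {}}};
  std::vector<Chain> Chains = {{&Nodes[0], &Nodes[1]}, {&Nodes[2], &Nodes[3]}};
  serializeChains(Chains);
  return 0;
}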

File tree

5 files changed: +1376 −1159 lines changed


llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp

Lines changed: 92 additions & 0 deletions
@@ -21,6 +21,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/TargetOpcodes.h"

@@ -60,6 +61,10 @@ static cl::opt<bool> UseCostHeur(
              "Experimentally, results are mixed, so this should be set on a "
              "case-by-case basis."));

+static cl::opt<bool> DisableMfmaChainOrderingDeps(
+    "amdgpu-disable-mfma-chain-order-deps", cl::init(false), cl::Hidden,
+    cl::desc("Disable artificial false dependencies between MFMA chains"));
+
 // Components of the mask that determines which instruction types may be
 // classified into a SchedGroup.
 enum class SchedGroupMask {
@@ -2342,6 +2347,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   // Add DAG edges that enforce SCHED_BARRIER ordering.
   void addSchedBarrierEdges(SUnit &SU);

+  // Add artificial false dependencies between MFMA consumers of adjacent
+  // DS_READ_B128 streams to enforce MFMA(newer) -> MFMA(older-last) ordering.
+  void addMfmaFalseDeps();
+
   // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
   // not be reordered across the SCHED_BARRIER. This is used for the base
   // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
@@ -2585,6 +2594,9 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
     }
   }

+  if (!DisableMfmaChainOrderingDeps && ST.hasMAIInsts())
+    addMfmaFalseDeps();
+
   if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
     PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
     // PipelineSolver performs the mutation by adding the edges it
@@ -2681,6 +2693,86 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {

 } // namespace

+void IGroupLPDAGMutation::addMfmaFalseDeps() {
+  DenseMap<SUnit *, SUnit *> MFMAAncestor;
+  SmallVector<SUnit *, 10> MFMAChainLeaders;
+  DenseMap<SUnit *, SUnit *> MFMAChainNext;
+  for (auto &SU : DAG->SUnits) {
+    if (!TII->isMFMAorWMMA(*SU.getInstr()))
+      continue;
+
+    if (MFMAAncestor.contains(&SU))
+      continue;
+
+    SUnit *CurrMFMA = &SU;
+    MFMAAncestor[CurrMFMA] = CurrMFMA;
+    MFMAChainLeaders.push_back(&SU);
+    while (!CurrMFMA->Succs.empty()) {
+      // Count the MFMA/WMMA successors of the current MFMA that are reached
+      // through data-dependency edges.
+      SUnit *NextMFMA = nullptr;
+      unsigned MFMADataDepSuccCount = 0;
+      for (const auto &Succ : CurrMFMA->Succs) {
+        SUnit *SuccSU = Succ.getSUnit();
+        if (!SuccSU->isInstr() || !TII->isMFMAorWMMA(*SuccSU->getInstr()))
+          continue;
+
+        // Only data-dependency edges extend the chain.
+        if (Succ.getKind() == SDep::Data) {
+          NextMFMA = SuccSU;
+          MFMADataDepSuccCount++;
+        }
+      }
+
+      // If the current MFMA does not have exactly one MFMA/WMMA data
+      // successor, the chain ends here.
+      if (MFMADataDepSuccCount != 1) {
+        MFMAChainNext[CurrMFMA] = nullptr;
+        break;
+      }
+
+      // Record the chain leader for the next MFMA and link it into the chain.
+      MFMAAncestor[NextMFMA] = &SU;
+      MFMAChainNext[CurrMFMA] = NextMFMA;
+      CurrMFMA = NextMFMA;
+    }
+  }
+
+  // Compute the tail and length of a chain by walking MFMAChainNext from its
+  // leader.
+  auto GetTailAndLength = [&](SUnit *Leader) -> std::pair<SUnit *, unsigned> {
+    unsigned Length = 1;
+    SUnit *Curr = Leader;
+    while (MFMAChainNext.count(Curr)) {
+      if (!MFMAChainNext[Curr])
+        break;
+      Curr = MFMAChainNext[Curr];
+      ++Length;
+    }
+    return {Curr, Length};
+  };
+
+  // Chain leaders are discovered in NodeNum order, so adjacent entries in
+  // MFMAChainLeaders correspond to consecutive chains. Iterate over all pairs
+  // of contiguous MFMA chains and add an artificial edge when both chains are
+  // at least two SUs long.
+  for (size_t I = 0; I + 1 < MFMAChainLeaders.size(); ++I) {
+    SUnit *ChainLeaderA = MFMAChainLeaders[I];
+    SUnit *ChainLeaderB = MFMAChainLeaders[I + 1];
+
+    auto [TailA, LengthA] = GetTailAndLength(ChainLeaderA);
+    auto [TailB, LengthB] = GetTailAndLength(ChainLeaderB);
+
+    // Only serialize if both chains are at least two SUs long.
+    if (LengthA >= 2 && LengthB >= 2) {
+      // Add an artificial dependency edge from the tail of chain A to the
+      // leader of chain B.
+      LLVM_DEBUG(dbgs() << "Adding artificial dependency edge from "
+                        << TailA->NodeNum << " to " << ChainLeaderB->NodeNum
+                        << "\n");
+      DAG->addEdge(ChainLeaderB, SDep(TailA, SDep::Artificial));
+    }
+  }
+}
+
 /// \p Phase specifies whether or not this is a reentry into the
 /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
 /// same scheduling region (e.g. pre and post-RA scheduling / multiple
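
For readers tracing GetTailAndLength above: the chain map stores a next pointer per chain member and uses a null entry as an explicit end-of-chain sentinel, so the walk stops either when a node has no entry at all or when its entry is null. Below is a small standalone sketch of that walk, using std::unordered_map and a hypothetical Node type as stand-ins for LLVM's DenseMap and SUnit.

// Standalone sketch of the tail/length walk over a next-pointer map with a
// null end-of-chain sentinel; Node and NextMap are illustrative stand-ins.
#include <cstdio>
#include <unordered_map>
#include <utility>

struct Node { unsigned NodeNum; };

using NextMap = std::unordered_map<Node *, Node *>;

// Walk Next from Leader until a node has no entry or a null entry, returning
// the last node reached (the chain tail) and the number of nodes visited.
static std::pair<Node *, unsigned> getTailAndLength(const NextMap &Next,
                                                    Node *Leader) {
  unsigned Length = 1;
  Node *Curr = Leader;
  for (auto It = Next.find(Curr); It != Next.end() && It->second;
       It = Next.find(Curr)) {
    Curr = It->second;
    ++Length;
  }
  return {Curr, Length};
}

int main() {
  Node A{0}, B{1}, C{2};
  // A -> B -> C, with C carrying a null entry as the end-of-chain marker.
  NextMap Next = {{&A, &B}, {&B, &C}, {&C, nullptr}};
  auto [Tail, Length] = getTailAndLength(Next, &A);
  std::printf("tail = %u, length = %u\n", Tail->NodeNum, Length);
  return 0;
}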
