//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file This file defines a set of schedule DAG mutations that can be used to
// override default scheduler behavior to enforce specific scheduling patterns.
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimal
// instruction scheduling is important enough to warrant manual intervention.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUIGroupLP.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

namespace {

static cl::opt<bool>
    EnableIGroupLP("amdgpu-igrouplp",
                   cl::desc("Enable construction of Instruction Groups and "
                            "their ordering for scheduling"),
                   cl::init(false));

static cl::opt<Optional<unsigned>>
    VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in the VMEM group."));

static cl::opt<Optional<unsigned>>
    MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None),
                     cl::Hidden,
                     cl::desc("The maximum number of instructions to include "
                              "in the MFMA group."));

static cl::opt<Optional<unsigned>>
    LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in the LDS/GDS read group."));

static cl::opt<Optional<unsigned>>
    LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None),
                    cl::Hidden,
                    cl::desc("The maximum number of instructions to include "
                             "in the LDS/GDS write group."));

typedef function_ref<bool(const MachineInstr &, const SIInstrInfo *)>
    CanAddMIFn;
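
// Concrete predicates of this type (isMFMASGMember, isDSReadSGMember, etc.)
// are defined below, after the SchedGroup class.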

// Classify instructions into groups to enable fine-tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
class SchedGroup {
private:
  // Function that returns true if a non-bundle MI may be inserted into this
  // group.
  const CanAddMIFn canAddMI;

  // Maximum number of SUnits that can be added to this group.
  Optional<unsigned> MaxSize;

  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;

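  // Try to add an artificial scheduling edge that orders A before B.
  // SDep::Artificial edges impose ordering only and carry no data dependence;
  // DAG->canAddEdge rejects edges that would introduce a cycle.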
  void tryAddEdge(SUnit *A, SUnit *B) {
    if (A != B && DAG->canAddEdge(B, A)) {
      DAG->addEdge(B, SDep(A, SDep::Artificial));
      LLVM_DEBUG(dbgs() << "Adding edge...\n"
                        << "from: SU(" << A->NodeNum << ") " << *A->getInstr()
                        << "to: SU(" << B->NodeNum << ") " << *B->getInstr());
    }
  }

public:
  // Add DAG dependencies between all SUnits in this SchedGroup and SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (MakePred)
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies between all SUnits in this SchedGroup and SU. Use
  // the predicate to determine whether SU should be a predecessor (P returns
  // true) or a successor (P returns false) of each SUnit in the group.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P) {
    for (auto A : Collection) {
      SUnit *B = &SU;
      if (P(A, B))
        std::swap(A, B);

      tryAddEdge(A, B);
    }
  }

  // Add DAG dependencies such that SUnits in this group are ordered before
  // SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup) {
    for (auto B : OtherGroup.Collection)
      link(*B);
  }

  // Returns true if no more instructions may be added to this group.
  bool isFull() { return MaxSize && Collection.size() >= *MaxSize; }

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU, const SIInstrInfo *TII) {
    if (isFull())
      return false;

    MachineInstr &MI = *SU.getInstr();
    if (MI.getOpcode() != TargetOpcode::BUNDLE)
      return canAddMI(MI, TII);

    // Special case for bundled MIs.
    const MachineBasicBlock *MBB = MI.getParent();
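    // B and E delimit the instructions inside the bundle: ++B skips the
    // BUNDLE header itself, and E is then advanced past every MI that is
    // bundled with its predecessor.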
    MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
    while (E != MBB->end() && E->isBundledWithPred())
      ++E;

    // Return true if all of the bundled MIs can be added to this group.
    return std::all_of(
        B, E, [this, TII](MachineInstr &MI) { return canAddMI(MI, TII); });
  }

  void add(SUnit &SU) { Collection.push_back(&SU); }

  SchedGroup(CanAddMIFn canAddMI, Optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG)
      : canAddMI(canAddMI), MaxSize(MaxSize), DAG(DAG) {}
};

bool isMFMASGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isMFMA(MI);
}

bool isVALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVALU(MI) && !TII->isMFMA(MI);
}

bool isSALUSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isSALU(MI);
}

bool isVMEMSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI));
}

bool isVMEMReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isVMEMWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() &&
         (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)));
}

bool isDSWriteSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayStore() && TII->isDS(MI);
}

bool isDSReadSGMember(const MachineInstr &MI, const SIInstrInfo *TII) {
  return MI.mayLoad() && TII->isDS(MI);
}
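
// Each of these predicates is passed as the CanAddMIFn when a SchedGroup is
// constructed, e.g. SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG) in
// IGroupLPDAGMutation::apply below.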

class IGroupLPDAGMutation : public ScheduleDAGMutation {
public:
  const SIInstrInfo *TII;
  ScheduleDAGMI *DAG;

  IGroupLPDAGMutation() = default;
  void apply(ScheduleDAGInstrs *DAGInstrs) override;
};

// DAG mutation that coordinates with the SCHED_BARRIER instruction and
// corresponding builtin. The mutation adds edges from specific instruction
// classes determined by the SCHED_BARRIER mask so that they cannot be
// scheduled around the SCHED_BARRIER.
class SchedBarrierDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  // Components of the mask that determines which instructions may not be
  // scheduled across the SCHED_BARRIER.
  enum class SchedBarrierMasks {
    NONE = 0u,
    ALU = 1u << 0,
    VALU = 1u << 1,
    SALU = 1u << 2,
    MFMA = 1u << 3,
    VMEM = 1u << 4,
    VMEM_READ = 1u << 5,
    VMEM_WRITE = 1u << 6,
    DS = 1u << 7,
    DS_READ = 1u << 8,
    DS_WRITE = 1u << 9,
    LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ DS_WRITE)
  };
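
  // A set bit means instructions of that class may be scheduled across the
  // barrier; getSchedGroupsFromMask builds SchedGroups only for the classes
  // whose bits are clear, and those stay pinned to their side of the barrier.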

  // Cache SchedGroups of each type so they are built at most once per
  // scheduling region, even when the region contains multiple SCHED_BARRIERs.
  std::unique_ptr<SchedGroup> MFMASchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> SALUSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMReadSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> VMEMWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSWriteSchedGroup = nullptr;
  std::unique_ptr<SchedGroup> DSReadSchedGroup = nullptr;

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
  // not be reordered across the SCHED_BARRIER.
  void getSchedGroupsFromMask(int32_t Mask,
                              SmallVectorImpl<SchedGroup *> &SchedGroups);

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Classify instructions and add them to the SchedGroup.
  void initSchedGroup(SchedGroup *SG);

  // Remove all existing edges from a SCHED_BARRIER.
  void resetSchedBarrierEdges(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  SchedBarrierDAGMutation() = default;
};

void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAG->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");

  // The order of SchedGroups in this vector defines the order in which edges
  // will be added. In other words, given the present ordering, we will try to
  // make each VMEM instruction a predecessor of each DS read instruction, and
  // so on.
  SmallVector<SchedGroup, 4> PipelineOrderGroups = {
      SchedGroup(isVMEMSGMember, VMEMGroupMaxSize, DAG),
      SchedGroup(isDSReadSGMember, LDRGroupMaxSize, DAG),
      SchedGroup(isMFMASGMember, MFMAGroupMaxSize, DAG),
      SchedGroup(isDSWriteSGMember, LDWGroupMaxSize, DAG)};

  for (SUnit &SU : DAG->SUnits) {
    LLVM_DEBUG(dbgs() << "Checking Node"; DAG->dumpNode(SU));
    for (auto &SG : PipelineOrderGroups)
      if (SG.canAddSU(SU, TII))
        SG.add(SU);
  }

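  // Link each group to every group after it in PipelineOrderGroups, so that
  // members of earlier groups become predecessors of members of later ones.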
  for (unsigned i = 0; i < PipelineOrderGroups.size() - 1; i++) {
    auto &GroupA = PipelineOrderGroups[i];
    for (unsigned j = i + 1; j < PipelineOrderGroups.size(); j++) {
      auto &GroupB = PipelineOrderGroups[j];
      GroupA.link(GroupB);
    }
  }
}

void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n");

  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  for (auto &SU : DAG->SUnits)
    if (SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER)
      addSchedBarrierEdges(SU);
}

void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  resetSchedBarrierEdges(SchedBarrier);
  SmallVector<SchedGroup *, 4> SchedGroups;
  int32_t Mask = MI.getOperand(0).getImm();
  getSchedGroupsFromMask(Mask, SchedGroups);
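  // NodeNum reflects the original order of instructions in the region, so
  // group members that appear before the barrier become its predecessors and
  // members that appear after it become its successors.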
  for (auto SG : SchedGroups)
    SG->link(
        SchedBarrier, (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
                          const SUnit *A, const SUnit *B) {
          return A->NodeNum > B->NodeNum;
        });
}

void SchedBarrierDAGMutation::getSchedGroupsFromMask(
    int32_t Mask, SmallVectorImpl<SchedGroup *> &SchedGroups) {
  SchedBarrierMasks SBMask = (SchedBarrierMasks)Mask;
  // See IntrinsicsAMDGPU.td for an explanation of these masks and their
  // mappings.
  if ((SBMask & SchedBarrierMasks::VALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!VALUSchedGroup) {
      VALUSchedGroup = std::make_unique<SchedGroup>(isVALUSGMember, None, DAG);
      initSchedGroup(VALUSchedGroup.get());
    }

    SchedGroups.push_back(VALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::SALU) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!SALUSchedGroup) {
      SALUSchedGroup = std::make_unique<SchedGroup>(isSALUSGMember, None, DAG);
      initSchedGroup(SALUSchedGroup.get());
    }

    SchedGroups.push_back(SALUSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::MFMA) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::ALU) == SchedBarrierMasks::NONE) {
    if (!MFMASchedGroup) {
      MFMASchedGroup = std::make_unique<SchedGroup>(isMFMASGMember, None, DAG);
      initSchedGroup(MFMASchedGroup.get());
    }

    SchedGroups.push_back(MFMASchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMReadSchedGroup) {
      VMEMReadSchedGroup =
          std::make_unique<SchedGroup>(isVMEMReadSGMember, None, DAG);
      initSchedGroup(VMEMReadSchedGroup.get());
    }

    SchedGroups.push_back(VMEMReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::VMEM_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::VMEM) == SchedBarrierMasks::NONE) {
    if (!VMEMWriteSchedGroup) {
      VMEMWriteSchedGroup =
          std::make_unique<SchedGroup>(isVMEMWriteSGMember, None, DAG);
      initSchedGroup(VMEMWriteSchedGroup.get());
    }

    SchedGroups.push_back(VMEMWriteSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_READ) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSReadSchedGroup) {
      DSReadSchedGroup =
          std::make_unique<SchedGroup>(isDSReadSGMember, None, DAG);
      initSchedGroup(DSReadSchedGroup.get());
    }

    SchedGroups.push_back(DSReadSchedGroup.get());
  }

  if ((SBMask & SchedBarrierMasks::DS_WRITE) == SchedBarrierMasks::NONE &&
      (SBMask & SchedBarrierMasks::DS) == SchedBarrierMasks::NONE) {
    if (!DSWriteSchedGroup) {
      DSWriteSchedGroup =
          std::make_unique<SchedGroup>(isDSWriteSGMember, None, DAG);
      initSchedGroup(DSWriteSchedGroup.get());
    }

    SchedGroups.push_back(DSWriteSchedGroup.get());
  }
}

void SchedBarrierDAGMutation::initSchedGroup(SchedGroup *SG) {
  assert(SG);
  for (auto &SU : DAG->SUnits)
    if (SG->canAddSU(SU, TII))
      SG->add(SU);
}

void SchedBarrierDAGMutation::resetSchedBarrierEdges(SUnit &SU) {
  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER);
  for (auto &P : SU.Preds)
    SU.removePred(P);

  for (auto &S : SU.Succs) {
    for (auto &SP : S.getSUnit()->Preds) {
      if (SP.getSUnit() == &SU) {
        S.getSUnit()->removePred(SP);
      }
    }
  }
}

} // namespace

namespace llvm {

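// When the amdgpu-igrouplp flag is off, this factory returns nullptr; that is
// safe to hand to ScheduleDAGMI::addMutation, which ignores null mutations.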
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
  return EnableIGroupLP ? std::make_unique<IGroupLPDAGMutation>() : nullptr;
}

std::unique_ptr<ScheduleDAGMutation> createSchedBarrierDAGMutation() {
  return std::make_unique<SchedBarrierDAGMutation>();
}

} // end namespace llvm