1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>
#include <memory>
#include <utility>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #include "AMDGPUGenSubtargetInfo.inc"
37 
// Out-of-line defaulted destructor; defined here rather than in the header.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
39 
// Compose the full feature string (target defaults plus the user-supplied FS),
// parse it, and then fill in any values the parser left unset.  Returns *this
// so the call can be chained.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  // The user's features come last so they override the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}
98 
// Most feature flags are initialized to false/zero here; ParseSubtargetFeatures
// (invoked from initializeSubtargetDependencies in the constructor body) sets
// them from the GPU/FS strings, and initializeSubtargetDependencies fills in
// any remaining defaults afterwards.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples start at SOUTHERN_ISLANDS; everything else is R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    MIMG_R128(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  // Address-space mapping depends only on the triple.
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}
185 
186 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
187   const Function &F) const {
188   if (NWaves == 1)
189     return getLocalMemorySize();
190   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
191   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
192   unsigned MaxWaves = getMaxWavesPerEU();
193   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
194 }
195 
196 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
197   const Function &F) const {
198   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
199   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
200   unsigned MaxWaves = getMaxWavesPerEU();
201   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
202   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
203   NumWaves = std::min(NumWaves, MaxWaves);
204   NumWaves = std::max(NumWaves, 1u);
205   return NumWaves;
206 }
207 
208 unsigned
209 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
210   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
211   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
212 }
213 
214 std::pair<unsigned, unsigned>
215 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
216   switch (CC) {
217   case CallingConv::AMDGPU_CS:
218   case CallingConv::AMDGPU_KERNEL:
219   case CallingConv::SPIR_KERNEL:
220     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
221   case CallingConv::AMDGPU_VS:
222   case CallingConv::AMDGPU_LS:
223   case CallingConv::AMDGPU_HS:
224   case CallingConv::AMDGPU_ES:
225   case CallingConv::AMDGPU_GS:
226   case CallingConv::AMDGPU_PS:
227     return std::make_pair(1, getWavefrontSize());
228   default:
229     return std::make_pair(1, 16 * getWavefrontSize());
230   }
231 }
232 
233 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
234   const Function &F) const {
235   // FIXME: 1024 if function.
236   // Default minimum/maximum flat work group sizes.
237   std::pair<unsigned, unsigned> Default =
238     getDefaultFlatWorkGroupSize(F.getCallingConv());
239 
240   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
241   // starts using "amdgpu-flat-work-group-size" attribute.
242   Default.second = AMDGPU::getIntegerAttribute(
243     F, "amdgpu-max-work-group-size", Default.second);
244   Default.first = std::min(Default.first, Default.second);
245 
246   // Requested minimum/maximum flat work group sizes.
247   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
248     F, "amdgpu-flat-work-group-size", Default);
249 
250   // Make sure requested minimum is less than requested maximum.
251   if (Requested.first > Requested.second)
252     return Default;
253 
254   // Make sure requested values do not violate subtarget's specifications.
255   if (Requested.first < getMinFlatWorkGroupSize())
256     return Default;
257   if (Requested.second > getMaxFlatWorkGroupSize())
258     return Default;
259 
260   return Requested;
261 }
262 
263 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
264   const Function &F) const {
265   // Default minimum/maximum number of waves per execution unit.
266   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
267 
268   // Default/requested minimum/maximum flat work group sizes.
269   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
270 
271   // If minimum/maximum flat work group sizes were explicitly requested using
272   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
273   // number of waves per execution unit to values implied by requested
274   // minimum/maximum flat work group sizes.
275   unsigned MinImpliedByFlatWorkGroupSize =
276     getMaxWavesPerEU(FlatWorkGroupSizes.second);
277   bool RequestedFlatWorkGroupSize = false;
278 
279   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
280   // starts using "amdgpu-flat-work-group-size" attribute.
281   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
282       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
283     Default.first = MinImpliedByFlatWorkGroupSize;
284     RequestedFlatWorkGroupSize = true;
285   }
286 
287   // Requested minimum/maximum number of waves per execution unit.
288   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
289     F, "amdgpu-waves-per-eu", Default, true);
290 
291   // Make sure requested minimum is less than requested maximum.
292   if (Requested.second && Requested.first > Requested.second)
293     return Default;
294 
295   // Make sure requested values do not violate subtarget's specifications.
296   if (Requested.first < getMinWavesPerEU() ||
297       Requested.first > getMaxWavesPerEU())
298     return Default;
299   if (Requested.second > getMaxWavesPerEU())
300     return Default;
301 
302   // Make sure requested values are compatible with values implied by requested
303   // minimum/maximum flat work group sizes.
304   if (RequestedFlatWorkGroupSize &&
305       Requested.first < MinImpliedByFlatWorkGroupSize)
306     return Default;
307 
308   return Requested;
309 }
310 
// Attach !range metadata to a workitem-id / local-size query instruction so
// later optimizations know the bounds of its result.  Returns true if range
// metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Map the intrinsic to the dimension (x=0/y=1/z=2) it queries, and note
      // whether it is an ID query (half-open bound) or a size query.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        // reqd_work_group_size pins the group size exactly per dimension.
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
372 
// R600 subtarget: wires up the R600-specific instruction info, frame lowering
// (stack grows up, no local-area offset) and target lowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
379 
380 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
381                          const GCNTargetMachine &TM)
382     : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
383       FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
384       TLInfo(TM, *this) {
385   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
386   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
387 
388   RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
389   InstSelector.reset(new AMDGPUInstructionSelector(
390       *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
391 }
392 
393 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
394                                       unsigned NumRegionInstrs) const {
395   // Track register pressure so the scheduler can try to decrease
396   // pressure once register usage is above the threshold defined by
397   // SIRegisterInfo::getRegPressureSetLimit()
398   Policy.ShouldTrackPressure = true;
399 
400   // Enabling both top down and bottom up scheduling seems to give us less
401   // register spills than just using one of these approaches on its own.
402   Policy.OnlyTopDown = false;
403   Policy.OnlyBottomUp = false;
404 
405   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
406   if (!enableSIScheduler())
407     Policy.ShouldTrackLaneMasks = true;
408 }
409 
410 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
411   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
412 }
413 
414 unsigned SISubtarget::getKernArgSegmentSize(const Function &F,
415                                             unsigned ExplicitArgBytes) const {
416   uint64_t TotalSize = ExplicitArgBytes;
417   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
418 
419   if (ImplicitBytes != 0) {
420     unsigned Alignment = getAlignmentForImplicitArgPtr();
421     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
422   }
423 
424   // Being able to dereference past the end is useful for emitting scalar loads.
425   return alignTo(TotalSize, 4);
426 }
427 
428 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
429   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
430     if (SGPRs <= 80)
431       return 10;
432     if (SGPRs <= 88)
433       return 9;
434     if (SGPRs <= 100)
435       return 8;
436     return 7;
437   }
438   if (SGPRs <= 48)
439     return 10;
440   if (SGPRs <= 56)
441     return 9;
442   if (SGPRs <= 64)
443     return 8;
444   if (SGPRs <= 72)
445     return 7;
446   if (SGPRs <= 80)
447     return 6;
448   return 5;
449 }
450 
451 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
452   if (VGPRs <= 24)
453     return 10;
454   if (VGPRs <= 28)
455     return 9;
456   if (VGPRs <= 32)
457     return 8;
458   if (VGPRs <= 36)
459     return 7;
460   if (VGPRs <= 40)
461     return 6;
462   if (VGPRs <= 48)
463     return 5;
464   if (VGPRs <= 64)
465     return 4;
466   if (VGPRs <= 84)
467     return 3;
468   if (VGPRs <= 128)
469     return 2;
470   return 1;
471 }
472 
473 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
474   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
475   if (MFI.hasFlatScratchInit()) {
476     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
477       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
478     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
479       return 4; // FLAT_SCRATCH, VCC (in that order).
480   }
481 
482   if (isXNACKEnabled())
483     return 4; // XNACK, VCC (in that order).
484   return 2; // VCC.
485 }
486 
// Maximum number of SGPRs the function may allocate, accounting for the
// waves-per-EU constraint, the optional "amdgpu-num-sgpr" attribute, reserved
// special registers and the SGPR-init hardware bug.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 means "ignore the attribute" throughout this block.)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware with the SGPR-init bug must use a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  // Leave room for the reserved special registers, but never exceed the
  // addressable limit.
  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
536 
537 unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
538   const Function &F = MF.getFunction();
539   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
540 
541   // Compute maximum number of VGPRs function can use using default/requested
542   // minimum number of waves per execution unit.
543   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
544   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
545 
546   // Check if maximum number of VGPRs was explicitly requested using
547   // "amdgpu-num-vgpr" attribute.
548   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
549     unsigned Requested = AMDGPU::getIntegerAttribute(
550       F, "amdgpu-num-vgpr", MaxNumVGPRs);
551 
552     // Make sure requested value is compatible with values implied by
553     // default/requested minimum/maximum number of waves per execution unit.
554     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
555       Requested = 0;
556     if (WavesPerEU.second &&
557         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
558       Requested = 0;
559 
560     if (Requested)
561       MaxNumVGPRs = Requested;
562   }
563 
564   return MaxNumVGPRs;
565 }
566 
namespace {
// DAG mutation that links consecutive memory operations of the same kind
// (VMEM/FLAT/SMRD/DS) so the scheduler cannot move them apart.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      // A non-memory instruction breaks the current chain.
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      // Only pair instructions of the same memory kind.
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        // Propagate SU's other predecessors to SUa so nothing can be
        // scheduled between the pair from below.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        // Likewise propagate SUa's successors to SU so nothing can be
        // scheduled between the pair from above.
        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace
617 
618 void SISubtarget::getPostRAMutations(
619     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
620   Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
621 }
622