//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly specified, turn
  // on FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    MIMG_R128(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
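  // LDS is shared by every workgroup resident on the CU. At full occupancy
  // (MaxWaves waves per EU) each of the WorkGroupsPerCu groups gets an equal
  // share; at the lower occupancy of NWaves, each group's share grows by a
  // factor of MaxWaves / NWaves.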
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
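  // Estimate how many waves can stay resident: occupancy scales with the
  // ratio between the per-group LDS share available at full occupancy and the
  // LDS actually used, then is clamped to the legal [1, MaxWaves] range.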
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
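    // Any other calling convention defaults to the hardware maximum of 16
    // wavefronts per work group.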
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: Default maximum should be 1024 for non-kernel functions.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;
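  // IdQuery marks calls that return a work item ID (values in [0, size)), as
  // opposed to queries of the local size itself (values in [1, size]); the
  // two need different upper bounds in the range metadata computed below.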

  // If reqd_work_group_size is present, it narrows the range further.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query the IDs lie in [0, MaxSize),
  // so MaxSize itself is the correct Hi; for a size query the values lie in
  // [1, MaxSize], so Hi must be MaxSize + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const GCNTargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease pressure
  // once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using just one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const Function &F,
                                            unsigned ExplicitArgBytes) const {
  uint64_t TotalSize = ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);

  if (ImplicitBytes != 0) {
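    // Implicit (hidden) kernel arguments are appended after the explicit
    // ones, with the implicit block aligned for the implicit argument
    // pointer.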
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

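// Maximum occupancy (waves per EU) achievable for a given per-wave SGPR
// count. The thresholds reflect the SGPR file size and allocation
// granularity, which differ between VI+ and earlier generations.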
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

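// Maximum occupancy achievable for a given per-wave VGPR count; the
// thresholds reflect the VGPR file size and allocation granularity.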
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

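  // Targets with the SGPR initialization hardware bug must always allocate a
  // fixed number of SGPRs, regardless of what was computed or requested above.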
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them to prevent
    // the scheduler from moving them apart.
    // In this DAG pre-processing step, SUnits are still in the original
    // program order of the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

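      // Only cluster memory operations of the same kind (VMEM with VMEM, FLAT
      // with FLAT, SMRD with SMRD, DS with DS); mixed pairs are left for the
      // scheduler to order freely.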
      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

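        // Copy the remaining predecessors and successors over as artificial
        // edges so that no unrelated instruction can be scheduled between the
        // two clustered memory operations.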
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void SISubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}