1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUInstructionSelector.h"
20 #include "AMDGPULegalizerInfo.h"
21 #include "AMDGPURegisterBankInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 #include "llvm/ADT/SmallString.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #include "AMDGPUGenSubtargetInfo.inc"
37 
// Out-of-line defaulted destructor; anchors the class's vtable in this TU.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
39 
// Build the full subtarget feature string (built-in defaults first, then the
// user-supplied features FS so the user can override them), parse it, and
// finally patch up any values the feature string left unspecified.
// Returns *this so the constructor can use it in its initializer chain.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  // Append the user features last so they take precedence over the defaults.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}
98 
// Construct the common AMDGPU subtarget. Every feature flag is explicitly
// zero-initialized here; the real values are filled in afterwards by
// initializeSubtargetDependencies() from the parsed feature string.
// The member-initializer order must match the declaration order in the
// header, so do not reorder entries.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples start at SI; everything else is treated as R600.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Performance characteristics.
    FastFMAF32(false),
    HalfRate64Ops(false),

    // Floating-point mode defaults.
    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    // Optimization toggles.
    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    // ISA capability flags.
    FP64(false),
    FMA(false),
    MIMG_R128(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    // R600-family flags.
    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}
186 
187 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
188   const Function &F) const {
189   if (NWaves == 1)
190     return getLocalMemorySize();
191   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
192   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
193   unsigned MaxWaves = getMaxWavesPerEU();
194   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
195 }
196 
197 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
198   const Function &F) const {
199   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
200   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
201   unsigned MaxWaves = getMaxWavesPerEU();
202   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
203   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
204   NumWaves = std::min(NumWaves, MaxWaves);
205   NumWaves = std::max(NumWaves, 1u);
206   return NumWaves;
207 }
208 
209 unsigned
210 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
211   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
212   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
213 }
214 
215 std::pair<unsigned, unsigned>
216 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
217   switch (CC) {
218   case CallingConv::AMDGPU_CS:
219   case CallingConv::AMDGPU_KERNEL:
220   case CallingConv::SPIR_KERNEL:
221     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
222   case CallingConv::AMDGPU_VS:
223   case CallingConv::AMDGPU_LS:
224   case CallingConv::AMDGPU_HS:
225   case CallingConv::AMDGPU_ES:
226   case CallingConv::AMDGPU_GS:
227   case CallingConv::AMDGPU_PS:
228     return std::make_pair(1, getWavefrontSize());
229   default:
230     return std::make_pair(1, 16 * getWavefrontSize());
231   }
232 }
233 
// Compute the effective [min, max] flat work group size for F, combining
// calling-convention defaults with the "amdgpu-max-work-group-size" and
// "amdgpu-flat-work-group-size" function attributes. Any requested range
// that is inconsistent or violates subtarget limits falls back to Default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  // Keep the default range well-formed if the attribute lowered the maximum.
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
263 
// Compute the effective [min, max] number of waves per execution unit for F.
// Starts from the subtarget maximum, tightens the minimum based on any
// explicitly requested flat work group size, then applies the
// "amdgpu-waves-per-eu" attribute. Invalid or inconsistent requests fall
// back to the computed Default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // A requested maximum of 0 means "unbounded" and skips this check.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
311 
// Attach !range metadata to a local-id or local-size query instruction so
// later optimizations know its value is bounded by the work group size.
// Returns true if metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Dim stays UINT_MAX for intrinsics that don't query a dimension.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      // ID queries get a half-open [0, size) range; size queries fall
      // through to only record the dimension.
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // Dim was set to 0..2 above; the unrecognized-intrinsic sentinel
      // (UINT_MAX) fails this test.
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // No usable upper bound — don't emit degenerate metadata.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
373 
// R600-family subtarget: sets up the R600 instruction info, an upward-growing
// stack frame, and the R600 target lowering.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
380 
// GCN (SI+) subtarget: in addition to codegen info, wires up the GlobalISel
// pipeline objects (call lowering, legalizer, register bank info, and the
// instruction selector, which depends on the register bank info).
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const GCNTargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
393 
394 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
395                                       unsigned NumRegionInstrs) const {
396   // Track register pressure so the scheduler can try to decrease
397   // pressure once register usage is above the threshold defined by
398   // SIRegisterInfo::getRegPressureSetLimit()
399   Policy.ShouldTrackPressure = true;
400 
401   // Enabling both top down and bottom up scheduling seems to give us less
402   // register spills than just using one of these approaches on its own.
403   Policy.OnlyTopDown = false;
404   Policy.OnlyBottomUp = false;
405 
406   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
407   if (!enableSIScheduler())
408     Policy.ShouldTrackLaneMasks = true;
409 }
410 
411 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
412   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
413 }
414 
415 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
416                                             unsigned ExplicitArgBytes) const {
417   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
418   if (ImplicitBytes == 0)
419     return ExplicitArgBytes;
420 
421   unsigned Alignment = getAlignmentForImplicitArgPtr();
422   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
423 }
424 
425 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
426   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
427     if (SGPRs <= 80)
428       return 10;
429     if (SGPRs <= 88)
430       return 9;
431     if (SGPRs <= 100)
432       return 8;
433     return 7;
434   }
435   if (SGPRs <= 48)
436     return 10;
437   if (SGPRs <= 56)
438     return 9;
439   if (SGPRs <= 64)
440     return 8;
441   if (SGPRs <= 72)
442     return 7;
443   if (SGPRs <= 80)
444     return 6;
445   return 5;
446 }
447 
448 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
449   if (VGPRs <= 24)
450     return 10;
451   if (VGPRs <= 28)
452     return 9;
453   if (VGPRs <= 32)
454     return 8;
455   if (VGPRs <= 36)
456     return 7;
457   if (VGPRs <= 40)
458     return 6;
459   if (VGPRs <= 48)
460     return 5;
461   if (VGPRs <= 64)
462     return 4;
463   if (VGPRs <= 84)
464     return 3;
465   if (VGPRs <= 128)
466     return 2;
467   return 1;
468 }
469 
470 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
471   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
472   if (MFI.hasFlatScratchInit()) {
473     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
474       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
475     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
476       return 4; // FLAT_SCRATCH, VCC (in that order).
477   }
478 
479   if (isXNACKEnabled())
480     return 4; // XNACK, VCC (in that order).
481   return 2; // VCC.
482 }
483 
// Maximum number of SGPRs available to MF after accounting for the requested
// waves-per-EU occupancy, the optional "amdgpu-num-sgpr" attribute, reserved
// special registers, and hardware bugs.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // Requested == 0 means "ignore the attribute" throughout the checks below.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware with the SGPR-init bug must always program the fixed count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
533 
// Maximum number of VGPRs available to MF after accounting for the requested
// waves-per-EU occupancy, the optional "amdgpu-num-vgpr" attribute, and any
// reserved VGPRs. Mirrors the SGPR logic in getMaxNumSGPRs above.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // Requested == 0 means "ignore the attribute" throughout the checks below.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
567 
namespace {
// Post-RA scheduler DAG mutation: links pairs of adjacent memory operations
// of the same class (VMEM/FLAT/SMRD/DS) with artificial edges so the
// scheduler cannot move other instructions between them.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII; // Used only to classify memory instructions.

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        // A non-memory instruction breaks the current chain.
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        // First memory operation of a potential pair.
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      // Only cluster when both operations belong to the same memory class.
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        // Route the second node's other predecessors through the first node
        // so nothing can be scheduled between the pair from above.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        // Likewise, make the first node's other successors depend on the
        // second node so nothing slips in from below.
        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      // The second node becomes the first of the next potential pair.
      SUa = &SU;
    }
  }
};
} // namespace
618 
// Register the post-RA DAG mutation that keeps back-to-back memory
// operations of the same class clustered together.
void SISubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}
623