//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is specified explicitly (+/-), turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    MIMG_R128(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

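// Estimate how much LDS each of NWaves waves may use while still allowing
// that many waves to be resident: scale the per-CU LDS size by the ratio of
// maximum waves per EU to workgroups per CU, then divide among NWaves.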
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

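// Estimate the achievable wave occupancy when each workgroup uses `Bytes`
// bytes of LDS, clamped to the range [1, getMaxWavesPerEU()].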
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

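// Default flat workgroup size range by calling convention: compute kernels
// default to 2-4 wavefronts per workgroup, graphics shaders to at most one
// wavefront, and everything else to up to 16 wavefronts.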
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

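// Compute the effective minimum/maximum flat workgroup size for F, starting
// from the calling-convention defaults and then applying the function
// attributes; fall back to the defaults if the request is inconsistent or
// outside the subtarget's limits.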
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

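// Compute the effective minimum/maximum number of waves per execution unit
// for F. Explicit workgroup-size attributes raise the default minimum to the
// value they imply, and any "amdgpu-waves-per-eu" request is validated
// against the subtarget limits before being honored.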
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

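// Attach !range metadata to a workitem-id or local-size query so later
// optimizations know the bounds implied by the flat workgroup size (or by
// reqd_work_group_size, when present).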
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const GCNTargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using just one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

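// Size of the kernarg segment: the explicit arguments, padded to the
// implicit-argument alignment, followed by the implicit arguments (if any).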
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

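// Map an SGPR count to the number of waves that can be resident per EU;
// VI and newer use a different table than earlier generations.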
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

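// Map a VGPR count to the number of waves that can be resident per EU.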
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

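// Number of SGPRs that must be reserved for VCC and, depending on the
// subtarget and function, XNACK and FLAT_SCRATCH.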
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

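// Maximum number of SGPRs the function may use, derived from the
// waves-per-EU constraints and optionally narrowed by the "amdgpu-num-sgpr"
// attribute, minus the SGPRs reserved for VCC/XNACK/FLAT_SCRATCH.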
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

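// Maximum number of VGPRs the function may use, derived from the
// waves-per-EU constraints and optionally narrowed by the "amdgpu-num-vgpr"
// attribute, minus any reserved VGPRs.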
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

namespace {
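// Post-RA scheduler DAG mutation that keeps runs of memory operations of the
// same kind (VMEM, FLAT, SMRD, DS) clustered together so the scheduler does
// not pull them apart.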
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, the SUnits are still in the original
    // order of the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void SISubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}