//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals,
  // but this should be checked. Should we issue a warning somewhere if
  // someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    MIMG_R128(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

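// Return the LDS budget (in bytes) that still allows NWaves waves per
// execution unit: the total LDS is scaled by the ratio of the maximum waves
// per EU to the work groups per CU, then divided across the requested wave
// count. A single wave may use all of local memory.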
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

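// Estimate how many waves per execution unit remain achievable when a kernel
// uses Bytes of LDS, clamped to the range [1, getMaxWavesPerEU()].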
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

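// Default flat work group size range when the function carries no explicit
// attribute: compute/kernel calling conventions assume a multi-wave work
// group, graphics shader stages assume at most a single wave, and everything
// else gets a conservative upper bound of 16 waves.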
202 
203 std::pair<unsigned, unsigned>
204 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
205   switch (CC) {
206   case CallingConv::AMDGPU_CS:
207   case CallingConv::AMDGPU_KERNEL:
208   case CallingConv::SPIR_KERNEL:
209     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
210   case CallingConv::AMDGPU_VS:
211   case CallingConv::AMDGPU_LS:
212   case CallingConv::AMDGPU_HS:
213   case CallingConv::AMDGPU_ES:
214   case CallingConv::AMDGPU_GS:
215   case CallingConv::AMDGPU_PS:
216     return std::make_pair(1, getWavefrontSize());
217   default:
218     return std::make_pair(1, 16 * getWavefrontSize());
219   }
220 }
221 
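// Resolve the flat work group size range for F: start from the calling
// convention default, honor the legacy "amdgpu-max-work-group-size"
// attribute, then apply "amdgpu-flat-work-group-size" if it is present and
// consistent with the subtarget limits; otherwise fall back to the default.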
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

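// Resolve the waves-per-EU range for F. The default minimum is raised to the
// value implied by an explicitly requested flat work group size, and any
// "amdgpu-waves-per-eu" request falls back to the default if it violates the
// subtarget limits or the flat work group size constraints.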
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

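// Attach !range metadata to a workitem id or local size query so later
// optimizations know the result is bounded by the maximum flat work group
// size (or by reqd_work_group_size when present). Returns false if no upper
// bound is known.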
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const GCNTargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

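// Total kernarg segment size: the explicit arguments padded up to the
// implicit-argument pointer alignment, plus the implicit arguments the ABI
// appends after them (if any).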
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

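// Waves per execution unit achievable with a given SGPR budget. VI and newer
// subtargets have a larger per-SIMD SGPR file, so they tolerate higher SGPR
// counts before occupancy drops.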
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

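// Waves per execution unit achievable with a given VGPR budget. The cutoffs
// follow from the 256 VGPRs addressable per wave: roughly 256 / NumWaves,
// rounded down to an allocation granularity of four registers.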
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

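// Number of SGPRs that must be set aside for VCC and, when in use, the flat
// scratch init and XNACK registers.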
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

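// Compute the SGPR budget for MF: start from the limit implied by the
// requested minimum waves per EU, let a valid "amdgpu-num-sgpr" attribute
// override it, clamp for the SGPR init bug if present, and leave room for the
// special registers counted by getReservedNumSGPRs().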
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

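// Compute the VGPR budget for MF, mirroring getMaxNumSGPRs(): the limit
// implied by the requested minimum waves per EU, optionally overridden by a
// valid "amdgpu-num-vgpr" attribute, minus any reserved VGPRs.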
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

namespace {
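// DAG mutation that links back-to-back memory operations of the same kind
// (VMEM, FLAT, SMRD or DS) with artificial edges so the post-RA scheduler
// keeps them clustered together.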
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them to prevent
    // the scheduler from moving them apart.
    // In DAG pre-processing, SUnits are in the original order of the
    // instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void SISubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}