//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";
  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

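// Estimate how many bytes of LDS a single workgroup of F may use while still
// allowing NWaves waves per execution unit. This is the inverse of
// getOccupancyWithLocalMemSize below; with NWaves == 1 the full local memory
// size is available.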
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

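// Estimate the occupancy (waves per execution unit) achievable when each
// workgroup of F uses Bytes bytes of LDS; the result is clamped to
// [1, getMaxWavesPerEU()].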
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

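// Default flat work group size range for a calling convention: compute-like
// conventions default to between two and four wavefronts' worth of work items,
// graphics shader stages to at most one wavefront, and anything else to at
// most 16 wavefronts.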
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

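// Resolve the flat work group size range for F from function attributes,
// falling back to the calling-convention defaults above. For example, a kernel
// carrying the attribute "amdgpu-flat-work-group-size"="64,256" requests a
// minimum of 64 and a maximum of 256 work items; inverted or out-of-range
// requests fall back to the defaults.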
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

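// Resolve the waves-per-EU range for F. The default is [1, getMaxWavesPerEU()]
// unless a flat work group size was requested, in which case the default
// minimum is raised to the value implied by the maximum flat work group size.
// A request such as "amdgpu-waves-per-eu"="2,4" is honored only if it stays
// within the subtarget limits and is compatible with the flat work group size.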
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

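// Attach !range metadata to a workitem-id or local-size query so later passes
// can assume it is bounded by the (reqd_)work group size. For example, a call
// to llvm.amdgcn.workitem.id.x in a kernel with reqd_work_group_size of
// {256, 1, 1} gets the range [0, 256). Returns false if no upper bound is
// known.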
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo());

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

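// Total kernarg segment size: the explicit argument bytes, padded to the
// implicit argument pointer alignment and followed by the implicit argument
// bytes when the target expects implicit arguments.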
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

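// Map an SGPR count to the number of waves per SIMD the hardware can still
// run; the breakpoints differ between SI/CI and VI+. For example, 96 SGPRs
// allow 8 waves on VI and newer but only 5 on SI/CI.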
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

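// Same mapping for VGPRs; the breakpoints are consistent with dividing a
// 256-register VGPR file per SIMD among the resident waves, so e.g. 64 VGPRs
// per wave allow at most 4 waves.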
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

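// Number of SGPRs that must be set aside for VCC and, when in use, XNACK_MASK
// and FLAT_SCRATCH (each is a pair of SGPRs).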
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

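// Compute the SGPR budget for MF: start from the limit implied by the
// requested minimum waves per EU, optionally tighten it via the
// "amdgpu-num-sgpr" attribute (rejecting values that conflict with reserved
// registers, preloaded input SGPRs, or the waves-per-EU range), then subtract
// the reserved SGPRs and clamp to the addressable maximum.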
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

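// VGPR budget for MF, derived the same way from the waves-per-EU minimum and
// the optional "amdgpu-num-vgpr" attribute, minus any reserved VGPRs.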
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

namespace {
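// DAG mutation that adds artificial edges between back-to-back memory
// operations of the same kind (VMEM, FLAT, SMRD, or DS) so the post-RA
// scheduler keeps them clustered instead of moving them apart.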
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In the DAG pre-process, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void SISubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}