1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUInstructionSelector.h"
20 #include "AMDGPULegalizerInfo.h"
21 #include "AMDGPURegisterBankInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/IR/MDBuilder.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include <algorithm>
28 
29 using namespace llvm;
30 
31 #define DEBUG_TYPE "amdgpu-subtarget"
32 
33 #define GET_SUBTARGETINFO_TARGET_DESC
34 #define GET_SUBTARGETINFO_CTOR
35 #include "AMDGPUGenSubtargetInfo.inc"
36 
// Out-of-line defaulted destructor; defined here (rather than in the header)
// so it is emitted in this TU where member types are complete.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
38 
/// Parse the feature string and fill in subtarget state, applying
/// target/OS-dependent defaults first so that user-specified features (FS,
/// appended last) override them. Returns *this for chaining from the
/// constructor's initializer pattern.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults accumulate here; FS is appended last so explicit user features win.
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed (the .td feature default of 0 means "unspecified").
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target: at least one of MOVREL or
    // VGPR index mode must be available for indirect addressing.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}
97 
/// Construct the common AMDGPU subtarget. Every feature flag is explicitly
/// zero-initialized here; the real values are filled in by
/// initializeSubtargetDependencies() which parses the feature string.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn targets start at SI; everything else is the R600 family.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Speed-of-operation features.
    FastFMAF32(false),
    HalfRate64Ops(false),

    // FP mode / memory-access features.
    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    // OS/runtime related features.
    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    // Optimization toggles.
    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    // ISA capability flags (set from the generation/features).
    FP64(false),
    FMA(false),
    MIMG_R128(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    // R600-family specific flags.
    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}
181 
182 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
183   const Function &F) const {
184   if (NWaves == 1)
185     return getLocalMemorySize();
186   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
187   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
188   unsigned MaxWaves = getMaxWavesPerEU();
189   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
190 }
191 
192 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
193   const Function &F) const {
194   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
195   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
196   unsigned MaxWaves = getMaxWavesPerEU();
197   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
198   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
199   NumWaves = std::min(NumWaves, MaxWaves);
200   NumWaves = std::max(NumWaves, 1u);
201   return NumWaves;
202 }
203 
204 std::pair<unsigned, unsigned>
205 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
206   switch (CC) {
207   case CallingConv::AMDGPU_CS:
208   case CallingConv::AMDGPU_KERNEL:
209   case CallingConv::SPIR_KERNEL:
210     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
211   case CallingConv::AMDGPU_VS:
212   case CallingConv::AMDGPU_LS:
213   case CallingConv::AMDGPU_HS:
214   case CallingConv::AMDGPU_ES:
215   case CallingConv::AMDGPU_GS:
216   case CallingConv::AMDGPU_PS:
217     return std::make_pair(1, getWavefrontSize());
218   default:
219     return std::make_pair(1, 16 * getWavefrontSize());
220   }
221 }
222 
223 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
224   const Function &F) const {
225   // FIXME: 1024 if function.
226   // Default minimum/maximum flat work group sizes.
227   std::pair<unsigned, unsigned> Default =
228     getDefaultFlatWorkGroupSize(F.getCallingConv());
229 
230   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
231   // starts using "amdgpu-flat-work-group-size" attribute.
232   Default.second = AMDGPU::getIntegerAttribute(
233     F, "amdgpu-max-work-group-size", Default.second);
234   Default.first = std::min(Default.first, Default.second);
235 
236   // Requested minimum/maximum flat work group sizes.
237   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
238     F, "amdgpu-flat-work-group-size", Default);
239 
240   // Make sure requested minimum is less than requested maximum.
241   if (Requested.first > Requested.second)
242     return Default;
243 
244   // Make sure requested values do not violate subtarget's specifications.
245   if (Requested.first < getMinFlatWorkGroupSize())
246     return Default;
247   if (Requested.second > getMaxFlatWorkGroupSize())
248     return Default;
249 
250   return Requested;
251 }
252 
/// Compute the effective [min, max] waves-per-EU range for \p F, honoring the
/// "amdgpu-waves-per-eu" attribute but rejecting requests that conflict with
/// subtarget limits or with the requested flat workgroup size.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (A second element of 0 means "no maximum was specified".)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
300 
/// Attach !range metadata to a local-ID / local-size query instruction \p I,
/// bounding it by the kernel's flat workgroup size (narrowed by
/// reqd_work_group_size when present). Returns true if metadata was added.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  // IdQuery: the intrinsic returns a workitem ID (range [0, size)) rather
  // than a dimension size (range [1, size]).
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      // Map the intrinsic to its dimension (x=0, y=1, z=2); the AMDGCN and
      // R600 ID intrinsics are queries, the local_size ones are sizes.
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // Dim is 0-2 iff a recognized intrinsic was matched above.
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // No usable upper bound; don't emit degenerate metadata.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
362 
/// R600-family subtarget: stack grows up, no extra alignment padding.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
369 
/// GCN (SI+) subtarget: in addition to the core objects, set up the
/// GlobalISel pipeline pieces (call lowering, legalizer, register banks,
/// instruction selector).
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const GCNTargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));

  // The instruction selector needs the bank info, so create that first.
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}
382 
383 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
384                                       unsigned NumRegionInstrs) const {
385   // Track register pressure so the scheduler can try to decrease
386   // pressure once register usage is above the threshold defined by
387   // SIRegisterInfo::getRegPressureSetLimit()
388   Policy.ShouldTrackPressure = true;
389 
390   // Enabling both top down and bottom up scheduling seems to give us less
391   // register spills than just using one of these approaches on its own.
392   Policy.OnlyTopDown = false;
393   Policy.OnlyBottomUp = false;
394 
395   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
396   if (!enableSIScheduler())
397     Policy.ShouldTrackLaneMasks = true;
398 }
399 
400 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
401   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
402 }
403 
404 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
405                                             unsigned ExplicitArgBytes) const {
406   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
407   if (ImplicitBytes == 0)
408     return ExplicitArgBytes;
409 
410   unsigned Alignment = getAlignmentForImplicitArgPtr();
411   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
412 }
413 
414 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
415   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
416     if (SGPRs <= 80)
417       return 10;
418     if (SGPRs <= 88)
419       return 9;
420     if (SGPRs <= 100)
421       return 8;
422     return 7;
423   }
424   if (SGPRs <= 48)
425     return 10;
426   if (SGPRs <= 56)
427     return 9;
428   if (SGPRs <= 64)
429     return 8;
430   if (SGPRs <= 72)
431     return 7;
432   if (SGPRs <= 80)
433     return 6;
434   return 5;
435 }
436 
437 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
438   if (VGPRs <= 24)
439     return 10;
440   if (VGPRs <= 28)
441     return 9;
442   if (VGPRs <= 32)
443     return 8;
444   if (VGPRs <= 36)
445     return 7;
446   if (VGPRs <= 40)
447     return 6;
448   if (VGPRs <= 48)
449     return 5;
450   if (VGPRs <= 64)
451     return 4;
452   if (VGPRs <= 84)
453     return 3;
454   if (VGPRs <= 128)
455     return 2;
456   return 1;
457 }
458 
459 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
460   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
461   if (MFI.hasFlatScratchInit()) {
462     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
463       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
464     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
465       return 4; // FLAT_SCRATCH, VCC (in that order).
466   }
467 
468   if (isXNACKEnabled())
469     return 4; // XNACK, VCC (in that order).
470   return 2; // VCC.
471 }
472 
/// Maximum number of SGPRs the function may allocate, after honoring the
/// "amdgpu-num-sgpr" attribute, the waves-per-EU constraints, preloaded
/// input SGPRs, hardware bugs, and the reserved special registers.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute. A Requested value of 0 below means
  // "invalid request; fall back to the computed maximum".
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications:
    // it must leave room beyond the reserved special registers.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware workaround: affected chips must always allocate a fixed number.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  // Subtract the reserved registers, but never exceed the addressable limit.
  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
522 
/// Maximum number of VGPRs the function may allocate, after honoring the
/// "amdgpu-num-vgpr" attribute and the waves-per-EU constraints, minus any
/// reserved VGPRs.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute. A Requested value of 0 below means
  // "invalid request; fall back to the computed maximum".
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications:
    // it must leave room beyond the reserved registers.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
556 
namespace {
/// DAG mutation that links runs of consecutive memory operations of the same
/// kind (VMEM/FLAT/SMRD/DS) so the post-RA scheduler cannot pull them apart,
/// preserving the clustering established before scheduling.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    // SUa trails one SUnit behind SU: the previous memory op, or null after a
    // non-memory instruction breaks the run.
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        // Non-memory instruction ends the current run.
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        // First memory op of a new run.
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      // Only cluster ops of the same memory kind.
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        // Force SU to stay immediately after SUa.
        SU.addPredBarrier(SUa);

        // Copy SU's other predecessors onto SUa so nothing can be scheduled
        // between the pair from below.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        // Likewise, copy SUa's successors onto SU (skipping the exit node)
        // so nothing can slip between the pair from above.
        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace
607 
/// Register target-specific post-RA scheduler DAG mutations; currently only
/// the memory-op clustering mutation above.
void SISubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}
612