//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU-specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so they are not enabled by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,"
              "+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

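  // FS is appended last: feature strings are applied left to right, so
  // user-specified features (e.g. an explicit "-promote-alloca") override the
  // defaults added above.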
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't currently support FP64 for EG/NI.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly specified, turn
  // on FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    FMA(false),
    MIMG_R128(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
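  // Illustrative (assumed numbers): 65536 bytes of LDS, MaxWaves = 10,
  // WorkGroupsPerCu = 5, and NWaves = 4 leave each work group
  // 65536 * 10 / 5 / 4 = 32768 bytes.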
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
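  // Illustrative (assumed numbers): 65536 bytes of LDS, MaxWaves = 10, and
  // WorkGroupsPerCu = 4 give Limit = 163840; Bytes = 32768 then yields
  // std::min(163840 / 32768, 10) = 5 waves.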
  return NumWaves;
}

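// With the common wavefront size of 64 (illustrative), the defaults below are
// [128, 256] for compute-like conventions, [1, 64] for graphics shader stages,
// and [1, 1024] otherwise.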
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

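  // For reference (illustrative IR), a kernel restricted to flat work group
  // sizes in [64, 256] would carry:
  //   attributes #0 = { "amdgpu-flat-work-group-size"="64,256" }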
  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default minimum
  // number of waves per execution unit to the value implied by the requested
  // maximum flat work group size.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

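  // For reference (illustrative IR): "amdgpu-waves-per-eu"="2,4" requests
  // between 2 and 4 waves per execution unit, while "amdgpu-waves-per-eu"="2"
  // specifies only the minimum.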
  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum (a
  // maximum need not be specified).
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the range further.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query the result lies in
  // [0, MaxSize), so MaxSize itself is the correct Hi. For a size query the
  // result can equal MaxSize, so pass MaxSize + 1 as Hi.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

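  // Illustrative: with reqd_work_group_size = {64, 1, 1}, a workitem.id.x call
  // gets !range !{i32 0, i32 64}, while a local-size query for x gets
  // !range !{i32 64, i32 65}.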
  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const GCNTargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function &F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
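  // Illustrative (assumed numbers): ExplicitArgBytes = 36, an 8-byte implicit
  // argument alignment, and ImplicitBytes = 56 give
  // alignTo(36, 8) + 56 = 40 + 56 = 96 bytes.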
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

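// Illustrative: a function using 90 SGPRs can run at most 8 waves per EU on
// VI and newer (90 <= 100), but only 5 on earlier generations (90 > 80).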
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
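// Illustrative result of the computation above: a request of
// "amdgpu-num-sgpr"="48" that passes all checks, with 6 reserved SGPRs,
// limits the function to 48 - 6 = 42 allocatable SGPRs.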

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for consecutive memory operations and link them to prevent the
    // scheduler from moving them apart. During DAG pre-processing, the SUnits
    // are still in the original order of the instructions, before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void SISubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}