1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUInstructionSelector.h"
20 #include "AMDGPULegalizerInfo.h"
21 #include "AMDGPURegisterBankInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/IR/MDBuilder.h"
26 #include "llvm/Target/TargetFrameLowering.h"
27 #include <algorithm>
28 
29 using namespace llvm;
30 
31 #define DEBUG_TYPE "amdgpu-subtarget"
32 
33 #define GET_SUBTARGETINFO_TARGET_DESC
34 #define GET_SUBTARGETINFO_CTOR
35 #include "AMDGPUGenSubtargetInfo.inc"
36 
// Defaulted out-of-line so the destructor is emitted in this translation unit.
AMDGPUSubtarget::~AMDGPUSubtarget() = default;
38 
/// Parse the feature string and establish subtarget defaults that cannot be
/// expressed as plain subtarget features (e.g. settings that depend on the OS
/// or on other features being left unset). Returns *this for convenience.
AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // Append the user feature string last so explicit user settings override
  // the defaults prepended above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    // Pre-SI hardware: force all denormal modes off.
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}
92 
/// Construct the common AMDGPU subtarget. Every feature flag starts from its
/// "off"/zero default here; ParseSubtargetFeatures (invoked via
/// initializeSubtargetDependencies below) then enables whatever the CPU and
/// feature strings request.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    // amdgcn triples start at Southern Islands; anything else is R600-class.
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  // Cache the address-space mapping for this triple, then resolve the
  // feature string and target-specific defaults.
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}
171 
172 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
173   const Function &F) const {
174   if (NWaves == 1)
175     return getLocalMemorySize();
176   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
177   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
178   unsigned MaxWaves = getMaxWavesPerEU();
179   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
180 }
181 
182 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
183   const Function &F) const {
184   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
185   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
186   unsigned MaxWaves = getMaxWavesPerEU();
187   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
188   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
189   NumWaves = std::min(NumWaves, MaxWaves);
190   NumWaves = std::max(NumWaves, 1u);
191   return NumWaves;
192 }
193 
194 std::pair<unsigned, unsigned>
195 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
196   switch (CC) {
197   case CallingConv::AMDGPU_CS:
198   case CallingConv::AMDGPU_KERNEL:
199   case CallingConv::SPIR_KERNEL:
200     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
201   case CallingConv::AMDGPU_VS:
202   case CallingConv::AMDGPU_LS:
203   case CallingConv::AMDGPU_HS:
204   case CallingConv::AMDGPU_ES:
205   case CallingConv::AMDGPU_GS:
206   case CallingConv::AMDGPU_PS:
207     return std::make_pair(1, getWavefrontSize());
208   default:
209     return std::make_pair(1, 16 * getWavefrontSize());
210   }
211 }
212 
213 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
214   const Function &F) const {
215   // FIXME: 1024 if function.
216   // Default minimum/maximum flat work group sizes.
217   std::pair<unsigned, unsigned> Default =
218     getDefaultFlatWorkGroupSize(F.getCallingConv());
219 
220   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
221   // starts using "amdgpu-flat-work-group-size" attribute.
222   Default.second = AMDGPU::getIntegerAttribute(
223     F, "amdgpu-max-work-group-size", Default.second);
224   Default.first = std::min(Default.first, Default.second);
225 
226   // Requested minimum/maximum flat work group sizes.
227   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
228     F, "amdgpu-flat-work-group-size", Default);
229 
230   // Make sure requested minimum is less than requested maximum.
231   if (Requested.first > Requested.second)
232     return Default;
233 
234   // Make sure requested values do not violate subtarget's specifications.
235   if (Requested.first < getMinFlatWorkGroupSize())
236     return Default;
237   if (Requested.second > getMaxFlatWorkGroupSize())
238     return Default;
239 
240   return Requested;
241 }
242 
/// Compute the [min, max] number of waves per execution unit for \p F,
/// honoring the "amdgpu-waves-per-eu" attribute and any constraints implied
/// by an explicitly requested flat work group size. Falls back to the
/// default range whenever a requested value is inconsistent or out of the
/// subtarget's range.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (A zero maximum means "unspecified" and is not checked.)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
290 
/// Attach !range metadata to a work-item id / local-size query instruction
/// \p I, bounding it by the containing function's flat work group size
/// (narrowed by "reqd_work_group_size" metadata when present). Returns true
/// if range metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Map the intrinsic to the dimension (x/y/z) it queries; id queries
      // get an exclusive upper bound, size queries an inclusive one.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // Dim is only ever 0-2 here, or UINT_MAX when the callee was not a
      // recognized intrinsic.
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
352 
// R600-family subtarget: wires up the R600 instruction info, frame lowering
// (stack grows up, no local-area offset) and target lowering objects.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}
359 
360 SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
361                          const TargetMachine &TM)
362     : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
363       FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
364       TLInfo(TM, *this) {
365   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
366   Legalizer.reset(new AMDGPULegalizerInfo());
367 
368   RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
369   InstSelector.reset(new AMDGPUInstructionSelector(
370       *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
371 }
372 
373 void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
374                                       unsigned NumRegionInstrs) const {
375   // Track register pressure so the scheduler can try to decrease
376   // pressure once register usage is above the threshold defined by
377   // SIRegisterInfo::getRegPressureSetLimit()
378   Policy.ShouldTrackPressure = true;
379 
380   // Enabling both top down and bottom up scheduling seems to give us less
381   // register spills than just using one of these approaches on its own.
382   Policy.OnlyTopDown = false;
383   Policy.OnlyBottomUp = false;
384 
385   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
386   if (!enableSIScheduler())
387     Policy.ShouldTrackLaneMasks = true;
388 }
389 
390 bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
391   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
392 }
393 
394 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
395                                             unsigned ExplicitArgBytes) const {
396   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
397   if (ImplicitBytes == 0)
398     return ExplicitArgBytes;
399 
400   unsigned Alignment = getAlignmentForImplicitArgPtr();
401   return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
402 }
403 
404 unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
405   if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
406     if (SGPRs <= 80)
407       return 10;
408     if (SGPRs <= 88)
409       return 9;
410     if (SGPRs <= 100)
411       return 8;
412     return 7;
413   }
414   if (SGPRs <= 48)
415     return 10;
416   if (SGPRs <= 56)
417     return 9;
418   if (SGPRs <= 64)
419     return 8;
420   if (SGPRs <= 72)
421     return 7;
422   if (SGPRs <= 80)
423     return 6;
424   return 5;
425 }
426 
427 unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
428   if (VGPRs <= 24)
429     return 10;
430   if (VGPRs <= 28)
431     return 9;
432   if (VGPRs <= 32)
433     return 8;
434   if (VGPRs <= 36)
435     return 7;
436   if (VGPRs <= 40)
437     return 6;
438   if (VGPRs <= 48)
439     return 5;
440   if (VGPRs <= 64)
441     return 4;
442   if (VGPRs <= 84)
443     return 3;
444   if (VGPRs <= 128)
445     return 2;
446   return 1;
447 }
448 
449 unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
450   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
451   if (MFI.hasFlatScratchInit()) {
452     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
453       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
454     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
455       return 4; // FLAT_SCRATCH, VCC (in that order).
456   }
457 
458   if (isXNACKEnabled())
459     return 4; // XNACK, VCC (in that order).
460   return 2; // VCC.
461 }
462 
/// Return the maximum number of SGPRs the function may use, accounting for
/// the "amdgpu-num-sgpr" attribute, waves-per-EU constraints, reserved
/// special registers, and the SGPR-init hardware bug.
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 means "invalid; fall back to the computed maximum".)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware with the SGPR init bug must use a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
512 
/// Return the maximum number of VGPRs the function may use, accounting for
/// the "amdgpu-num-vgpr" attribute, waves-per-EU constraints, and reserved
/// VGPRs.
unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 means "invalid; fall back to the computed maximum".)
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
546 
namespace {
/// Post-RA scheduling DAG mutation that ties together adjacent memory
/// operations of the same kind (VMEM/FLAT/SMRD/DS) with artificial edges so
/// the scheduler cannot move them apart.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII; // Used to classify memory instructions.

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        // Non-memory instruction: break the current run.
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        // First memory op of a potential pair.
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      // Only cluster memory operations of the same class.
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        // Copy SU's other predecessors onto SUa as artificial edges so no
        // instruction can be scheduled between the pair.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          // Likewise mirror SUa's successors onto SU.
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace
597 
/// Register post-RA scheduler DAG mutations; currently only the memory
/// clustering mutation.
void SISubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}
602