//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"

AMDGPUSubtarget::~AMDGPUSubtarget() = default;

AMDGPUSubtarget &
AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  FullFS += FS;
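  // For illustration (hypothetical user feature string): on an amdhsa target
  // with FS == "+xnack", the string parsed below becomes
  //   "+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,"
  //   "+flat-address-space,+flat-for-global,+unaligned-buffer-access,"
  //   "+trap-handler,+xnack"
  // so the user's features are appended last and can override the defaults.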

  ParseSubtargetFeatures(GPU, FullFS);

  // Unless +flat-for-global or -flat-for-global is explicitly given, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP64FP16Denormals = false;
    FP32Denormals = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const TargetMachine &TM)
  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
    TargetTriple(TT),
    Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600),
    IsaVersion(ISAVersion0_0_0),
    WavefrontSize(0),
    LocalMemorySize(0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP32Denormals(false),
    FP64FP16Denormals(false),
    FPExceptions(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    DumpCode(false),

    FP64(false),
    IsGCN(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    Has16BitInsts(false),
    HasIntClamp(false),
    HasVOP3PInsts(false),
    HasMadMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasInv2PiInlineImm(false),
    HasSDWA(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),

    R600ALUInst(false),
    CaymanISA(false),
    CFALUBug(false),
    HasVertexCache(false),
    TexVTXClauseSize(0),
    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrItins(getInstrItineraryForCPU(GPU)) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initializeSubtargetDependencies(TT, GPU, FS);
}

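// Illustrative worked example (hypothetical occupancy figures): with the
// 32768-byte LDS default set above, MaxWaves = 10, and a work group size that
// allows 2 work groups per CU, a request for NWaves = 4 yields
// 32768 * 10 / 2 / 4 = 40960 bytes of LDS available per work group.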
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

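// Illustrative worked example (hypothetical figures): with
// Limit = 32768 * 10 / 2 = 163840, a kernel using 20000 bytes of LDS can still
// run min(163840 / 20000, 10) = 8 waves, while one using 400000 bytes is
// clamped up to the minimum of 1 wave.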
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

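// With a wavefront size of 64 (the usual GCN value), the defaults below are
// [128, 256] for compute kernels, [1, 64] for graphics shaders, and [1, 1024]
// for every other calling convention.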
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);
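  // For example (illustrative), a function carrying the IR attribute
  // "amdgpu-flat-work-group-size"="64,256" requests work groups of 64 to 256
  // items; the checks below fall back to the defaults if the pair is
  // inconsistent or outside the subtarget's limits.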

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);
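  // For example (illustrative), "amdgpu-waves-per-eu"="2,4" requests between 2
  // and 4 waves per execution unit; because the last argument above is 'true',
  // the second value may be omitted, leaving the maximum at its default.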

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;
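  // For example (illustrative): with a reqd_work_group_size of 64 in this
  // dimension, an ID query gets the range [0, 64) while a size query gets
  // [64, 65), i.e. exactly 64.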

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  AMDGPUSubtarget(TT, GPU, FS, TM),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  TLInfo(TM, *this) {}

SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                         const TargetMachine &TM)
    : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
      FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
      TLInfo(TM, *this) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo());

  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get())));
}

void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

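// Illustrative worked example (hypothetical sizes): with 36 bytes of explicit
// arguments, an 8-byte implicit-argument alignment, and 56 implicit bytes, the
// kernarg segment is alignTo(36, 8) + 56 = 96 bytes.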
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
                                            unsigned ExplicitArgBytes) const {
  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
  if (ImplicitBytes == 0)
    return ExplicitArgBytes;

  unsigned Alignment = getAlignmentForImplicitArgPtr();
  return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}

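// The cutoffs below encode how many waves can stay resident on an execution
// unit for a given SGPR count; VI and newer expose a larger SGPR budget per
// wave, hence the separate, more generous table (the thresholds below are the
// authoritative values).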
unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

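// Assuming the usual 256 VGPRs per SIMD lane and an allocation granularity of
// 4, N resident waves leave floor(256 / N) VGPRs each, rounded down to a
// multiple of 4, which matches the thresholds below (e.g. 10 waves -> 24,
// 9 -> 28, 3 -> 84).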
unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

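// FLAT_SCRATCH, XNACK, and VCC are each 64-bit register pairs, so every
// reservation below accounts for 2 SGPRs per named register.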
unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

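// Both functions below also honor explicit per-function register limits, e.g.
// (illustrative) "amdgpu-num-sgpr"="48" or "amdgpu-num-vgpr"="64", as long as
// the request stays within the subtarget and occupancy constraints checked
// inside.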
unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = *MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && Requested <= getReservedNumVGPRs(MF))
      Requested = 0;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs - getReservedNumVGPRs(MF);
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In this DAG pre-processing step, the SUnits are still in the original
    // order of the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void SISubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}