1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
// Pull in the subtarget descriptions and constructors from the two generated
// .inc files (presumably produced by TableGen -- confirm against the build).
//
// While AMDGPUGenSubtargetInfo.inc is being included, the identifier
// AMDGPUSubtarget is #defined to GCNSubtarget, so every use of that name inside
// the included file refers to GCNSubtarget instead. The macro is removed again
// before R600GenSubtargetInfo.inc is included.
//
// NOTE(review): the GET_SUBTARGETINFO_* macros are defined a second time
// before the R600 include, presumably because the first .inc file #undefs
// them -- confirm.
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"
42 
// The destructor is defaulted out of line, in this file.
// NOTE(review): presumably this keeps destruction of the owning pointer
// members that the constructor resets (CallLoweringInfo, Legalizer,
// RegBankInfo, InstSelector) in a translation unit where their pointee types
// are complete -- confirm against the header.
GCNSubtarget::~GCNSubtarget() = default;
44 
45 R600Subtarget &
46 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
47                                                StringRef GPU, StringRef FS) {
48   SmallString<256> FullFS("+promote-alloca,");
49   FullFS += FS;
50   ParseSubtargetFeatures(GPU, FullFS);
51 
52   // FIXME: I don't think think Evergreen has any useful support for
53   // denormals, but should be checked. Should we issue a warning somewhere
54   // if someone tries to enable these?
55   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
56     FP32Denormals = false;
57   }
58 
59   HasMulU24 = getGeneration() >= EVERGREEN;
60   HasMulI24 = hasCaymanISA();
61 
62   return *this;
63 }
64 
65 GCNSubtarget &
66 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
67                                                  StringRef GPU, StringRef FS) {
68   // Determine default and user-specified characteristics
69   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
70   // enabled, but some instructions do not respect them and they run at the
71   // double precision rate, so don't enable by default.
72   //
73   // We want to be able to turn these off, but making this a subtarget feature
74   // for SI has the unhelpful behavior that it unsets everything else if you
75   // disable it.
76   //
77   // Similarly we want enable-prt-strict-null to be on by default and not to
78   // unset everything else if it is disabled
79 
80   SmallString<256> FullFS("+promote-alloca,+load-store-opt,");
81 
82   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
83     FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
84 
85   // FIXME: I don't think think Evergreen has any useful support for
86   // denormals, but should be checked. Should we issue a warning somewhere
87   // if someone tries to enable these?
88   if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
89     FullFS += "+fp64-fp16-denormals,";
90   } else {
91     FullFS += "-fp32-denormals,";
92   }
93 
94   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
95 
96   FullFS += FS;
97 
98   ParseSubtargetFeatures(GPU, FullFS);
99 
100   // We don't support FP64 for EG/NI atm.
101   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
102 
103   // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
104   // on VI and newer hardware to avoid assertion failures due to missing ADDR64
105   // variants of MUBUF instructions.
106   if (!hasAddr64() && !FS.contains("flat-for-global")) {
107     FlatForGlobal = true;
108   }
109 
110   // Set defaults if needed.
111   if (MaxPrivateElementSize == 0)
112     MaxPrivateElementSize = 4;
113 
114   if (LDSBankCount == 0)
115     LDSBankCount = 32;
116 
117   if (TT.getArch() == Triple::amdgcn) {
118     if (LocalMemorySize == 0)
119       LocalMemorySize = 32768;
120 
121     // Do something sensible for unspecified target.
122     if (!HasMovrel && !HasVGPRIndexMode)
123       HasMovrel = true;
124   }
125 
126   // Don't crash on invalid devices.
127   if (WavefrontSize == 0)
128     WavefrontSize = 64;
129 
130   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
131 
132   return *this;
133 }
134 
// Construct the common subtarget state for triple \p TT. Every feature flag
// starts out false except HasMulI24, HasMulU24 and HasFminFmaxLegacy, which
// start out true.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  // Zero means "not set yet" for the two sizes below;
  // GCNSubtarget::initializeSubtargetDependencies replaces a zero with a
  // default value.
  LocalMemorySize(0),
  WavefrontSize(0)
  { }
152 
// Construct a GCN subtarget for triple \p TT, processor \p GPU and feature
// string \p FS. Every feature member is first given a neutral value (false or
// 0); the real values are filled in by initializeSubtargetDependencies(),
// which is invoked from the InstrInfo initializer below. The constructor body
// then creates the call lowering, legalizer, register bank info and
// instruction selector objects.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    // Starting generation, used before the feature string is parsed:
    // SEA_ISLANDS for the AMDHSA OS, SOUTHERN_ISLANDS otherwise.
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    // Zero means "not set yet"; initializeSubtargetDependencies() substitutes
    // defaults for these two.
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    // Parsing the feature string happens here, as a side effect of computing
    // the argument for InstrInfo.
    // NOTE(review): members are initialized in declaration order, not in the
    // order written here, so this relies on InstrInfo being declared after
    // all of the feature members above -- confirm against the header.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // Create the remaining helper objects; each one is built from this
  // subtarget or from an object this subtarget already owns.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  // The selector takes the concrete AMDGPURegisterBankInfo created just above,
  // hence the downcast of the stored pointer.
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
228 
229 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
230   const Function &F) const {
231   if (NWaves == 1)
232     return getLocalMemorySize();
233   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
234   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
235   unsigned MaxWaves = getMaxWavesPerEU();
236   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
237 }
238 
239 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
240   const Function &F) const {
241   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
242   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
243   unsigned MaxWaves = getMaxWavesPerEU();
244   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
245   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
246   NumWaves = std::min(NumWaves, MaxWaves);
247   NumWaves = std::max(NumWaves, 1u);
248   return NumWaves;
249 }
250 
251 unsigned
252 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
253   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
254   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
255 }
256 
257 std::pair<unsigned, unsigned>
258 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
259   switch (CC) {
260   case CallingConv::AMDGPU_CS:
261   case CallingConv::AMDGPU_KERNEL:
262   case CallingConv::SPIR_KERNEL:
263     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
264   case CallingConv::AMDGPU_VS:
265   case CallingConv::AMDGPU_LS:
266   case CallingConv::AMDGPU_HS:
267   case CallingConv::AMDGPU_ES:
268   case CallingConv::AMDGPU_GS:
269   case CallingConv::AMDGPU_PS:
270     return std::make_pair(1, getWavefrontSize());
271   default:
272     return std::make_pair(1, 16 * getWavefrontSize());
273   }
274 }
275 
276 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
277   const Function &F) const {
278   // FIXME: 1024 if function.
279   // Default minimum/maximum flat work group sizes.
280   std::pair<unsigned, unsigned> Default =
281     getDefaultFlatWorkGroupSize(F.getCallingConv());
282 
283   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
284   // starts using "amdgpu-flat-work-group-size" attribute.
285   Default.second = AMDGPU::getIntegerAttribute(
286     F, "amdgpu-max-work-group-size", Default.second);
287   Default.first = std::min(Default.first, Default.second);
288 
289   // Requested minimum/maximum flat work group sizes.
290   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
291     F, "amdgpu-flat-work-group-size", Default);
292 
293   // Make sure requested minimum is less than requested maximum.
294   if (Requested.first > Requested.second)
295     return Default;
296 
297   // Make sure requested values do not violate subtarget's specifications.
298   if (Requested.first < getMinFlatWorkGroupSize())
299     return Default;
300   if (Requested.second > getMaxFlatWorkGroupSize())
301     return Default;
302 
303   return Requested;
304 }
305 
306 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
307   const Function &F) const {
308   // Default minimum/maximum number of waves per execution unit.
309   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
310 
311   // Default/requested minimum/maximum flat work group sizes.
312   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
313 
314   // If minimum/maximum flat work group sizes were explicitly requested using
315   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
316   // number of waves per execution unit to values implied by requested
317   // minimum/maximum flat work group sizes.
318   unsigned MinImpliedByFlatWorkGroupSize =
319     getMaxWavesPerEU(FlatWorkGroupSizes.second);
320   bool RequestedFlatWorkGroupSize = false;
321 
322   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
323   // starts using "amdgpu-flat-work-group-size" attribute.
324   if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
325       F.hasFnAttribute("amdgpu-flat-work-group-size")) {
326     Default.first = MinImpliedByFlatWorkGroupSize;
327     RequestedFlatWorkGroupSize = true;
328   }
329 
330   // Requested minimum/maximum number of waves per execution unit.
331   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
332     F, "amdgpu-waves-per-eu", Default, true);
333 
334   // Make sure requested minimum is less than requested maximum.
335   if (Requested.second && Requested.first > Requested.second)
336     return Default;
337 
338   // Make sure requested values do not violate subtarget's specifications.
339   if (Requested.first < getMinWavesPerEU() ||
340       Requested.first > getMaxWavesPerEU())
341     return Default;
342   if (Requested.second > getMaxWavesPerEU())
343     return Default;
344 
345   // Make sure requested values are compatible with values implied by requested
346   // minimum/maximum flat work group sizes.
347   if (RequestedFlatWorkGroupSize &&
348       Requested.first < MinImpliedByFlatWorkGroupSize)
349     return Default;
350 
351   return Requested;
352 }
353 
354 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
355   Function *Kernel = I->getParent()->getParent();
356   unsigned MinSize = 0;
357   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
358   bool IdQuery = false;
359 
360   // If reqd_work_group_size is present it narrows value down.
361   if (auto *CI = dyn_cast<CallInst>(I)) {
362     const Function *F = CI->getCalledFunction();
363     if (F) {
364       unsigned Dim = UINT_MAX;
365       switch (F->getIntrinsicID()) {
366       case Intrinsic::amdgcn_workitem_id_x:
367       case Intrinsic::r600_read_tidig_x:
368         IdQuery = true;
369         LLVM_FALLTHROUGH;
370       case Intrinsic::r600_read_local_size_x:
371         Dim = 0;
372         break;
373       case Intrinsic::amdgcn_workitem_id_y:
374       case Intrinsic::r600_read_tidig_y:
375         IdQuery = true;
376         LLVM_FALLTHROUGH;
377       case Intrinsic::r600_read_local_size_y:
378         Dim = 1;
379         break;
380       case Intrinsic::amdgcn_workitem_id_z:
381       case Intrinsic::r600_read_tidig_z:
382         IdQuery = true;
383         LLVM_FALLTHROUGH;
384       case Intrinsic::r600_read_local_size_z:
385         Dim = 2;
386         break;
387       default:
388         break;
389       }
390       if (Dim <= 3) {
391         if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
392           if (Node->getNumOperands() == 3)
393             MinSize = MaxSize = mdconst::extract<ConstantInt>(
394                                   Node->getOperand(Dim))->getZExtValue();
395       }
396     }
397   }
398 
399   if (!MaxSize)
400     return false;
401 
402   // Range metadata is [Lo, Hi). For ID query we need to pass max size
403   // as Hi. For size query we need to pass Hi + 1.
404   if (IdQuery)
405     MinSize = 0;
406   else
407     ++MaxSize;
408 
409   MDBuilder MDB(I->getContext());
410   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
411                                                   APInt(32, MaxSize));
412   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
413   return true;
414 }
415 
416 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
417                                                  unsigned &MaxAlign) const {
418   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
419          F.getCallingConv() == CallingConv::SPIR_KERNEL);
420 
421   const DataLayout &DL = F.getParent()->getDataLayout();
422   uint64_t ExplicitArgBytes = 0;
423   MaxAlign = 1;
424 
425   for (const Argument &Arg : F.args()) {
426     Type *ArgTy = Arg.getType();
427 
428     unsigned Align = DL.getABITypeAlignment(ArgTy);
429     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
430     ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
431     MaxAlign = std::max(MaxAlign, Align);
432   }
433 
434   return ExplicitArgBytes;
435 }
436 
437 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
438                                                 unsigned &MaxAlign) const {
439   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
440 
441   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
442 
443   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
444   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
445   if (ImplicitBytes != 0) {
446     unsigned Alignment = getAlignmentForImplicitArgPtr();
447     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
448   }
449 
450   // Being able to dereference past the end is useful for emitting scalar loads.
451   return alignTo(TotalSize, 4);
452 }
453 
// Construct an R600 subtarget for triple \p TT, processor \p GPU and feature
// string \p FS. Feature members start out false/0 and Gen starts as R600; the
// feature string is parsed by initializeSubtargetDependencies(), which is
// invoked from the TLInfo initializer below.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  // NOTE(review): members are initialized in declaration order, not in the
  // order written here; this relies on TLInfo being declared after the
  // feature members that the parse updates -- confirm against the header.
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }
470 
471 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
472                                       unsigned NumRegionInstrs) const {
473   // Track register pressure so the scheduler can try to decrease
474   // pressure once register usage is above the threshold defined by
475   // SIRegisterInfo::getRegPressureSetLimit()
476   Policy.ShouldTrackPressure = true;
477 
478   // Enabling both top down and bottom up scheduling seems to give us less
479   // register spills than just using one of these approaches on its own.
480   Policy.OnlyTopDown = false;
481   Policy.OnlyBottomUp = false;
482 
483   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
484   if (!enableSIScheduler())
485     Policy.ShouldTrackLaneMasks = true;
486 }
487 
488 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
489   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
490     if (SGPRs <= 80)
491       return 10;
492     if (SGPRs <= 88)
493       return 9;
494     if (SGPRs <= 100)
495       return 8;
496     return 7;
497   }
498   if (SGPRs <= 48)
499     return 10;
500   if (SGPRs <= 56)
501     return 9;
502   if (SGPRs <= 64)
503     return 8;
504   if (SGPRs <= 72)
505     return 7;
506   if (SGPRs <= 80)
507     return 6;
508   return 5;
509 }
510 
511 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
512   if (VGPRs <= 24)
513     return 10;
514   if (VGPRs <= 28)
515     return 9;
516   if (VGPRs <= 32)
517     return 8;
518   if (VGPRs <= 36)
519     return 7;
520   if (VGPRs <= 40)
521     return 6;
522   if (VGPRs <= 48)
523     return 5;
524   if (VGPRs <= 64)
525     return 4;
526   if (VGPRs <= 84)
527     return 3;
528   if (VGPRs <= 128)
529     return 2;
530   return 1;
531 }
532 
533 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
534   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
535   if (MFI.hasFlatScratchInit()) {
536     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
537       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
538     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
539       return 4; // FLAT_SCRATCH, VCC (in that order).
540   }
541 
542   if (isXNACKEnabled())
543     return 4; // XNACK, VCC (in that order).
544   return 2; // VCC.
545 }
546 
547 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
548   const Function &F = MF.getFunction();
549   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
550 
551   // Compute maximum number of SGPRs function can use using default/requested
552   // minimum number of waves per execution unit.
553   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
554   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
555   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
556 
557   // Check if maximum number of SGPRs was explicitly requested using
558   // "amdgpu-num-sgpr" attribute.
559   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
560     unsigned Requested = AMDGPU::getIntegerAttribute(
561       F, "amdgpu-num-sgpr", MaxNumSGPRs);
562 
563     // Make sure requested value does not violate subtarget's specifications.
564     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
565       Requested = 0;
566 
567     // If more SGPRs are required to support the input user/system SGPRs,
568     // increase to accommodate them.
569     //
570     // FIXME: This really ends up using the requested number of SGPRs + number
571     // of reserved special registers in total. Theoretically you could re-use
572     // the last input registers for these special registers, but this would
573     // require a lot of complexity to deal with the weird aliasing.
574     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
575     if (Requested && Requested < InputNumSGPRs)
576       Requested = InputNumSGPRs;
577 
578     // Make sure requested value is compatible with values implied by
579     // default/requested minimum/maximum number of waves per execution unit.
580     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
581       Requested = 0;
582     if (WavesPerEU.second &&
583         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
584       Requested = 0;
585 
586     if (Requested)
587       MaxNumSGPRs = Requested;
588   }
589 
590   if (hasSGPRInitBug())
591     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
592 
593   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
594                   MaxAddressableNumSGPRs);
595 }
596 
597 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
598   const Function &F = MF.getFunction();
599   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
600 
601   // Compute maximum number of VGPRs function can use using default/requested
602   // minimum number of waves per execution unit.
603   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
604   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
605 
606   // Check if maximum number of VGPRs was explicitly requested using
607   // "amdgpu-num-vgpr" attribute.
608   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
609     unsigned Requested = AMDGPU::getIntegerAttribute(
610       F, "amdgpu-num-vgpr", MaxNumVGPRs);
611 
612     // Make sure requested value is compatible with values implied by
613     // default/requested minimum/maximum number of waves per execution unit.
614     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
615       Requested = 0;
616     if (WavesPerEU.second &&
617         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
618       Requested = 0;
619 
620     if (Requested)
621       MaxNumVGPRs = Requested;
622   }
623 
624   return MaxNumVGPRs;
625 }
626 
627 namespace {
628 struct MemOpClusterMutation : ScheduleDAGMutation {
629   const SIInstrInfo *TII;
630 
631   MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
632 
633   void apply(ScheduleDAGInstrs *DAG) override {
634     SUnit *SUa = nullptr;
635     // Search for two consequent memory operations and link them
636     // to prevent scheduler from moving them apart.
637     // In DAG pre-process SUnits are in the original order of
638     // the instructions before scheduling.
639     for (SUnit &SU : DAG->SUnits) {
640       MachineInstr &MI2 = *SU.getInstr();
641       if (!MI2.mayLoad() && !MI2.mayStore()) {
642         SUa = nullptr;
643         continue;
644       }
645       if (!SUa) {
646         SUa = &SU;
647         continue;
648       }
649 
650       MachineInstr &MI1 = *SUa->getInstr();
651       if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
652           (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
653           (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
654           (TII->isDS(MI1)   && TII->isDS(MI2))) {
655         SU.addPredBarrier(SUa);
656 
657         for (const SDep &SI : SU.Preds) {
658           if (SI.getSUnit() != SUa)
659             SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
660         }
661 
662         if (&SU != &DAG->ExitSU) {
663           for (const SDep &SI : SUa->Succs) {
664             if (SI.getSUnit() != &SU)
665               SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
666           }
667         }
668       }
669 
670       SUa = &SU;
671     }
672   }
673 };
674 } // namespace
675 
676 void GCNSubtarget::getPostRAMutations(
677     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
678   Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
679 }
680 
681 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
682   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
683     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
684   else
685     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
686 }
687 
688 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
689   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
690     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
691   else
692     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
693 }
694