1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUInstructionSelector.h"
20 #include "AMDGPULegalizerInfo.h"
21 #include "AMDGPURegisterBankInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 #include "llvm/ADT/SmallString.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/MC/MCSubtargetInfo.h"
27 #include "llvm/IR/MDBuilder.h"
28 #include "llvm/CodeGen/TargetFrameLowering.h"
29 #include <algorithm>
30 
31 using namespace llvm;
32 
33 #define DEBUG_TYPE "amdgpu-subtarget"
34 
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #define GET_SUBTARGETINFO_CTOR
37 #define AMDGPUSubtarget GCNSubtarget
38 #include "AMDGPUGenSubtargetInfo.inc"
39 #define GET_SUBTARGETINFO_TARGET_DESC
40 #define GET_SUBTARGETINFO_CTOR
41 #undef AMDGPUSubtarget
42 #include "R600GenSubtargetInfo.inc"
43 
// Out-of-line defaulted destructor: the class holds unique_ptr members
// (CallLoweringInfo, Legalizer, RegBankInfo, InstSelector — see the ctor
// body below), so the destructor must be emitted where their pointee types
// are complete.
GCNSubtarget::~GCNSubtarget() = default;
45 
// Parse the subtarget feature string (with R600 defaults prepended) and then
// fix up feature flags that depend on the resulting generation. Returns
// *this so it can be used inside the constructor's initializer list.
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  // Defaults come first so that anything in FS can override them.
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  // These depend on the parsed generation/ISA, so they cannot be plain
  // tablegen features.
  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}
65 
// Parse the subtarget feature string (with GCN defaults prepended) and then
// fix up interdependent feature flags. Returns *this so the constructor can
// initialize members (e.g. InstrInfo) from the fully-resolved subtarget.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // Default FP64/FP16 denormals on for SI and newer; otherwise explicitly
  // turn FP32 denormals off. User-specified features (FS, appended below)
  // can still override either default.
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Legacy fmin/fmax behavior only exists before VI.
  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}
126 
// Common-base constructor: stores the triple and feature bits and zeroes the
// generic feature flags. The derived GCN/R600 subtargets overwrite these via
// ParseSubtargetFeatures in their initializeSubtargetDependencies.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
                                             const FeatureBitset &FeatureBits) :
  TargetTriple(TT),
  SubtargetFeatureBits(FeatureBits),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }
145 
// GCN subtarget constructor. The long list of flag initializers provides
// defaults that ParseSubtargetFeatures (run from
// initializeSubtargetDependencies during InstrInfo's initialization below)
// then overwrites; declaration order is therefore significant: the flags
// must be initialized before InstrInfo, and InstrInfo before TLInfo.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT, getFeatureBits()),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    // Runs feature parsing; everything after this sees the final subtarget.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  AS = AMDGPU::getAMDGPUAS(TT);
  // Set up the GlobalISel pipeline objects for this subtarget.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
222 
223 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
224   const Function &F) const {
225   if (NWaves == 1)
226     return getLocalMemorySize();
227   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
228   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
229   unsigned MaxWaves = getMaxWavesPerEU();
230   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
231 }
232 
233 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
234   const Function &F) const {
235   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
236   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
237   unsigned MaxWaves = getMaxWavesPerEU();
238   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
239   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
240   NumWaves = std::min(NumWaves, MaxWaves);
241   NumWaves = std::max(NumWaves, 1u);
242   return NumWaves;
243 }
244 
245 unsigned
246 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
247   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
248   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
249 }
250 
251 std::pair<unsigned, unsigned>
252 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
253   switch (CC) {
254   case CallingConv::AMDGPU_CS:
255   case CallingConv::AMDGPU_KERNEL:
256   case CallingConv::SPIR_KERNEL:
257     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
258   case CallingConv::AMDGPU_VS:
259   case CallingConv::AMDGPU_LS:
260   case CallingConv::AMDGPU_HS:
261   case CallingConv::AMDGPU_ES:
262   case CallingConv::AMDGPU_GS:
263   case CallingConv::AMDGPU_PS:
264     return std::make_pair(1, getWavefrontSize());
265   default:
266     return std::make_pair(1, 16 * getWavefrontSize());
267   }
268 }
269 
// Return the {min, max} flat work group size for \p F, starting from the
// calling-convention default and honoring the "amdgpu-max-work-group-size"
// and "amdgpu-flat-work-group-size" attributes. Invalid requests fall back
// to the default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  // Keep the default range well-formed if the attribute lowered the max
  // below the default min.
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
299 
// Return the {min, max} waves per execution unit for \p F, combining the
// subtarget limits, the flat work group size attributes, and the
// "amdgpu-waves-per-eu" attribute. Invalid or inconsistent requests fall
// back to the default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // (A requested maximum of 0 means "unspecified" and is not checked.)
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
347 
// Attach !range metadata to \p I bounding the values of work-item id / local
// size queries, derived from the kernel's flat work group sizes (narrowed by
// reqd_work_group_size when present). Returns true if metadata was set.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      // Dim stays UINT_MAX for calls that are not one of the intrinsics
      // below; the workitem-id intrinsics additionally mark an ID query.
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // Dim is 0, 1, or 2 here when a dimension intrinsic matched.
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // No usable upper bound; don't emit metadata.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
409 
410 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
411                                                  unsigned &MaxAlign) const {
412   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
413          F.getCallingConv() == CallingConv::SPIR_KERNEL);
414 
415   const DataLayout &DL = F.getParent()->getDataLayout();
416   uint64_t ExplicitArgBytes = 0;
417   MaxAlign = 1;
418 
419   for (const Argument &Arg : F.args()) {
420     Type *ArgTy = Arg.getType();
421 
422     unsigned Align = DL.getABITypeAlignment(ArgTy);
423     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
424     ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
425     MaxAlign = std::max(MaxAlign, Align);
426   }
427 
428   return ExplicitArgBytes;
429 }
430 
// Compute the total kernarg segment size for \p F: the target's explicit
// kernarg offset plus the packed explicit arguments, plus any implicit
// arguments, rounded up to a multiple of 4 bytes. \p MaxAlign is set via
// getExplicitKernArgSize.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    // NOTE(review): this aligns ExplicitArgBytes rather than TotalSize, so
    // ExplicitOffset is dropped when implicit args are present — confirm
    // this is intentional.
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
447 
// R600 subtarget constructor. The flag initializers are defaults that
// ParseSubtargetFeatures (run by initializeSubtargetDependencies inside the
// TLInfo initializer below) overwrites, so the flags must precede TLInfo in
// declaration order.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT, getFeatureBits()),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)),
  AS (AMDGPU::getAMDGPUAS(TT)) { }
466 
467 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
468                                       unsigned NumRegionInstrs) const {
469   // Track register pressure so the scheduler can try to decrease
470   // pressure once register usage is above the threshold defined by
471   // SIRegisterInfo::getRegPressureSetLimit()
472   Policy.ShouldTrackPressure = true;
473 
474   // Enabling both top down and bottom up scheduling seems to give us less
475   // register spills than just using one of these approaches on its own.
476   Policy.OnlyTopDown = false;
477   Policy.OnlyBottomUp = false;
478 
479   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
480   if (!enableSIScheduler())
481     Policy.ShouldTrackLaneMasks = true;
482 }
483 
484 bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
485   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
486 }
487 
488 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
489   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
490     if (SGPRs <= 80)
491       return 10;
492     if (SGPRs <= 88)
493       return 9;
494     if (SGPRs <= 100)
495       return 8;
496     return 7;
497   }
498   if (SGPRs <= 48)
499     return 10;
500   if (SGPRs <= 56)
501     return 9;
502   if (SGPRs <= 64)
503     return 8;
504   if (SGPRs <= 72)
505     return 7;
506   if (SGPRs <= 80)
507     return 6;
508   return 5;
509 }
510 
511 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
512   if (VGPRs <= 24)
513     return 10;
514   if (VGPRs <= 28)
515     return 9;
516   if (VGPRs <= 32)
517     return 8;
518   if (VGPRs <= 36)
519     return 7;
520   if (VGPRs <= 40)
521     return 6;
522   if (VGPRs <= 48)
523     return 5;
524   if (VGPRs <= 64)
525     return 4;
526   if (VGPRs <= 84)
527     return 3;
528   if (VGPRs <= 128)
529     return 2;
530   return 1;
531 }
532 
533 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
534   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
535   if (MFI.hasFlatScratchInit()) {
536     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
537       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
538     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
539       return 4; // FLAT_SCRATCH, VCC (in that order).
540   }
541 
542   if (isXNACKEnabled())
543     return 4; // XNACK, VCC (in that order).
544   return 2; // VCC.
545 }
546 
// Maximum number of SGPRs \p MF may allocate, honoring the
// "amdgpu-num-sgpr" attribute, the waves-per-EU constraints, preloaded
// input SGPRs, and the SGPR-init hardware bug, minus the reserved SGPRs.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // Requested == 0 means "ignore the attribute" from here on.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware workaround: affected chips must reserve a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
596 
// Maximum number of VGPRs \p MF may allocate, honoring the
// "amdgpu-num-vgpr" attribute and the waves-per-EU constraints. Mirrors
// getMaxNumSGPRs above, without the reserved-register adjustments.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    // Requested == 0 means "ignore the attribute".
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
626 
namespace {
// DAG mutation that keeps pairs of adjacent same-class memory operations
// (VMEM/FLAT/SMRD/DS) together during scheduling by adding artificial
// dependencies between them.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    // SUa tracks the previous memory operation; null after any non-memory
    // instruction so only truly adjacent pairs get linked.
    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      // Only cluster operations of the same memory class.
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        // Copy SU's other predecessors onto SUa so nothing can be scheduled
        // between the pair from above.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        // Likewise, copy SUa's successors onto SU so nothing can slip in
        // from below.
        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace
677 
// Register the memory-op clustering mutation (above) with the post-RA
// scheduler for this subtarget.
void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}
682 
683 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
684   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
685     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
686   else
687     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
688 }
689 
690 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
691   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
692     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
693   else
694     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
695 }
696