1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUSubtarget.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUInstructionSelector.h"
20 #include "AMDGPULegalizerInfo.h"
21 #include "AMDGPURegisterBankInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 #include "llvm/ADT/SmallString.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/MC/MCSubtargetInfo.h"
27 #include "llvm/IR/MDBuilder.h"
28 #include "llvm/CodeGen/TargetFrameLowering.h"
29 #include <algorithm>
30 
31 using namespace llvm;
32 
33 #define DEBUG_TYPE "amdgpu-subtarget"
34 
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #define GET_SUBTARGETINFO_CTOR
37 #define AMDGPUSubtarget GCNSubtarget
38 #include "AMDGPUGenSubtargetInfo.inc"
39 #define GET_SUBTARGETINFO_TARGET_DESC
40 #define GET_SUBTARGETINFO_CTOR
41 #undef AMDGPUSubtarget
42 #include "R600GenSubtargetInfo.inc"
43 
// Out-of-line so that the destructor is emitted in this translation unit;
// the class holds unique_ptrs (CallLoweringInfo, Legalizer, RegBankInfo,
// InstSelector — see the constructor below) whose pointees are presumably
// forward-declared in the header, so defaulting there would not compile.
GCNSubtarget::~GCNSubtarget() = default;
45 
// Parse the subtarget feature string and fix up feature-dependent state for
// R600-family targets. Returns *this so it can be chained inside the
// constructor's member-initializer list.
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  // Defaults are prepended so that an explicit setting in FS (appended
  // below) takes precedence when the string is parsed.
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  // 24-bit unsigned multiply exists from Evergreen on; the signed variant is
  // tied to the Cayman ISA flag.
  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}
65 
// Parse the subtarget feature string and establish feature-dependent
// defaults for GCN targets. Returns *this so the constructor can chain it
// inside the member-initializer list (see the InstrInfo initializer below).
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                 StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  // Defaults are prepended; anything the user passes in FS (appended below)
  // wins when the combined string is parsed.
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // NOTE(review): the Evergreen FIXME that used to sit here looks
  // copy-pasted from the R600 variant above; Evergreen generations are not
  // reachable from GCNSubtarget. The code defaults FP64/FP16 denormals on
  // for SI+ and forces FP32 denormals off otherwise.
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // FMIN_LEGACY/FMAX_LEGACY were dropped starting with Volcanic Islands.
  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}
126 
// Base-class constructor shared by GCNSubtarget and R600Subtarget. All
// feature flags start at conservative defaults; the derived-class
// constructors overwrite them via ParseSubtargetFeatures() inside their
// initializeSubtargetDependencies() helpers.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
                                             const FeatureBitset &FeatureBits) :
  TargetTriple(TT),
  SubtargetFeatureBits(FeatureBits),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }
145 
// GCN subtarget constructor. The member-initializer list first sets every
// feature flag to its conservative default, then initializes InstrInfo with
// the result of initializeSubtargetDependencies(), which parses the feature
// string and overwrites those defaults. TLInfo and FrameLowering therefore
// see fully-initialized feature state. NOTE(review): this relies on the
// member declaration order in the header matching the order here — confirm
// when adding members.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                                 const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT, getFeatureBits()),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    // Feature parsing happens here, before TLInfo/FrameLowering are built.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  AS = AMDGPU::getAMDGPUAS(TT);
  // Set up the GlobalISel pipeline objects; InstSelector needs the concrete
  // register-bank info, hence the cast.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
223 
224 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
225   const Function &F) const {
226   if (NWaves == 1)
227     return getLocalMemorySize();
228   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
229   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
230   unsigned MaxWaves = getMaxWavesPerEU();
231   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
232 }
233 
234 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
235   const Function &F) const {
236   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
237   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
238   unsigned MaxWaves = getMaxWavesPerEU();
239   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
240   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
241   NumWaves = std::min(NumWaves, MaxWaves);
242   NumWaves = std::max(NumWaves, 1u);
243   return NumWaves;
244 }
245 
246 unsigned
247 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
248   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
249   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
250 }
251 
252 std::pair<unsigned, unsigned>
253 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
254   switch (CC) {
255   case CallingConv::AMDGPU_CS:
256   case CallingConv::AMDGPU_KERNEL:
257   case CallingConv::SPIR_KERNEL:
258     return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
259   case CallingConv::AMDGPU_VS:
260   case CallingConv::AMDGPU_LS:
261   case CallingConv::AMDGPU_HS:
262   case CallingConv::AMDGPU_ES:
263   case CallingConv::AMDGPU_GS:
264   case CallingConv::AMDGPU_PS:
265     return std::make_pair(1, getWavefrontSize());
266   default:
267     return std::make_pair(1, 16 * getWavefrontSize());
268   }
269 }
270 
271 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
272   const Function &F) const {
273   // FIXME: 1024 if function.
274   // Default minimum/maximum flat work group sizes.
275   std::pair<unsigned, unsigned> Default =
276     getDefaultFlatWorkGroupSize(F.getCallingConv());
277 
278   // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
279   // starts using "amdgpu-flat-work-group-size" attribute.
280   Default.second = AMDGPU::getIntegerAttribute(
281     F, "amdgpu-max-work-group-size", Default.second);
282   Default.first = std::min(Default.first, Default.second);
283 
284   // Requested minimum/maximum flat work group sizes.
285   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
286     F, "amdgpu-flat-work-group-size", Default);
287 
288   // Make sure requested minimum is less than requested maximum.
289   if (Requested.first > Requested.second)
290     return Default;
291 
292   // Make sure requested values do not violate subtarget's specifications.
293   if (Requested.first < getMinFlatWorkGroupSize())
294     return Default;
295   if (Requested.second > getMaxFlatWorkGroupSize())
296     return Default;
297 
298   return Requested;
299 }
300 
// Return the {min, max} number of waves per execution unit for \p F,
// honoring the "amdgpu-waves-per-eu" attribute when present and consistent
// with the (possibly attribute-specified) flat work group sizes.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  // A requested maximum of 0 is treated as "unspecified" here and below.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
348 
// Attach !range metadata to a local-id / local-size query instruction \p I,
// bounded by the kernel's maximum flat work group size (narrowed to an exact
// value when reqd_work_group_size metadata is present). Returns true if
// metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      // Map the intrinsic to its dimension; id queries (as opposed to size
      // queries) are flagged because their range is [0, size), not [1, size].
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      // NOTE(review): Dim is only ever 0, 1, 2, or UINT_MAX at this point,
      // so `Dim <= 3` behaves like `Dim < 3`; the latter would be clearer.
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  // A zero max size gives an empty/meaningless range; bail out.
  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
410 
411 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
412                                                  unsigned &MaxAlign) const {
413   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
414          F.getCallingConv() == CallingConv::SPIR_KERNEL);
415 
416   const DataLayout &DL = F.getParent()->getDataLayout();
417   uint64_t ExplicitArgBytes = 0;
418   MaxAlign = 1;
419 
420   for (const Argument &Arg : F.args()) {
421     Type *ArgTy = Arg.getType();
422 
423     unsigned Align = DL.getABITypeAlignment(ArgTy);
424     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
425     ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
426     MaxAlign = std::max(MaxAlign, Align);
427   }
428 
429   return ExplicitArgBytes;
430 }
431 
// Return the kernarg segment size in bytes for \p F: the target's explicit
// kernel-argument offset plus the explicit argument bytes, extended for any
// implicit (hidden) arguments. \p MaxAlign is set by getExplicitKernArgSize
// to the largest explicit-argument ABI alignment.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    // NOTE(review): this recomputes TotalSize from ExplicitArgBytes alone,
    // discarding ExplicitOffset — confirm that is intentional when implicit
    // arguments are present.
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
448 
// R600 subtarget constructor. Feature flags are first set to conservative
// defaults; TLInfo's initializer then runs initializeSubtargetDependencies,
// which parses the feature string and overwrites them. NOTE(review): as with
// GCNSubtarget, this relies on the header's member declaration order placing
// the flags before TLInfo — confirm when adding members.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT, getFeatureBits()),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  // Feature parsing happens inside this initializer.
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)),
  AS (AMDGPU::getAMDGPUAS(TT)) { }
467 
468 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
469                                       unsigned NumRegionInstrs) const {
470   // Track register pressure so the scheduler can try to decrease
471   // pressure once register usage is above the threshold defined by
472   // SIRegisterInfo::getRegPressureSetLimit()
473   Policy.ShouldTrackPressure = true;
474 
475   // Enabling both top down and bottom up scheduling seems to give us less
476   // register spills than just using one of these approaches on its own.
477   Policy.OnlyTopDown = false;
478   Policy.OnlyBottomUp = false;
479 
480   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
481   if (!enableSIScheduler())
482     Policy.ShouldTrackLaneMasks = true;
483 }
484 
485 bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
486   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
487 }
488 
489 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
490   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
491     if (SGPRs <= 80)
492       return 10;
493     if (SGPRs <= 88)
494       return 9;
495     if (SGPRs <= 100)
496       return 8;
497     return 7;
498   }
499   if (SGPRs <= 48)
500     return 10;
501   if (SGPRs <= 56)
502     return 9;
503   if (SGPRs <= 64)
504     return 8;
505   if (SGPRs <= 72)
506     return 7;
507   if (SGPRs <= 80)
508     return 6;
509   return 5;
510 }
511 
512 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
513   if (VGPRs <= 24)
514     return 10;
515   if (VGPRs <= 28)
516     return 9;
517   if (VGPRs <= 32)
518     return 8;
519   if (VGPRs <= 36)
520     return 7;
521   if (VGPRs <= 40)
522     return 6;
523   if (VGPRs <= 48)
524     return 5;
525   if (VGPRs <= 64)
526     return 4;
527   if (VGPRs <= 84)
528     return 3;
529   if (VGPRs <= 128)
530     return 2;
531   return 1;
532 }
533 
534 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
535   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
536   if (MFI.hasFlatScratchInit()) {
537     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
538       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
539     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
540       return 4; // FLAT_SCRATCH, VCC (in that order).
541   }
542 
543   if (isXNACKEnabled())
544     return 4; // XNACK, VCC (in that order).
545   return 2; // VCC.
546 }
547 
// Return the maximum number of SGPRs the function may allocate, starting
// from the wave-occupancy-implied limit, optionally overridden by the
// "amdgpu-num-sgpr" attribute, adjusted for the SGPR-init hardware bug, and
// reduced by the reserved special registers.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // Requested == 0 means "ignore the request" throughout this block.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware with the SGPR-init bug must use a fixed SGPR count regardless.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
597 
// Return the maximum number of VGPRs the function may allocate, starting
// from the wave-occupancy-implied limit and optionally overridden by the
// "amdgpu-num-vgpr" attribute (mirrors getMaxNumSGPRs above, without the
// reserved-register and init-bug adjustments).
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    // Requested == 0 means "ignore the request".
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
627 
namespace {
// DAG mutation that links consecutive memory operations of the same kind
// (VMEM/FLAT/SMRD/DS) with artificial edges so the post-RA scheduler cannot
// move them apart.
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      // A non-memory instruction breaks the current run of memory ops.
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      // Only cluster memory ops of the same class.
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        // Pull SU's other predecessors in front of SUa so nothing can be
        // scheduled between the pair.
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        // Likewise push SUa's other successors behind SU.
        // NOTE(review): the ExitSU guard here checks SU, not the successor
        // being linked — confirm that is the intended condition.
        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      // The current op becomes the head of the next potential pair.
      SUa = &SU;
    }
  }
};
} // namespace
678 
679 void GCNSubtarget::getPostRAMutations(
680     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
681   Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
682 }
683 
684 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
685   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
686     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
687   else
688     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
689 }
690 
691 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
692   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
693     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
694   else
695     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
696 }
697