//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

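// The TableGen-generated AMDGPUGenSubtargetInfo.inc names the subtarget class
// after the target definition, AMDGPUSubtarget. Temporarily #define that name
// to GCNSubtarget so the generated constructor and ParseSubtargetFeatures
// bodies are emitted as members of the GCN subtarget; the R600 generated file
// below uses its own R600Subtarget name and needs no remapping.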
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default, and
  // disabling it should not unset everything else.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly given, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

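// Estimates the largest LDS allocation, in bytes, that still allows NWaves
// waves of this function to be resident: the per-workgroup LDS budget (the
// CU's LDS size divided by the number of workgroups that fit) is scaled by
// the ratio of the EU wave limit to the requested wave count.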
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

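// Inverse of the above: given an LDS usage in bytes, estimate how many waves
// per EU remain achievable, clamped to [1, getMaxWavesPerEU()].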
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

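// Default flat workgroup size range keyed on calling convention: kernels and
// compute shaders default to two to four wavefronts' worth of work items per
// workgroup, the other graphics shader stages to at most one wavefront, and
// everything else to a maximum of 16 wavefronts (e.g. 1024 items at wave64).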
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

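// Attaches !range metadata to a workitem ID or local size query so later
// optimizations know the result is bounded by the flat workgroup size. For
// example, with reqd_work_group_size = {256, 1, 1}, a call to
// llvm.amdgcn.workitem.id.x would get !range !{i32 0, i32 256} (IDs span
// [0, 256)), while a local size query would get !range !{i32 256, i32 257}.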
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi; for a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

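// Sums the in-memory sizes of the kernel's explicit arguments, placing each
// argument at its ABI type alignment, and reports the largest such alignment
// through MaxAlign.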
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

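// Computes the overall kernarg segment size, accounting for any implicit
// (hidden) arguments appended after the explicit ones; the result is rounded
// up to a multiple of 4 for the benefit of scalar loads.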
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

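// Occupancy (waves per EU) as a function of per-wave SGPR usage. The
// breakpoints appear to reflect the per-SIMD SGPR file size and allocation
// granularity of each hardware generation; the fewer registers each wave
// allocates, the more waves can be resident.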
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

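// Same idea for VGPRs: with a 256-register VGPR file per SIMD, a wave using
// at most 256/N registers (rounded down to the allocation granularity)
// leaves room for N waves, e.g. 24 VGPRs -> 10 waves, 128 VGPRs -> 2 waves.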
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with values implied by the
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

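// Post-RA scheduler DAG mutation that chains runs of adjacent memory
// operations of the same kind (VMEM, FLAT, SMRD, or DS) with artificial
// edges so the scheduler does not pull them apart and break clustering.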
namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

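// Helpers to recover the common AMDGPUSubtarget base from either concrete
// subtarget, dispatching on whether the triple is amdgcn (GCN) or r600.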
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}