1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
28 #include <algorithm>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "amdgpu-subtarget"
33 
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
42 
// Out-of-line defaulted destructor.
// NOTE(review): presumably kept in this file so that the members reset in the
// constructor below are destroyed where their types are complete -- confirm
// against the class declaration.
GCNSubtarget::~GCNSubtarget() = default;
44 
45 R600Subtarget &
46 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
47                                                StringRef GPU, StringRef FS) {
48   SmallString<256> FullFS("+promote-alloca,");
49   FullFS += FS;
50   ParseSubtargetFeatures(GPU, FullFS);
51 
52   // FIXME: I don't think think Evergreen has any useful support for
53   // denormals, but should be checked. Should we issue a warning somewhere
54   // if someone tries to enable these?
55   if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
56     FP32Denormals = false;
57   }
58 
59   HasMulU24 = getGeneration() >= EVERGREEN;
60   HasMulI24 = hasCaymanISA();
61 
62   return *this;
63 }
64 
65 GCNSubtarget &
66 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
67                                               StringRef GPU, StringRef FS) {
68   // Determine default and user-specified characteristics
69   // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
70   // enabled, but some instructions do not respect them and they run at the
71   // double precision rate, so don't enable by default.
72   //
73   // We want to be able to turn these off, but making this a subtarget feature
74   // for SI has the unhelpful behavior that it unsets everything else if you
75   // disable it.
76   //
77   // Similarly we want enable-prt-strict-null to be on by default and not to
78   // unset everything else if it is disabled
79 
80   // Assuming ECC is enabled is the conservative default.
81   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
82 
83   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
84     FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
85 
86   // FIXME: I don't think think Evergreen has any useful support for
87   // denormals, but should be checked. Should we issue a warning somewhere
88   // if someone tries to enable these?
89   if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
90     FullFS += "+fp64-fp16-denormals,";
91   } else {
92     FullFS += "-fp32-denormals,";
93   }
94 
95   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
96 
97   // Disable mutually exclusive bits.
98   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
99     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
100       FullFS += "-wavefrontsize16,";
101     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
102       FullFS += "-wavefrontsize32,";
103     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
104       FullFS += "-wavefrontsize64,";
105   }
106 
107   FullFS += FS;
108 
109   ParseSubtargetFeatures(GPU, FullFS);
110 
111   // We don't support FP64 for EG/NI atm.
112   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
113 
114   // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
115   // on VI and newer hardware to avoid assertion failures due to missing ADDR64
116   // variants of MUBUF instructions.
117   if (!hasAddr64() && !FS.contains("flat-for-global")) {
118     FlatForGlobal = true;
119   }
120 
121   // Set defaults if needed.
122   if (MaxPrivateElementSize == 0)
123     MaxPrivateElementSize = 4;
124 
125   if (LDSBankCount == 0)
126     LDSBankCount = 32;
127 
128   if (TT.getArch() == Triple::amdgcn) {
129     if (LocalMemorySize == 0)
130       LocalMemorySize = 32768;
131 
132     // Do something sensible for unspecified target.
133     if (!HasMovrel && !HasVGPRIndexMode)
134       HasMovrel = true;
135   }
136 
137   // Don't crash on invalid devices.
138   if (WavefrontSize == 0)
139     WavefrontSize = 64;
140 
141   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
142 
143   if (DoesNotSupportXNACK && EnableXNACK) {
144     ToggleFeature(AMDGPU::FeatureXNACK);
145     EnableXNACK = false;
146   }
147 
148   // ECC is on by default, but turn it off if the hardware doesn't support it
149   // anyway. This matters for the gfx9 targets with d16 loads, but don't support
150   // ECC.
151   if (DoesNotSupportSRAMECC && EnableSRAMECC) {
152     ToggleFeature(AMDGPU::FeatureSRAMECC);
153     EnableSRAMECC = false;
154   }
155 
156   return *this;
157 }
158 
/// Constructs the common AMDGPU subtarget state for triple \p TT.
///
/// Only the triple is recorded from the argument; every other field gets the
/// fixed starting value listed below. The derived subtargets adjust some of
/// these later in their initializeSubtargetDependencies() implementations.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  // Unlike most flags, the 24-bit multiplies and legacy fmin/fmax start out
  // enabled.
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  // Zero means "not set yet" for the two sizes below; the GCN
  // initializeSubtargetDependencies() replaces zero values with defaults.
  LocalMemorySize(0),
  WavefrontSize(0)
  { }
176 
/// Constructs a GCN subtarget for triple \p TT, processor \p GPU and feature
/// string \p FS, owned by target machine \p TM.
///
/// Every feature flag is first given a false/zero starting value. The feature
/// string is parsed by initializeSubtargetDependencies(), which is invoked in
/// the InstrInfo initializer below. The constructor body then creates the
/// call lowering, legalizer, register bank info and instruction selector
/// objects.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    // Starting generation before the feature string has been parsed.
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    // Zero means "unset"; initializeSubtargetDependencies() substitutes
    // defaults for these two if parsing leaves them at zero.
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    // Feature parsing happens here: initializeSubtargetDependencies() returns
    // *this, which is then handed to the InstrInfo constructor.
    // NOTE(review): this relies on the members above being declared before
    // InstrInfo in the class so they are not re-initialized after parsing --
    // confirm against the header.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // Create the remaining per-subtarget helper objects now that the members
  // above have been constructed.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  // The selector takes the register bank info created on the previous line.
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
282 
283 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
284   if (getGeneration() < GFX10)
285     return 1;
286 
287   switch (Opcode) {
288   case AMDGPU::V_LSHLREV_B64:
289   case AMDGPU::V_LSHLREV_B64_gfx10:
290   case AMDGPU::V_LSHL_B64:
291   case AMDGPU::V_LSHRREV_B64:
292   case AMDGPU::V_LSHRREV_B64_gfx10:
293   case AMDGPU::V_LSHR_B64:
294   case AMDGPU::V_ASHRREV_I64:
295   case AMDGPU::V_ASHRREV_I64_gfx10:
296   case AMDGPU::V_ASHR_I64:
297     return 1;
298   }
299 
300   return 2;
301 }
302 
303 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
304   const Function &F) const {
305   if (NWaves == 1)
306     return getLocalMemorySize();
307   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
308   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
309   if (!WorkGroupsPerCu)
310     return 0;
311   unsigned MaxWaves = getMaxWavesPerEU();
312   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
313 }
314 
315 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
316   const Function &F) const {
317   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
318   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
319   if (!WorkGroupsPerCu)
320     return 0;
321   unsigned MaxWaves = getMaxWavesPerEU();
322   unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
323   unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
324   NumWaves = std::min(NumWaves, MaxWaves);
325   NumWaves = std::max(NumWaves, 1u);
326   return NumWaves;
327 }
328 
329 unsigned
330 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
331   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
332   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
333 }
334 
335 std::pair<unsigned, unsigned>
336 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
337   switch (CC) {
338   case CallingConv::AMDGPU_CS:
339   case CallingConv::AMDGPU_KERNEL:
340   case CallingConv::SPIR_KERNEL:
341     return std::make_pair(getWavefrontSize() * 2,
342                           std::max(getWavefrontSize() * 4, 256u));
343   case CallingConv::AMDGPU_VS:
344   case CallingConv::AMDGPU_LS:
345   case CallingConv::AMDGPU_HS:
346   case CallingConv::AMDGPU_ES:
347   case CallingConv::AMDGPU_GS:
348   case CallingConv::AMDGPU_PS:
349     return std::make_pair(1, getWavefrontSize());
350   default:
351     return std::make_pair(1, 16 * getWavefrontSize());
352   }
353 }
354 
355 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
356   const Function &F) const {
357   // FIXME: 1024 if function.
358   // Default minimum/maximum flat work group sizes.
359   std::pair<unsigned, unsigned> Default =
360     getDefaultFlatWorkGroupSize(F.getCallingConv());
361 
362   // Requested minimum/maximum flat work group sizes.
363   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
364     F, "amdgpu-flat-work-group-size", Default);
365 
366   // Make sure requested minimum is less than requested maximum.
367   if (Requested.first > Requested.second)
368     return Default;
369 
370   // Make sure requested values do not violate subtarget's specifications.
371   if (Requested.first < getMinFlatWorkGroupSize())
372     return Default;
373   if (Requested.second > getMaxFlatWorkGroupSize())
374     return Default;
375 
376   return Requested;
377 }
378 
379 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
380   const Function &F) const {
381   // Default minimum/maximum number of waves per execution unit.
382   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
383 
384   // Default/requested minimum/maximum flat work group sizes.
385   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
386 
387   // If minimum/maximum flat work group sizes were explicitly requested using
388   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
389   // number of waves per execution unit to values implied by requested
390   // minimum/maximum flat work group sizes.
391   unsigned MinImpliedByFlatWorkGroupSize =
392     getMaxWavesPerEU(FlatWorkGroupSizes.second);
393   bool RequestedFlatWorkGroupSize = false;
394 
395   if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
396     Default.first = MinImpliedByFlatWorkGroupSize;
397     RequestedFlatWorkGroupSize = true;
398   }
399 
400   // Requested minimum/maximum number of waves per execution unit.
401   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
402     F, "amdgpu-waves-per-eu", Default, true);
403 
404   // Make sure requested minimum is less than requested maximum.
405   if (Requested.second && Requested.first > Requested.second)
406     return Default;
407 
408   // Make sure requested values do not violate subtarget's specifications.
409   if (Requested.first < getMinWavesPerEU() ||
410       Requested.first > getMaxWavesPerEU())
411     return Default;
412   if (Requested.second > getMaxWavesPerEU())
413     return Default;
414 
415   // Make sure requested values are compatible with values implied by requested
416   // minimum/maximum flat work group sizes.
417   if (RequestedFlatWorkGroupSize &&
418       Requested.first < MinImpliedByFlatWorkGroupSize)
419     return Default;
420 
421   return Requested;
422 }
423 
424 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
425   Function *Kernel = I->getParent()->getParent();
426   unsigned MinSize = 0;
427   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
428   bool IdQuery = false;
429 
430   // If reqd_work_group_size is present it narrows value down.
431   if (auto *CI = dyn_cast<CallInst>(I)) {
432     const Function *F = CI->getCalledFunction();
433     if (F) {
434       unsigned Dim = UINT_MAX;
435       switch (F->getIntrinsicID()) {
436       case Intrinsic::amdgcn_workitem_id_x:
437       case Intrinsic::r600_read_tidig_x:
438         IdQuery = true;
439         LLVM_FALLTHROUGH;
440       case Intrinsic::r600_read_local_size_x:
441         Dim = 0;
442         break;
443       case Intrinsic::amdgcn_workitem_id_y:
444       case Intrinsic::r600_read_tidig_y:
445         IdQuery = true;
446         LLVM_FALLTHROUGH;
447       case Intrinsic::r600_read_local_size_y:
448         Dim = 1;
449         break;
450       case Intrinsic::amdgcn_workitem_id_z:
451       case Intrinsic::r600_read_tidig_z:
452         IdQuery = true;
453         LLVM_FALLTHROUGH;
454       case Intrinsic::r600_read_local_size_z:
455         Dim = 2;
456         break;
457       default:
458         break;
459       }
460       if (Dim <= 3) {
461         if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
462           if (Node->getNumOperands() == 3)
463             MinSize = MaxSize = mdconst::extract<ConstantInt>(
464                                   Node->getOperand(Dim))->getZExtValue();
465       }
466     }
467   }
468 
469   if (!MaxSize)
470     return false;
471 
472   // Range metadata is [Lo, Hi). For ID query we need to pass max size
473   // as Hi. For size query we need to pass Hi + 1.
474   if (IdQuery)
475     MinSize = 0;
476   else
477     ++MaxSize;
478 
479   MDBuilder MDB(I->getContext());
480   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
481                                                   APInt(32, MaxSize));
482   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
483   return true;
484 }
485 
486 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
487                                                  unsigned &MaxAlign) const {
488   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
489          F.getCallingConv() == CallingConv::SPIR_KERNEL);
490 
491   const DataLayout &DL = F.getParent()->getDataLayout();
492   uint64_t ExplicitArgBytes = 0;
493   MaxAlign = 1;
494 
495   for (const Argument &Arg : F.args()) {
496     Type *ArgTy = Arg.getType();
497 
498     unsigned Align = DL.getABITypeAlignment(ArgTy);
499     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
500     ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
501     MaxAlign = std::max(MaxAlign, Align);
502   }
503 
504   return ExplicitArgBytes;
505 }
506 
507 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
508                                                 unsigned &MaxAlign) const {
509   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
510 
511   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
512 
513   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
514   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
515   if (ImplicitBytes != 0) {
516     unsigned Alignment = getAlignmentForImplicitArgPtr();
517     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
518   }
519 
520   // Being able to dereference past the end is useful for emitting scalar loads.
521   return alignTo(TotalSize, 4);
522 }
523 
/// Constructs an R600 subtarget for triple \p TT, processor \p GPU and
/// feature string \p FS, owned by target machine \p TM.
///
/// All feature flags start out false/zero and the generation starts as R600.
/// The feature string is parsed by initializeSubtargetDependencies(), which
/// is invoked in the TLInfo initializer below.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  // Feature parsing happens here; the call returns *this, which is passed on
  // to the TLInfo constructor.
  // NOTE(review): relies on the flags above being declared before TLInfo in
  // the class -- confirm against the header.
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }
540 
541 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
542                                       unsigned NumRegionInstrs) const {
543   // Track register pressure so the scheduler can try to decrease
544   // pressure once register usage is above the threshold defined by
545   // SIRegisterInfo::getRegPressureSetLimit()
546   Policy.ShouldTrackPressure = true;
547 
548   // Enabling both top down and bottom up scheduling seems to give us less
549   // register spills than just using one of these approaches on its own.
550   Policy.OnlyTopDown = false;
551   Policy.OnlyBottomUp = false;
552 
553   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
554   if (!enableSIScheduler())
555     Policy.ShouldTrackLaneMasks = true;
556 }
557 
558 bool GCNSubtarget::hasMadF16() const {
559   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
560 }
561 
562 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
563   if (getGeneration() >= AMDGPUSubtarget::GFX10)
564     return 10;
565 
566   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
567     if (SGPRs <= 80)
568       return 10;
569     if (SGPRs <= 88)
570       return 9;
571     if (SGPRs <= 100)
572       return 8;
573     return 7;
574   }
575   if (SGPRs <= 48)
576     return 10;
577   if (SGPRs <= 56)
578     return 9;
579   if (SGPRs <= 64)
580     return 8;
581   if (SGPRs <= 72)
582     return 7;
583   if (SGPRs <= 80)
584     return 6;
585   return 5;
586 }
587 
588 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
589   if (VGPRs <= 24)
590     return 10;
591   if (VGPRs <= 28)
592     return 9;
593   if (VGPRs <= 32)
594     return 8;
595   if (VGPRs <= 36)
596     return 7;
597   if (VGPRs <= 40)
598     return 6;
599   if (VGPRs <= 48)
600     return 5;
601   if (VGPRs <= 64)
602     return 4;
603   if (VGPRs <= 84)
604     return 3;
605   if (VGPRs <= 128)
606     return 2;
607   return 1;
608 }
609 
610 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
611   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
612   if (getGeneration() >= AMDGPUSubtarget::GFX10)
613     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
614 
615   if (MFI.hasFlatScratchInit()) {
616     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
617       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
618     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
619       return 4; // FLAT_SCRATCH, VCC (in that order).
620   }
621 
622   if (isXNACKEnabled())
623     return 4; // XNACK, VCC (in that order).
624   return 2; // VCC.
625 }
626 
627 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
628   const Function &F = MF.getFunction();
629   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
630 
631   // Compute maximum number of SGPRs function can use using default/requested
632   // minimum number of waves per execution unit.
633   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
634   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
635   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
636 
637   // Check if maximum number of SGPRs was explicitly requested using
638   // "amdgpu-num-sgpr" attribute.
639   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
640     unsigned Requested = AMDGPU::getIntegerAttribute(
641       F, "amdgpu-num-sgpr", MaxNumSGPRs);
642 
643     // Make sure requested value does not violate subtarget's specifications.
644     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
645       Requested = 0;
646 
647     // If more SGPRs are required to support the input user/system SGPRs,
648     // increase to accommodate them.
649     //
650     // FIXME: This really ends up using the requested number of SGPRs + number
651     // of reserved special registers in total. Theoretically you could re-use
652     // the last input registers for these special registers, but this would
653     // require a lot of complexity to deal with the weird aliasing.
654     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
655     if (Requested && Requested < InputNumSGPRs)
656       Requested = InputNumSGPRs;
657 
658     // Make sure requested value is compatible with values implied by
659     // default/requested minimum/maximum number of waves per execution unit.
660     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
661       Requested = 0;
662     if (WavesPerEU.second &&
663         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
664       Requested = 0;
665 
666     if (Requested)
667       MaxNumSGPRs = Requested;
668   }
669 
670   if (hasSGPRInitBug())
671     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
672 
673   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
674                   MaxAddressableNumSGPRs);
675 }
676 
677 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
678   const Function &F = MF.getFunction();
679   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
680 
681   // Compute maximum number of VGPRs function can use using default/requested
682   // minimum number of waves per execution unit.
683   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
684   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
685 
686   // Check if maximum number of VGPRs was explicitly requested using
687   // "amdgpu-num-vgpr" attribute.
688   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
689     unsigned Requested = AMDGPU::getIntegerAttribute(
690       F, "amdgpu-num-vgpr", MaxNumVGPRs);
691 
692     // Make sure requested value is compatible with values implied by
693     // default/requested minimum/maximum number of waves per execution unit.
694     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
695       Requested = 0;
696     if (WavesPerEU.second &&
697         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
698       Requested = 0;
699 
700     if (Requested)
701       MaxNumVGPRs = Requested;
702   }
703 
704   return MaxNumVGPRs;
705 }
706 
707 namespace {
708 struct MemOpClusterMutation : ScheduleDAGMutation {
709   const SIInstrInfo *TII;
710 
711   MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
712 
713   void apply(ScheduleDAGInstrs *DAG) override {
714     SUnit *SUa = nullptr;
715     // Search for two consequent memory operations and link them
716     // to prevent scheduler from moving them apart.
717     // In DAG pre-process SUnits are in the original order of
718     // the instructions before scheduling.
719     for (SUnit &SU : DAG->SUnits) {
720       MachineInstr &MI2 = *SU.getInstr();
721       if (!MI2.mayLoad() && !MI2.mayStore()) {
722         SUa = nullptr;
723         continue;
724       }
725       if (!SUa) {
726         SUa = &SU;
727         continue;
728       }
729 
730       MachineInstr &MI1 = *SUa->getInstr();
731       if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
732           (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
733           (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
734           (TII->isDS(MI1)   && TII->isDS(MI2))) {
735         SU.addPredBarrier(SUa);
736 
737         for (const SDep &SI : SU.Preds) {
738           if (SI.getSUnit() != SUa)
739             SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
740         }
741 
742         if (&SU != &DAG->ExitSU) {
743           for (const SDep &SI : SUa->Succs) {
744             if (SI.getSUnit() != &SU)
745               SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
746           }
747         }
748       }
749 
750       SUa = &SU;
751     }
752   }
753 };
754 } // namespace
755 
756 void GCNSubtarget::getPostRAMutations(
757     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
758   Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
759 }
760 
761 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
762   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
763     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
764   else
765     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
766 }
767 
768 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
769   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
770     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
771   else
772     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
773 }
774