//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but this should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
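  // For example, if the incoming FS explicitly requests "+wavefrontsize32",
  // the checks below append "-wavefrontsize16,-wavefrontsize64," so that only
  // the requested wavefront size remains enabled once FullFS is parsed.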
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly specified, turn
  // on FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

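  // Most GFX10 VALU instructions can use the constant bus twice; the 64-bit
  // shifts below appear to be the exception and still accept only a single
  // SGPR or literal operand, hence they keep a limit of 1.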
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
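  // Illustrative example: with 64 KiB of LDS, MaxWaves = 10 and 4 work groups
  // per CU, Limit = 65536 * 10 / 4 = 163840 bytes, so a kernel using 8 KiB of
  // LDS still gets min(163840 / 8192, 10) = 10 waves.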
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
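    // e.g., with a 64-wide wavefront this yields (128, 256).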
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;
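  // For example, a workitem id query with a flat work group size of 256 gets
  // the range [0, 256), while a local size query under a reqd_work_group_size
  // of 64 gets [64, 65).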

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }
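  // For example, a kernel taking (i32, double) ends up with
  // ExplicitArgBytes = alignTo(4, 8) + 8 = 16 and MaxAlign = 8.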

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
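  // These cutoffs appear to follow from a 256-entry VGPR file per SIMD with an
  // allocation granularity of 4 registers: e.g., at 10 waves each wave can use
  // at most 256 / 10 = 25 VGPRs, rounded down to a multiple of 4, i.e. 24.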
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // At this DAG pre-processing stage, SUnits are still in the original
    // order of the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}