//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

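// AMDGPUGenSubtargetInfo.inc emits its target description and constructor for
// a class literally named "AMDGPUSubtarget", so that name is temporarily
// #define'd to GCNSubtarget to attach the generated code to GCNSubtarget; the
// R600 definitions below are then included under their own name.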
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assume ECC is enabled; that is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

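  // The caller-provided feature string is appended last so that anything
  // explicitly requested there (e.g. "-xnack" or "-flat-for-global") overrides
  // the defaults accumulated above. For an HSA target the accumulated prefix
  // at this point is roughly:
  //   +promote-alloca,+load-store-opt,+sram-ecc,+xnack,+flat-for-global,
  //   +unaligned-buffer-access,+trap-handler,+fp64-fp16-denormals,
  //   +enable-prt-strict-null,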
  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly given, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
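    // The generation chosen here is only a pre-parse floor (HSA requires at
    // least SEA_ISLANDS); ParseSubtargetFeatures(), called from
    // initializeSubtargetDependencies() below, overwrites Gen from the
    // generation feature implied by the selected CPU.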
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
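    // initializeSubtargetDependencies() is invoked from the InstrInfo
    // initializer so that the subtarget feature bits are final before
    // SIInstrInfo, the TargetLowering and the GlobalISel objects below are
    // constructed against them.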
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

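  // GFX10 generally allows two scalar (constant bus) operands per VALU
  // instruction, but the 64-bit shift instructions below still accept only a
  // single SGPR or literal input.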
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
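  // Scale the per-CU local memory by the ratio of maximum waves per EU to
  // resident work groups, then divide by the requested wave count.
  // e.g. 64 KiB of LDS, 10 waves per EU, 2 work groups per CU and NWaves == 5
  // gives 64 KiB * 10 / 2 / 5 = 64 KiB.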
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
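  // e.g. with 64 KiB of LDS, 10 waves per EU and one work group per CU, a
  // kernel using 16 KiB of LDS gets Limit = 640 KiB and NumWaves = 40, which
  // is then clamped to the 10-wave maximum below.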
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
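  // Compute-like conventions may use multi-wave work groups; the graphics
  // shader stages below default to a single wave per work group.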
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: This should probably default to 1024 for non-kernel functions.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

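  // e.g. with reqd_work_group_size = {64, 1, 1}, a workitem.id.x call gets
  // !range !{i32 0, i32 64}, while a local-size query for the same dimension
  // gets !range !{i32 64, i32 65}.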
  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

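  // Round each argument's offset up to its ABI alignment while accumulating.
  // e.g. kernel(i32, double) occupies bytes [0, 4) and [8, 16) for a total of
  // 16 bytes with MaxAlign == 8, assuming the usual alignments for those types.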
  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

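  // The cutoffs below reflect how the per-SIMD SGPR file is divided among the
  // resident waves; VI and newer have a larger file, and on GFX10 SGPRs are no
  // longer a shared per-SIMD resource, so they never limit occupancy there.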
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
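  // Each SIMD's 256 VGPRs are shared by its resident waves and allocated in
  // groups of four, so e.g. at 10 waves a wave may use at most 24 VGPRs.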
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing the SUnits are still in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

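        // Also order SU's other predecessors before SUa and SU before SUa's
        // other successors, so no unrelated instruction can be scheduled
        // between the two memory operations.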
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}