//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is explicitly enabled or disabled in the feature
  // string, turn on FlatForGlobal for all OSes on VI and newer hardware to
  // avoid assertion failures due to missing ADDR64 variants of MUBUF
  // instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

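  // XNACK was optimistically enabled in the default feature string above; turn
  // it back off on subtargets that cannot support it so the feature bits stay
  // consistent.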
  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets with d16 loads, which don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

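  // GFX10 raises the constant bus limit to two scalar operands for most VALU
  // instructions; the 64-bit shift instructions below are still limited to a
  // single scalar input.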
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
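  // This is the inverse of getOccupancyWithLocalMemSize() below: given a
  // target wave count, estimate the largest per-work-group LDS allocation that
  // still allows that many waves to be resident.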
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
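  // Derive an upper bound on the wave count from how much of the per-CU LDS
  // budget the requested allocation consumes; the result is clamped to
  // [1, MaxWaves] below.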
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
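  // Kernel-like calling conventions default to at least two waves and a
  // maximum of at least 256 work items; graphics shader stages default to a
  // single wave; everything else is capped at 16 waves per work group.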
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
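  // Attach !range metadata to a workitem id or local size query, bounding it
  // by the kernel's flat work group size (narrowed further if
  // reqd_work_group_size is present).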
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

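  // Walk the explicit arguments, aligning each to its ABI type alignment and
  // accumulating the total size and the maximum alignment seen.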
  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
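    // Hidden (implicit) kernel arguments are appended after the explicit ones,
    // aligned to the implicit argument pointer alignment.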
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
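  // On GFX10 each wave gets a fixed SGPR allocation, so SGPR usage never
  // limits occupancy there.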
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
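  // These cut-offs follow the VGPR allocation granularity: the 256-register
  // VGPR file is divided among the resident waves in multiples of four
  // registers.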
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
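// DAG mutation that serializes runs of memory operations of the same kind
// (VMEM, FLAT, SMRD or DS) with artificial edges so the post-RA scheduler
// keeps them clustered.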
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

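        // Mirror the neighbouring dependencies with artificial edges so that
        // no other node can be scheduled between the two memory operations.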
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}