//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly specified, turn
  // on FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

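// Number of operands of a VALU instruction that may be read from the scalar
// constant bus. Before GFX10 the limit is one; GFX10 raises it to two, except
// for the 64-bit shift opcodes listed below, which keep the limit of one.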
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

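// Amount of LDS available to a single workgroup of F while still allowing
// NWaves waves to be resident, based on the maximum flat workgroup size and
// the maximum number of workgroups per CU. Returns 0 if the workgroups-per-CU
// limit cannot be determined.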
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

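// Occupancy (waves per EU) achievable when each workgroup of F uses Bytes
// bytes of LDS, clamped to the range [1, getMaxWavesPerEU()].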
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

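// Default flat workgroup size bounds per calling convention: compute-like
// entry points (CS, AMDGPU/SPIR kernels) get [2 * wavefront size,
// max(4 * wavefront size, 256)], graphics shader stages get
// [1, wavefront size], and other functions get [1, 16 * wavefront size].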
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

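// Minimum/maximum flat workgroup size for F. A valid
// "amdgpu-flat-work-group-size" attribute overrides the calling-convention
// default; requests outside the subtarget's supported range fall back to the
// default.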
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

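// Attaches !range metadata to a workitem-id or local-size intrinsic call,
// derived from the kernel's flat workgroup size bounds and narrowed by
// reqd_work_group_size metadata when present. Returns true if metadata was
// added.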
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). An ID query produces values in [0, MaxSize),
  // so MaxSize can be used as Hi directly. A size query can produce MaxSize
  // itself, so Hi must be MaxSize + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

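// Total size in bytes of the explicit kernel arguments of F, with each
// argument placed at its ABI type alignment. The largest argument alignment
// is returned through MaxAlign.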
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

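// Size of the kernarg segment: the explicit arguments at the target's
// explicit kernarg offset or, when implicit arguments are present, the
// explicit argument bytes aligned for the implicit argument pointer plus the
// implicit bytes. The result is rounded up to a multiple of 4.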
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

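// V_MAD_F16 is usable only if the pseudo instruction maps to a real MC opcode
// on this subtarget.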
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

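// Maximum occupancy (waves per EU) achievable with the given number of SGPRs.
// On GFX10 the SGPR count does not limit occupancy, so the maximum of 10 is
// returned; earlier generations step down through the brackets below.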
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

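// Maximum occupancy (waves per EU) achievable with the given number of VGPRs,
// stepping down from 10 waves at 24 or fewer VGPRs to a single wave above 128.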
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

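// Number of SGPRs reserved for special purposes: VCC is always reserved, and
// FLAT_SCRATCH and XNACK each take an additional SGPR pair when flat scratch
// init or XNACK is in use. On GFX10 only VCC is counted, since FLAT_SCRATCH
// and XNACK no longer live in SGPRs.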
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
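// Post-RA DAG mutation that keeps consecutive memory operations of the same
// kind (VMEM, FLAT, SMRD or DS) together by adding artificial edges between
// their SUnits, preventing the scheduler from pulling them apart.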
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them to prevent
    // the scheduler from moving them apart. During DAG pre-processing the
    // SUnits are still in the original order of the instructions, before any
    // scheduling has happened.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

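// Returns the common AMDGPUSubtarget view of the active subtarget: a
// GCNSubtarget for amdgcn triples and an R600Subtarget otherwise.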
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}