1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUCallLowering.h"
17 #include "AMDGPUInstructionSelector.h"
18 #include "AMDGPULegalizerInfo.h"
19 #include "AMDGPURegisterBankInfo.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #define GET_SUBTARGETINFO_TARGET_DESC
42 #define GET_SUBTARGETINFO_CTOR
43 #undef AMDGPUSubtarget
44 #include "R600GenSubtargetInfo.inc"
45 
46 static cl::opt<bool> DisablePowerSched(
47   "amdgpu-disable-power-sched",
48   cl::desc("Disable scheduling to minimize mAI power bursts"),
49   cl::init(false));
50 
51 static cl::opt<bool> EnableVGPRIndexMode(
52   "amdgpu-vgpr-index-mode",
53   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
54   cl::init(false));
55 
56 static cl::opt<bool> EnableFlatScratch(
57   "amdgpu-enable-flat-scratch",
58   cl::desc("Use flat scratch instructions"),
59   cl::init(false));
60 
61 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
62                            cl::desc("Enable the use of AA during codegen."),
63                            cl::init(true));
64 
65 GCNSubtarget::~GCNSubtarget() = default;
66 
67 R600Subtarget &
68 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
69                                                StringRef GPU, StringRef FS) {
70   SmallString<256> FullFS("+promote-alloca,");
71   FullFS += FS;
72   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
73 
74   HasMulU24 = getGeneration() >= EVERGREEN;
75   HasMulI24 = hasCaymanISA();
76 
77   return *this;
78 }
79 
80 GCNSubtarget &
81 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
82                                               StringRef GPU, StringRef FS) {
83   // Determine default and user-specified characteristics
84   //
85   // We want to be able to turn these off, but making this a subtarget feature
86   // for SI has the unhelpful behavior that it unsets everything else if you
87   // disable it.
88   //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
91 
92   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
93 
  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
95   if (isAmdHsaOS())
96     FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
97 
98   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
99 
100   // Disable mutually exclusive bits.
101   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
102     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
103       FullFS += "-wavefrontsize16,";
104     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
105       FullFS += "-wavefrontsize32,";
106     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
107       FullFS += "-wavefrontsize64,";
108   }
109 
110   FullFS += FS;
111 
112   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
113 
  // Implement the "generic" processor, which acts as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
118   if (Gen == AMDGPUSubtarget::INVALID) {
119      Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
120                                         : AMDGPUSubtarget::SOUTHERN_ISLANDS;
121   }
122 
123   // We don't support FP64 for EG/NI atm.
124   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
125 
  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
129   assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
134   if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
135     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
136     FlatForGlobal = true;
137   }
138   // Unless +-flat-for-global is specified, use MUBUF instructions for global
139   // address space access if flat operations are not available.
140   if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
141     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
142     FlatForGlobal = false;
143   }
144 
145   // Set defaults if needed.
146   if (MaxPrivateElementSize == 0)
147     MaxPrivateElementSize = 4;
148 
149   if (LDSBankCount == 0)
150     LDSBankCount = 32;
151 
152   if (TT.getArch() == Triple::amdgcn) {
153     if (LocalMemorySize == 0)
154       LocalMemorySize = 32768;
155 
    // Do something sensible for an unspecified target.
157     if (!HasMovrel && !HasVGPRIndexMode)
158       HasMovrel = true;
159   }
160 
161   // Don't crash on invalid devices.
162   if (WavefrontSizeLog2 == 0)
163     WavefrontSizeLog2 = 5;
164 
165   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
166 
167   TargetID.setTargetIDFromFeaturesString(FS);
168 
169   LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
170                     << TargetID.getXnackSetting() << '\n');
171   LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
172                     << TargetID.getSramEccSetting() << '\n');
173 
174   return *this;
175 }
176 
177 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
178   TargetTriple(TT),
179   GCN3Encoding(false),
180   Has16BitInsts(false),
181   HasMadMixInsts(false),
182   HasMadMacF32Insts(false),
183   HasDsSrc2Insts(false),
184   HasSDWA(false),
185   HasVOP3PInsts(false),
186   HasMulI24(true),
187   HasMulU24(true),
188   HasInv2PiInlineImm(false),
189   HasFminFmaxLegacy(true),
190   EnablePromoteAlloca(false),
191   HasTrigReducedRange(false),
192   MaxWavesPerEU(10),
193   LocalMemorySize(0),
194   WavefrontSizeLog2(0)
195   { }
196 
197 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
198                            const GCNTargetMachine &TM) :
199     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
200     AMDGPUSubtarget(TT),
201     TargetTriple(TT),
202     TargetID(*this),
203     Gen(INVALID),
204     InstrItins(getInstrItineraryForCPU(GPU)),
205     LDSBankCount(0),
206     MaxPrivateElementSize(0),
207 
208     FastFMAF32(false),
209     FastDenormalF32(false),
210     HalfRate64Ops(false),
211     FullRate64Ops(false),
212 
213     FlatForGlobal(false),
214     AutoWaitcntBeforeBarrier(false),
215     UnalignedScratchAccess(false),
216     UnalignedAccessMode(false),
217 
218     HasApertureRegs(false),
219     SupportsXNACK(false),
220     EnableXNACK(false),
221     EnableTgSplit(false),
222     EnableCuMode(false),
223     TrapHandler(false),
224 
225     EnableLoadStoreOpt(false),
226     EnableUnsafeDSOffsetFolding(false),
227     EnableSIScheduler(false),
228     EnableDS128(false),
229     EnablePRTStrictNull(false),
230     DumpCode(false),
231 
232     FP64(false),
233     CIInsts(false),
234     GFX8Insts(false),
235     GFX9Insts(false),
236     GFX90AInsts(false),
237     GFX10Insts(false),
238     GFX10_3Insts(false),
239     GFX7GFX8GFX9Insts(false),
240     SGPRInitBug(false),
241     HasSMemRealTime(false),
242     HasIntClamp(false),
243     HasFmaMixInsts(false),
244     HasMovrel(false),
245     HasVGPRIndexMode(false),
246     HasScalarStores(false),
247     HasScalarAtomics(false),
248     HasSDWAOmod(false),
249     HasSDWAScalar(false),
250     HasSDWASdst(false),
251     HasSDWAMac(false),
252     HasSDWAOutModsVOPC(false),
253     HasDPP(false),
254     HasDPP8(false),
255     Has64BitDPP(false),
256     HasPackedFP32Ops(false),
257     HasExtendedImageInsts(false),
258     HasR128A16(false),
259     HasGFX10A16(false),
260     HasG16(false),
261     HasNSAEncoding(false),
262     GFX10_BEncoding(false),
263     HasDLInsts(false),
264     HasDot1Insts(false),
265     HasDot2Insts(false),
266     HasDot3Insts(false),
267     HasDot4Insts(false),
268     HasDot5Insts(false),
269     HasDot6Insts(false),
270     HasMAIInsts(false),
271     HasPkFmacF16Inst(false),
272     HasAtomicFaddInsts(false),
273     SupportsSRAMECC(false),
274     EnableSRAMECC(false),
275     HasNoSdstCMPX(false),
276     HasVscnt(false),
277     HasGetWaveIdInst(false),
278     HasSMemTimeInst(false),
279     HasShaderCyclesRegister(false),
280     HasRegisterBanking(false),
281     HasVOP3Literal(false),
282     HasNoDataDepHazard(false),
283     FlatAddressSpace(false),
284     FlatInstOffsets(false),
285     FlatGlobalInsts(false),
286     FlatScratchInsts(false),
287     ScalarFlatScratchInsts(false),
288     AddNoCarryInsts(false),
289     HasUnpackedD16VMem(false),
290     LDSMisalignedBug(false),
291     HasMFMAInlineLiteralBug(false),
292     UnalignedBufferAccess(false),
293     UnalignedDSAccess(false),
294     HasPackedTID(false),
295 
296     ScalarizeGlobal(false),
297 
298     HasVcmpxPermlaneHazard(false),
299     HasVMEMtoScalarWriteHazard(false),
300     HasSMEMtoVectorWriteHazard(false),
301     HasInstFwdPrefetchBug(false),
302     HasVcmpxExecWARHazard(false),
303     HasLdsBranchVmemWARHazard(false),
304     HasNSAtoVMEMBug(false),
305     HasOffset3fBug(false),
306     HasFlatSegmentOffsetBug(false),
307     HasImageStoreD16Bug(false),
308     HasImageGather4D16Bug(false),
309 
310     FeatureDisable(false),
311     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
312     TLInfo(TM, *this),
313     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
314   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
315   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
316   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
317   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
318   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
321 }
322 
323 bool GCNSubtarget::enableFlatScratch() const {
324   return EnableFlatScratch && hasFlatScratchInsts();
325 }
326 
327 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
328   if (getGeneration() < GFX10)
329     return 1;
330 
331   switch (Opcode) {
332   case AMDGPU::V_LSHLREV_B64_e64:
333   case AMDGPU::V_LSHLREV_B64_gfx10:
334   case AMDGPU::V_LSHL_B64_e64:
335   case AMDGPU::V_LSHRREV_B64_e64:
336   case AMDGPU::V_LSHRREV_B64_gfx10:
337   case AMDGPU::V_LSHR_B64_e64:
338   case AMDGPU::V_ASHRREV_I64_e64:
339   case AMDGPU::V_ASHRREV_I64_gfx10:
340   case AMDGPU::V_ASHR_I64_e64:
341     return 1;
342   }
343 
344   return 2;
345 }
346 
347 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
348   const Function &F) const {
349   if (NWaves == 1)
350     return getLocalMemorySize();
351   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
352   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
353   if (!WorkGroupsPerCu)
354     return 0;
355   unsigned MaxWaves = getMaxWavesPerEU();
356   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
357 }
358 
359 // FIXME: Should return min,max range.
360 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
361   const Function &F) const {
362   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
363   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
364   if (!MaxWorkGroupsPerCu)
365     return 0;
366 
367   const unsigned WaveSize = getWavefrontSize();
368 
  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute the restriction based on LDS usage.
372   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
373 
374   // This can be queried with more LDS than is possible, so just assume the
375   // worst.
376   if (NumGroups == 0)
377     return 1;
378 
379   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
380 
381   // Round to the number of waves.
382   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
383   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
384 
385   // Clamp to the maximum possible number of waves.
386   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
387 
388   // FIXME: Needs to be a multiple of the group size?
389   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
390 
391   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
392          "computed invalid occupancy");
393   return MaxWaves;
394 }
395 
396 unsigned
397 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
398   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
399   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
400 }
401 
402 std::pair<unsigned, unsigned>
403 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
404   switch (CC) {
405   case CallingConv::AMDGPU_VS:
406   case CallingConv::AMDGPU_LS:
407   case CallingConv::AMDGPU_HS:
408   case CallingConv::AMDGPU_ES:
409   case CallingConv::AMDGPU_GS:
410   case CallingConv::AMDGPU_PS:
411     return std::make_pair(1, getWavefrontSize());
412   default:
413     return std::make_pair(1u, getMaxFlatWorkGroupSize());
414   }
415 }
416 
417 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
418   const Function &F) const {
419   // Default minimum/maximum flat work group sizes.
420   std::pair<unsigned, unsigned> Default =
421     getDefaultFlatWorkGroupSize(F.getCallingConv());
422 
423   // Requested minimum/maximum flat work group sizes.
424   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
425     F, "amdgpu-flat-work-group-size", Default);
426 
  // Make sure requested minimum does not exceed requested maximum.
428   if (Requested.first > Requested.second)
429     return Default;
430 
431   // Make sure requested values do not violate subtarget's specifications.
432   if (Requested.first < getMinFlatWorkGroupSize())
433     return Default;
434   if (Requested.second > getMaxFlatWorkGroupSize())
435     return Default;
436 
437   return Requested;
438 }
439 
440 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
441   const Function &F) const {
442   // Default minimum/maximum number of waves per execution unit.
443   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
444 
445   // Default/requested minimum/maximum flat work group sizes.
446   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
447 
448   // If minimum/maximum flat work group sizes were explicitly requested using
449   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
450   // number of waves per execution unit to values implied by requested
451   // minimum/maximum flat work group sizes.
452   unsigned MinImpliedByFlatWorkGroupSize =
453     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
454   Default.first = MinImpliedByFlatWorkGroupSize;
455   bool RequestedFlatWorkGroupSize =
456       F.hasFnAttribute("amdgpu-flat-work-group-size");
457 
458   // Requested minimum/maximum number of waves per execution unit.
459   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
460     F, "amdgpu-waves-per-eu", Default, true);
461 
  // Make sure requested minimum does not exceed requested maximum.
463   if (Requested.second && Requested.first > Requested.second)
464     return Default;
465 
466   // Make sure requested values do not violate subtarget's specifications.
467   if (Requested.first < getMinWavesPerEU() ||
468       Requested.second > getMaxWavesPerEU())
469     return Default;
470 
471   // Make sure requested values are compatible with values implied by requested
472   // minimum/maximum flat work group sizes.
473   if (RequestedFlatWorkGroupSize &&
474       Requested.first < MinImpliedByFlatWorkGroupSize)
475     return Default;
476 
477   return Requested;
478 }
479 
480 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
481   auto Node = Kernel.getMetadata("reqd_work_group_size");
482   if (Node && Node->getNumOperands() == 3)
483     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
484   return std::numeric_limits<unsigned>::max();
485 }
486 
487 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
488   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
489 }
490 
491 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
492                                            unsigned Dimension) const {
493   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
494   if (ReqdSize != std::numeric_limits<unsigned>::max())
495     return ReqdSize - 1;
496   return getFlatWorkGroupSizes(Kernel).second - 1;
497 }
498 
499 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
500   Function *Kernel = I->getParent()->getParent();
501   unsigned MinSize = 0;
502   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
503   bool IdQuery = false;
504 
  // If reqd_work_group_size is present it narrows the value down.
506   if (auto *CI = dyn_cast<CallInst>(I)) {
507     const Function *F = CI->getCalledFunction();
508     if (F) {
509       unsigned Dim = UINT_MAX;
510       switch (F->getIntrinsicID()) {
511       case Intrinsic::amdgcn_workitem_id_x:
512       case Intrinsic::r600_read_tidig_x:
513         IdQuery = true;
514         LLVM_FALLTHROUGH;
515       case Intrinsic::r600_read_local_size_x:
516         Dim = 0;
517         break;
518       case Intrinsic::amdgcn_workitem_id_y:
519       case Intrinsic::r600_read_tidig_y:
520         IdQuery = true;
521         LLVM_FALLTHROUGH;
522       case Intrinsic::r600_read_local_size_y:
523         Dim = 1;
524         break;
525       case Intrinsic::amdgcn_workitem_id_z:
526       case Intrinsic::r600_read_tidig_z:
527         IdQuery = true;
528         LLVM_FALLTHROUGH;
529       case Intrinsic::r600_read_local_size_z:
530         Dim = 2;
531         break;
532       default:
533         break;
534       }
535 
536       if (Dim <= 3) {
537         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
538         if (ReqdSize != std::numeric_limits<unsigned>::max())
539           MinSize = MaxSize = ReqdSize;
540       }
541     }
542   }
543 
544   if (!MaxSize)
545     return false;
546 
547   // Range metadata is [Lo, Hi). For ID query we need to pass max size
548   // as Hi. For size query we need to pass Hi + 1.
549   if (IdQuery)
550     MinSize = 0;
551   else
552     ++MaxSize;
553 
554   MDBuilder MDB(I->getContext());
555   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
556                                                   APInt(32, MaxSize));
557   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
558   return true;
559 }
560 
561 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
562   if (isMesaKernel(F))
563     return 16;
564   return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
565 }
566 
567 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
568                                                  Align &MaxAlign) const {
569   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
570          F.getCallingConv() == CallingConv::SPIR_KERNEL);
571 
572   const DataLayout &DL = F.getParent()->getDataLayout();
573   uint64_t ExplicitArgBytes = 0;
574   MaxAlign = Align(1);
575 
576   for (const Argument &Arg : F.args()) {
577     const bool IsByRef = Arg.hasByRefAttr();
578     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
579     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
580     if (!Alignment)
581       Alignment = DL.getABITypeAlign(ArgTy);
582 
583     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
584     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
585     MaxAlign = max(MaxAlign, Alignment);
586   }
587 
588   return ExplicitArgBytes;
589 }
590 
591 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
592                                                 Align &MaxAlign) const {
593   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
594 
595   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
596 
597   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
598   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
599   if (ImplicitBytes != 0) {
600     const Align Alignment = getAlignmentForImplicitArgPtr();
601     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
602   }
603 
604   // Being able to dereference past the end is useful for emitting scalar loads.
605   return alignTo(TotalSize, 4);
606 }
607 
608 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
609   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
610                                   : AMDGPUDwarfFlavour::Wave64;
611 }
612 
613 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
614                              const TargetMachine &TM) :
615   R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
616   AMDGPUSubtarget(TT),
617   InstrInfo(*this),
618   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
619   FMA(false),
620   CaymanISA(false),
621   CFALUBug(false),
622   HasVertexCache(false),
623   R600ALUInst(false),
624   FP64(false),
625   TexVTXClauseSize(0),
626   Gen(R600),
627   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
628   InstrItins(getInstrItineraryForCPU(GPU)) { }
629 
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
632   // Track register pressure so the scheduler can try to decrease
633   // pressure once register usage is above the threshold defined by
634   // SIRegisterInfo::getRegPressureSetLimit()
635   Policy.ShouldTrackPressure = true;
636 
  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
639   Policy.OnlyTopDown = false;
640   Policy.OnlyBottomUp = false;
641 
642   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
643   if (!enableSIScheduler())
644     Policy.ShouldTrackLaneMasks = true;
645 }
646 
647 bool GCNSubtarget::hasMadF16() const {
648   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
649 }
650 
651 bool GCNSubtarget::useVGPRIndexMode() const {
652   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
653 }
654 
655 bool GCNSubtarget::useAA() const { return UseAA; }
656 
657 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
658   if (getGeneration() >= AMDGPUSubtarget::GFX10)
659     return getMaxWavesPerEU();
660 
661   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
662     if (SGPRs <= 80)
663       return 10;
664     if (SGPRs <= 88)
665       return 9;
666     if (SGPRs <= 100)
667       return 8;
668     return 7;
669   }
670   if (SGPRs <= 48)
671     return 10;
672   if (SGPRs <= 56)
673     return 9;
674   if (SGPRs <= 64)
675     return 8;
676   if (SGPRs <= 72)
677     return 7;
678   if (SGPRs <= 80)
679     return 6;
680   return 5;
681 }
682 
683 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
684   unsigned MaxWaves = getMaxWavesPerEU();
685   unsigned Granule = getVGPRAllocGranule();
686   if (VGPRs < Granule)
687     return MaxWaves;
688   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
689   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
690 }
691 
692 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
693   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
694   if (getGeneration() >= AMDGPUSubtarget::GFX10)
695     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
696 
697   if (MFI.hasFlatScratchInit()) {
698     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
699       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
700     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
701       return 4; // FLAT_SCRATCH, VCC (in that order).
702   }
703 
704   if (isXNACKEnabled())
705     return 4; // XNACK, VCC (in that order).
706   return 2; // VCC.
707 }
708 
709 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
710                                         unsigned NumSGPRs,
711                                         unsigned NumVGPRs) const {
712   unsigned Occupancy =
713     std::min(getMaxWavesPerEU(),
714              getOccupancyWithLocalMemSize(LDSSize, F));
715   if (NumSGPRs)
716     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
717   if (NumVGPRs)
718     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
719   return Occupancy;
720 }
721 
722 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
723   const Function &F = MF.getFunction();
724   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
725 
726   // Compute maximum number of SGPRs function can use using default/requested
727   // minimum number of waves per execution unit.
728   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
729   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
730   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
731 
732   // Check if maximum number of SGPRs was explicitly requested using
733   // "amdgpu-num-sgpr" attribute.
734   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
735     unsigned Requested = AMDGPU::getIntegerAttribute(
736       F, "amdgpu-num-sgpr", MaxNumSGPRs);
737 
738     // Make sure requested value does not violate subtarget's specifications.
739     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
740       Requested = 0;
741 
742     // If more SGPRs are required to support the input user/system SGPRs,
743     // increase to accommodate them.
744     //
745     // FIXME: This really ends up using the requested number of SGPRs + number
746     // of reserved special registers in total. Theoretically you could re-use
747     // the last input registers for these special registers, but this would
748     // require a lot of complexity to deal with the weird aliasing.
749     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
750     if (Requested && Requested < InputNumSGPRs)
751       Requested = InputNumSGPRs;
752 
753     // Make sure requested value is compatible with values implied by
754     // default/requested minimum/maximum number of waves per execution unit.
755     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
756       Requested = 0;
757     if (WavesPerEU.second &&
758         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
759       Requested = 0;
760 
761     if (Requested)
762       MaxNumSGPRs = Requested;
763   }
764 
765   if (hasSGPRInitBug())
766     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
767 
768   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
769                   MaxAddressableNumSGPRs);
770 }
771 
772 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
773   const Function &F = MF.getFunction();
774   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
775 
776   // Compute maximum number of VGPRs function can use using default/requested
777   // minimum number of waves per execution unit.
778   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
779   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
780 
781   // Check if maximum number of VGPRs was explicitly requested using
782   // "amdgpu-num-vgpr" attribute.
783   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
784     unsigned Requested = AMDGPU::getIntegerAttribute(
785       F, "amdgpu-num-vgpr", MaxNumVGPRs);
786 
787     if (hasGFX90AInsts())
788       Requested *= 2;
789 
790     // Make sure requested value is compatible with values implied by
791     // default/requested minimum/maximum number of waves per execution unit.
792     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
793       Requested = 0;
794     if (WavesPerEU.second &&
795         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
796       Requested = 0;
797 
798     if (Requested)
799       MaxNumVGPRs = Requested;
800   }
801 
802   return MaxNumVGPRs;
803 }
804 
805 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
806                                          int UseOpIdx, SDep &Dep) const {
807   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
808       !Def->isInstr() || !Use->isInstr())
809     return;
810 
811   MachineInstr *DefI = Def->getInstr();
812   MachineInstr *UseI = Use->getInstr();
813 
814   if (DefI->isBundle()) {
815     const SIRegisterInfo *TRI = getRegisterInfo();
816     auto Reg = Dep.getReg();
817     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
818     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
819     unsigned Lat = 0;
820     for (++I; I != E && I->isBundledWithPred(); ++I) {
821       if (I->modifiesRegister(Reg, TRI))
822         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
823       else if (Lat)
824         --Lat;
825     }
826     Dep.setLatency(Lat);
827   } else if (UseI->isBundle()) {
828     const SIRegisterInfo *TRI = getRegisterInfo();
829     auto Reg = Dep.getReg();
830     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
831     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
832     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
833     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
834       if (I->readsRegister(Reg, TRI))
835         break;
836       --Lat;
837     }
838     Dep.setLatency(Lat);
839   }
840 }
841 
842 namespace {
843 struct FillMFMAShadowMutation : ScheduleDAGMutation {
844   const SIInstrInfo *TII;
845 
846   ScheduleDAGMI *DAG;
847 
848   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
849 
850   bool isSALU(const SUnit *SU) const {
851     const MachineInstr *MI = SU->getInstr();
852     return MI && TII->isSALU(*MI) && !MI->isTerminator();
853   }
854 
855   bool isVALU(const SUnit *SU) const {
856     const MachineInstr *MI = SU->getInstr();
857     return MI && TII->isVALU(*MI);
858   }
859 
860   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
861     if (Pred->NodeNum < Succ->NodeNum)
862       return true;
863 
864     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
865 
866     for (unsigned I = 0; I < Succs.size(); ++I) {
867       for (const SDep &SI : Succs[I]->Succs) {
868         const SUnit *SU = SI.getSUnit();
869         if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
870           Succs.push_back(SU);
871       }
872     }
873 
874     SmallPtrSet<const SUnit*, 32> Visited;
875     while (!Preds.empty()) {
876       const SUnit *SU = Preds.pop_back_val();
877       if (llvm::is_contained(Succs, SU))
878         return false;
879       Visited.insert(SU);
880       for (const SDep &SI : SU->Preds)
881         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
882           Preds.push_back(SI.getSUnit());
883     }
884 
885     return true;
886   }
887 
  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
890   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
891                          SmallPtrSetImpl<SUnit *> &Visited) const {
892     SmallVector<SUnit *, 8> Worklist({To});
893     unsigned Linked = 0;
894 
895     while (!Worklist.empty() && MaxChain-- > 0) {
896       SUnit *SU = Worklist.pop_back_val();
897       if (!Visited.insert(SU).second)
898         continue;
899 
900       LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
901                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
902 
903       if (SU->addPred(SDep(From, SDep::Artificial), false))
904         ++Linked;
905 
906       for (SDep &SI : From->Succs) {
907         SUnit *SUv = SI.getSUnit();
908         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
909           SUv->addPred(SDep(SU, SDep::Artificial), false);
910       }
911 
912       for (SDep &SI : SU->Succs) {
913         SUnit *Succ = SI.getSUnit();
914         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
915           Worklist.push_back(Succ);
916       }
917     }
918 
919     return Linked;
920   }
921 
922   void apply(ScheduleDAGInstrs *DAGInstrs) override {
923     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
924     if (!ST.hasMAIInsts() || DisablePowerSched)
925       return;
926     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
927     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
928     if (!TSchedModel || DAG->SUnits.empty())
929       return;
930 
    // Scan for MFMA long latency instructions and try to add dependencies on
    // available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU instructions rather than VALU
    // prevents power consumption bursts and throttling.
935     auto LastSALU = DAG->SUnits.begin();
936     auto E = DAG->SUnits.end();
937     SmallPtrSet<SUnit*, 32> Visited;
938     for (SUnit &SU : DAG->SUnits) {
939       MachineInstr &MAI = *SU.getInstr();
940       if (!TII->isMAI(MAI) ||
941            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
942            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
943         continue;
944 
945       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
946 
947       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
948                  dbgs() << "Need " << Lat
949                         << " instructions to cover latency.\n");
950 
951       // Find up to Lat independent scalar instructions as early as
952       // possible such that they can be scheduled after this MFMA.
953       for ( ; Lat && LastSALU != E; ++LastSALU) {
954         if (Visited.count(&*LastSALU))
955           continue;
956 
957         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
958           continue;
959 
960         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
961       }
962     }
963   }
964 };
965 } // namespace
966 
967 void GCNSubtarget::getPostRAMutations(
968     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
969   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
970 }
971 
972 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
973   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
974     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
975   else
976     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
977 }
978 
979 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
980   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
981     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
982   else
983     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
984 }
985