1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUCallLowering.h"
17 #include "AMDGPUInstructionSelector.h"
18 #include "AMDGPULegalizerInfo.h"
19 #include "AMDGPURegisterBankInfo.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #define GET_SUBTARGETINFO_TARGET_DESC
42 #define GET_SUBTARGETINFO_CTOR
43 #undef AMDGPUSubtarget
44 #include "R600GenSubtargetInfo.inc"
45 
46 static cl::opt<bool> DisablePowerSched(
47   "amdgpu-disable-power-sched",
48   cl::desc("Disable scheduling to minimize mAI power bursts"),
49   cl::init(false));
50 
51 static cl::opt<bool> EnableVGPRIndexMode(
52   "amdgpu-vgpr-index-mode",
53   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
54   cl::init(false));
55 
56 static cl::opt<bool> EnableFlatScratch(
57   "amdgpu-enable-flat-scratch",
58   cl::desc("Use flat scratch instructions"),
59   cl::init(false));
60 
61 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
62                            cl::desc("Enable the use of AA during codegen."),
63                            cl::init(true));
64 
// Defined out-of-line so that the unique_ptr members (CallLoweringInfo,
// Legalizer, RegBankInfo, InstSelector) are destroyed here, where their
// pointee types are complete.
GCNSubtarget::~GCNSubtarget() = default;
66 
// Finalize R600-family subtarget state from the feature string.
// Always-on defaults are prepended so that FS, appended after them, can still
// override; generation-dependent flags are derived afterwards.
// Returns *this so it can be used inside a constructor init list.
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Unsigned 24-bit multiply exists from Evergreen onward; the signed form is
  // only present on Cayman.
  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}
79 
// Resolve the final feature configuration for a GCN subtarget: combine
// baked-in defaults, OS-mandated features, and the user feature string FS
// (FS is appended last, so it wins any conflict), then patch up
// interdependent or unset state. Returns *this so it can be called from the
// constructor init list (it must run before members that depend on features).
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  // If the user explicitly selected any wavefront size, turn the other sizes
  // off so only the requested one survives.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
     Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                        : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  // Default to wave64 (2^5) if the target did not set a wavefront size.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Derive the xnack/sramecc target-ID settings from the raw feature string.
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
176 
// Base-class constructor shared by R600 and GCN subtargets. Establishes
// conservative defaults for the common feature flags; the real values are
// filled in later by ParseSubtargetFeatures via the derived classes'
// initializeSubtargetDependencies.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }
196 
// GCN subtarget constructor. Every feature flag starts false here; the
// InstrInfo member is deliberately initialized with the result of
// initializeSubtargetDependencies so that feature parsing runs before
// TLInfo/FrameLowering (and the GlobalISel objects below) observe the
// subtarget state. Member order therefore matters and must match the
// declaration order in the header.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Speed/precision characteristics.
    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    // Memory access modes.
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    // Optimization toggles.
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    // Per-generation instruction availability.
    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    // Hardware hazard/bug workaround flags.
    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    // Feature parsing happens here, before TLInfo/FrameLowering below.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  // GlobalISel support objects; the instruction selector needs the concrete
  // AMDGPURegisterBankInfo, hence the static_cast.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
328 
// True when scratch (private) memory accesses should use flat-scratch
// instructions: requires both the -amdgpu-enable-flat-scratch flag and
// hardware support for flat scratch.
bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}
332 
// Returns how many constant-bus (scalar register / literal) operands a single
// instruction with the given opcode may read on this subtarget.
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  // Pre-GFX10 hardware allows only one constant-bus read per instruction.
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  // 64-bit shifts keep the single-read limit even on GFX10+.
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  // GFX10+ generally permits two constant-bus reads.
  return 2;
}
352 
353 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
354   const Function &F) const {
355   if (NWaves == 1)
356     return getLocalMemorySize();
357   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
358   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
359   if (!WorkGroupsPerCu)
360     return 0;
361   unsigned MaxWaves = getMaxWavesPerEU();
362   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
363 }
364 
365 // FIXME: Should return min,max range.
366 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
367   const Function &F) const {
368   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
369   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
370   if (!MaxWorkGroupsPerCu)
371     return 0;
372 
373   const unsigned WaveSize = getWavefrontSize();
374 
375   // FIXME: Do we need to account for alignment requirement of LDS rounding the
376   // size up?
377   // Compute restriction based on LDS usage
378   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
379 
380   // This can be queried with more LDS than is possible, so just assume the
381   // worst.
382   if (NumGroups == 0)
383     return 1;
384 
385   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
386 
387   // Round to the number of waves.
388   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
389   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
390 
391   // Clamp to the maximum possible number of waves.
392   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
393 
394   // FIXME: Needs to be a multiple of the group size?
395   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
396 
397   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
398          "computed invalid occupancy");
399   return MaxWaves;
400 }
401 
// Convenience overload: queries occupancy using the LDS size already recorded
// in the machine function's SIMachineFunctionInfo.
unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}
407 
408 std::pair<unsigned, unsigned>
409 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
410   switch (CC) {
411   case CallingConv::AMDGPU_VS:
412   case CallingConv::AMDGPU_LS:
413   case CallingConv::AMDGPU_HS:
414   case CallingConv::AMDGPU_ES:
415   case CallingConv::AMDGPU_GS:
416   case CallingConv::AMDGPU_PS:
417     return std::make_pair(1, getWavefrontSize());
418   default:
419     return std::make_pair(1u, getMaxFlatWorkGroupSize());
420   }
421 }
422 
// Returns the {min, max} flat work group size for F: the value of the
// "amdgpu-flat-work-group-size" attribute when present and valid, otherwise
// the calling-convention default. Invalid requests fall back to the default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
445 
// Returns the {min, max} waves-per-EU for F, honoring the
// "amdgpu-waves-per-eu" attribute when valid, and tightening the default
// minimum to what the (possibly requested) flat work group size implies.
// Any invalid request falls back to the computed default.
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  // (OnlyFirstRequired=true: a lone minimum is accepted, leaving max as 0.)
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
485 
// Reads dimension Dim (0=x, 1=y, 2=z) of the kernel's reqd_work_group_size
// metadata. Returns UINT_MAX when the metadata is absent or does not have
// exactly three operands.
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}
492 
493 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
494   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
495 }
496 
497 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
498                                            unsigned Dimension) const {
499   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
500   if (ReqdSize != std::numeric_limits<unsigned>::max())
501     return ReqdSize - 1;
502   return getFlatWorkGroupSizes(Kernel).second - 1;
503 }
504 
// Attaches !range metadata to I bounding workitem-id / local-size intrinsic
// results for the enclosing kernel. Returns true if metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  // IdQuery distinguishes workitem-id queries (range is [0, size)) from
  // local-size queries (range is [0, size]).
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      // Dim is only ever 0, 1, 2 or UINT_MAX here, so this test just filters
      // the "no recognized intrinsic" case.
      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
566 
// Number of implicit kernel-argument bytes appended after the explicit
// arguments: a fixed 16 for Mesa kernels, otherwise whatever the
// "amdgpu-implicitarg-num-bytes" attribute requests (default 0).
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}
572 
// Computes the total byte size of F's explicit kernel arguments, laying each
// argument out at its ABI (or explicitly requested) alignment. MaxAlign is
// an out-parameter set to the largest alignment seen (at least 1).
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    // byref arguments are passed in the kernarg segment with the pointee's
    // type and alignment.
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    // Place this argument at the next suitably aligned offset.
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}
596 
// Total kernarg segment size for F: explicit args (plus the target's
// explicit-offset prologue), then any implicit args aligned to their pointer
// alignment, rounded up to 4 bytes so scalar loads may read past the end.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    // NOTE(review): this branch recomputes TotalSize from ExplicitArgBytes
    // without ExplicitOffset — presumably intentional for the implicit-arg
    // layout, but worth confirming against the kernarg ABI documentation.
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
613 
614 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
615   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
616                                   : AMDGPUDwarfFlavour::Wave64;
617 }
618 
// R600 subtarget constructor. Feature flags default to off; TLInfo is
// initialized with the result of initializeSubtargetDependencies so that
// feature parsing completes before target lowering observes the subtarget.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }
635 
// Tune the generic machine scheduler's policy for AMDGPU: occupancy is
// dominated by register pressure, so favor pressure tracking and
// bidirectional scheduling.
void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}
652 
// True when the v_mad_f16 pseudo maps to a real MC opcode on this subtarget,
// i.e. the 3-operand f16 multiply-add instruction is encodable.
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}
656 
657 bool GCNSubtarget::useVGPRIndexMode() const {
658   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
659 }
660 
// Whether codegen may consult alias analysis; controlled by the
// -amdgpu-use-aa-in-codegen flag (defaults to true).
bool GCNSubtarget::useAA() const { return UseAA; }
662 
663 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
664   if (getGeneration() >= AMDGPUSubtarget::GFX10)
665     return getMaxWavesPerEU();
666 
667   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
668     if (SGPRs <= 80)
669       return 10;
670     if (SGPRs <= 88)
671       return 9;
672     if (SGPRs <= 100)
673       return 8;
674     return 7;
675   }
676   if (SGPRs <= 48)
677     return 10;
678   if (SGPRs <= 56)
679     return 9;
680   if (SGPRs <= 64)
681     return 8;
682   if (SGPRs <= 72)
683     return 7;
684   if (SGPRs <= 80)
685     return 6;
686   return 5;
687 }
688 
689 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
690   unsigned MaxWaves = getMaxWavesPerEU();
691   unsigned Granule = getVGPRAllocGranule();
692   if (VGPRs < Granule)
693     return MaxWaves;
694   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
695   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
696 }
697 
// Number of SGPRs reserved at the top of the allocation for special uses
// (VCC, and where applicable FLAT_SCRATCH and XNACK), depending on
// generation and whether the function initializes flat scratch.
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
714 
715 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
716                                         unsigned NumSGPRs,
717                                         unsigned NumVGPRs) const {
718   unsigned Occupancy =
719     std::min(getMaxWavesPerEU(),
720              getOccupancyWithLocalMemSize(LDSSize, F));
721   if (NumSGPRs)
722     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
723   if (NumVGPRs)
724     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
725   return Occupancy;
726 }
727 
// Maximum number of SGPRs the function may allocate, derived from its
// waves-per-EU range and optionally narrowed by the "amdgpu-num-sgpr"
// attribute; invalid requests are ignored. The reserved special registers
// are subtracted from the result.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 below means "ignore the attribute".)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
777 
// Maximum number of VGPRs the function may allocate, derived from its
// waves-per-EU range and optionally narrowed by the "amdgpu-num-vgpr"
// attribute; invalid requests are ignored.
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // NOTE(review): gfx90a doubles the requested count — presumably because
    // its unified VGPR/AGPR file counts registers in pairs; confirm against
    // the gfx90a register-budget documentation.
    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    // (Requested == 0 means "ignore the attribute".)
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
810 
// Fix up data-dependence latencies involving bundles: the default model
// charges the whole bundle's latency, but the relevant def/use may sit part
// way through the bundle, so the effective latency is adjusted by the
// instruction's position inside it.
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  // Only data dependencies on a real register between real instructions
  // are adjusted.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // Defining bundle: take the latency of the last instruction in the
    // bundle that writes the register, minus the slots that execute after it
    // (each subsequent bundled instruction hides one cycle).
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // Using bundle: start from the def's latency and subtract one cycle per
    // bundled instruction that executes before the first reader of the
    // register.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}
847 
namespace {
/// Post-RA DAG mutation that adds artificial edges so independent SALU
/// instructions are scheduled into the latency shadow of long-running MFMA
/// instructions, rather than VALU instructions, to avoid power bursts that
/// can trigger throttling (see apply() below).
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  // True for a non-terminator SALU machine instruction.
  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  // True for a VALU machine instruction.
  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  /// Return true iff adding an edge Pred -> Succ cannot create a cycle,
  /// i.e. Pred is not (transitively) reachable from Succ.
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    // Node numbers follow the original program order, so a predecessor with
    // a smaller number can never be downstream of Succ.
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    // Collect the full successor closure of Succ (Succs doubles as both
    // worklist and visited set; indices already processed are never revisited).
    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    // Walk Pred's predecessor closure; if it ever meets Succ's successor
    // closure the new edge would close a cycle.
    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    // BFS over SALU successors of To, consuming one unit of the MaxChain
    // budget per node processed.
    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      // Force the SALU instruction after the MFMA (From) in the schedule.
      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      // Keep VALU successors of the MFMA after the linked SALU instruction
      // as well, so the shadow is not filled with VALU work.
      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      // Extend the chain through SALU successors of SU where no cycle would
      // be created.
      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      // Skip the ACCVGPR copy pseudos: they are classified as MAI but are
      // cheap moves, not long-latency MFMAs.
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace
972 
973 void GCNSubtarget::getPostRAMutations(
974     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
975   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
976 }
977 
978 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
979   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
980     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
981   else
982     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
983 }
984 
985 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
986   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
987     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
988   else
989     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
990 }
991