1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Implements the AMDGPU-specific subclass of TargetSubtargetInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUCallLowering.h"
17 #include "AMDGPUInstructionSelector.h"
18 #include "AMDGPULegalizerInfo.h"
19 #include "AMDGPURegisterBankInfo.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #define GET_SUBTARGETINFO_TARGET_DESC
42 #define GET_SUBTARGETINFO_CTOR
43 #undef AMDGPUSubtarget
44 #include "R600GenSubtargetInfo.inc"
45 
46 static cl::opt<bool> DisablePowerSched(
47   "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
49   cl::init(false));
50 
51 static cl::opt<bool> EnableVGPRIndexMode(
52   "amdgpu-vgpr-index-mode",
53   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
54   cl::init(false));
55 
56 static cl::opt<bool> EnableFlatScratch(
57   "amdgpu-enable-flat-scratch",
58   cl::desc("Use flat scratch instructions"),
59   cl::init(false));
60 
61 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
62                            cl::desc("Enable the use of AA during codegen."),
63                            cl::init(true));
64 
65 GCNSubtarget::~GCNSubtarget() = default;
66 
67 R600Subtarget &
68 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
69                                                StringRef GPU, StringRef FS) {
70   SmallString<256> FullFS("+promote-alloca,");
71   FullFS += FS;
72   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
73 
74   HasMulU24 = getGeneration() >= EVERGREEN;
75   HasMulI24 = hasCaymanISA();
76 
77   return *this;
78 }
79 
80 GCNSubtarget &
81 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
82                                               StringRef GPU, StringRef FS) {
83   // Determine default and user-specified characteristics
84   //
85   // We want to be able to turn these off, but making this a subtarget feature
86   // for SI has the unhelpful behavior that it unsets everything else if you
87   // disable it.
88   //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
91 
92   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
93 
  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
95   if (isAmdHsaOS())
96     FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
97 
98   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
99 
100   // Disable mutually exclusive bits.
101   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
102     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
103       FullFS += "-wavefrontsize16,";
104     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
105       FullFS += "-wavefrontsize32,";
106     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
107       FullFS += "-wavefrontsize64,";
108   }
109 
110   FullFS += FS;
111 
112   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
113 
  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing. Other OSes
  // default to the first amdgcn target.
118   if (Gen == AMDGPUSubtarget::INVALID) {
119      Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
120                                         : AMDGPUSubtarget::SOUTHERN_ISLANDS;
121   }
122 
  // We don't support FP64 for EG/NI at the moment.
124   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
125 
  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
129   assert(hasAddr64() || hasFlat());
  // Unless +flat-for-global or -flat-for-global is explicitly specified, turn
  // on FlatForGlobal for targets that do not support ADDR64 variants of MUBUF
  // instructions. Such targets cannot use a 64-bit offset with a MUBUF
  // instruction to access the global address space.
134   if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
135     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
136     FlatForGlobal = true;
137   }
  // Unless +flat-for-global or -flat-for-global is explicitly specified, use
  // MUBUF instructions for global address space access if flat operations are
  // not available.
140   if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
141     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
142     FlatForGlobal = false;
143   }
144 
145   // Set defaults if needed.
146   if (MaxPrivateElementSize == 0)
147     MaxPrivateElementSize = 4;
148 
149   if (LDSBankCount == 0)
150     LDSBankCount = 32;
151 
152   if (TT.getArch() == Triple::amdgcn) {
153     if (LocalMemorySize == 0)
154       LocalMemorySize = 32768;
155 
    // Do something sensible for an unspecified target.
157     if (!HasMovrel && !HasVGPRIndexMode)
158       HasMovrel = true;
159   }
160 
161   // Don't crash on invalid devices.
162   if (WavefrontSizeLog2 == 0)
163     WavefrontSizeLog2 = 5;
164 
165   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
166 
167   TargetID.setTargetIDFromFeaturesString(FS);
168 
169   LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
170                     << TargetID.getXnackSetting() << '\n');
171   LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
172                     << TargetID.getSramEccSetting() << '\n');
173 
174   return *this;
175 }
176 
177 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
178   TargetTriple(TT),
179   GCN3Encoding(false),
180   Has16BitInsts(false),
181   HasMadMixInsts(false),
182   HasMadMacF32Insts(false),
183   HasDsSrc2Insts(false),
184   HasSDWA(false),
185   HasVOP3PInsts(false),
186   HasMulI24(true),
187   HasMulU24(true),
188   HasInv2PiInlineImm(false),
189   HasFminFmaxLegacy(true),
190   EnablePromoteAlloca(false),
191   HasTrigReducedRange(false),
192   MaxWavesPerEU(10),
193   LocalMemorySize(0),
194   WavefrontSizeLog2(0)
195   { }
196 
197 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
198                            const GCNTargetMachine &TM)
199     : // clang-format off
200     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
201     AMDGPUSubtarget(TT),
202     TargetTriple(TT),
203     TargetID(*this),
204     Gen(INVALID),
205     InstrItins(getInstrItineraryForCPU(GPU)),
206     LDSBankCount(0),
207     MaxPrivateElementSize(0),
208 
209     FastFMAF32(false),
210     FastDenormalF32(false),
211     HalfRate64Ops(false),
212     FullRate64Ops(false),
213 
214     FlatForGlobal(false),
215     AutoWaitcntBeforeBarrier(false),
216     UnalignedScratchAccess(false),
217     UnalignedAccessMode(false),
218 
219     HasApertureRegs(false),
220     SupportsXNACK(false),
221     EnableXNACK(false),
222     EnableTgSplit(false),
223     EnableCuMode(false),
224     TrapHandler(false),
225 
226     EnableLoadStoreOpt(false),
227     EnableUnsafeDSOffsetFolding(false),
228     EnableSIScheduler(false),
229     EnableDS128(false),
230     EnablePRTStrictNull(false),
231     DumpCode(false),
232 
233     FP64(false),
234     CIInsts(false),
235     GFX8Insts(false),
236     GFX9Insts(false),
237     GFX90AInsts(false),
238     GFX10Insts(false),
239     GFX10_3Insts(false),
240     GFX7GFX8GFX9Insts(false),
241     SGPRInitBug(false),
242     NegativeScratchOffsetBug(false),
243     NegativeUnalignedScratchOffsetBug(false),
244     HasSMemRealTime(false),
245     HasIntClamp(false),
246     HasFmaMixInsts(false),
247     HasMovrel(false),
248     HasVGPRIndexMode(false),
249     HasScalarStores(false),
250     HasScalarAtomics(false),
251     HasSDWAOmod(false),
252     HasSDWAScalar(false),
253     HasSDWASdst(false),
254     HasSDWAMac(false),
255     HasSDWAOutModsVOPC(false),
256     HasDPP(false),
257     HasDPP8(false),
258     Has64BitDPP(false),
259     HasPackedFP32Ops(false),
260     HasExtendedImageInsts(false),
261     HasR128A16(false),
262     HasGFX10A16(false),
263     HasG16(false),
264     HasNSAEncoding(false),
265     GFX10_AEncoding(false),
266     GFX10_BEncoding(false),
267     HasDLInsts(false),
268     HasDot1Insts(false),
269     HasDot2Insts(false),
270     HasDot3Insts(false),
271     HasDot4Insts(false),
272     HasDot5Insts(false),
273     HasDot6Insts(false),
274     HasDot7Insts(false),
275     HasMAIInsts(false),
276     HasPkFmacF16Inst(false),
277     HasAtomicFaddInsts(false),
278     SupportsSRAMECC(false),
279     EnableSRAMECC(false),
280     HasNoSdstCMPX(false),
281     HasVscnt(false),
282     HasGetWaveIdInst(false),
283     HasSMemTimeInst(false),
284     HasShaderCyclesRegister(false),
285     HasRegisterBanking(false),
286     HasVOP3Literal(false),
287     HasNoDataDepHazard(false),
288     FlatAddressSpace(false),
289     FlatInstOffsets(false),
290     FlatGlobalInsts(false),
291     FlatScratchInsts(false),
292     ScalarFlatScratchInsts(false),
293     HasArchitectedFlatScratch(false),
294     AddNoCarryInsts(false),
295     HasUnpackedD16VMem(false),
296     LDSMisalignedBug(false),
297     HasMFMAInlineLiteralBug(false),
298     UnalignedBufferAccess(false),
299     UnalignedDSAccess(false),
300     HasPackedTID(false),
301 
302     ScalarizeGlobal(false),
303 
304     HasVcmpxPermlaneHazard(false),
305     HasVMEMtoScalarWriteHazard(false),
306     HasSMEMtoVectorWriteHazard(false),
307     HasInstFwdPrefetchBug(false),
308     HasVcmpxExecWARHazard(false),
309     HasLdsBranchVmemWARHazard(false),
310     HasNSAtoVMEMBug(false),
311     HasNSAClauseBug(false),
312     HasOffset3fBug(false),
313     HasFlatSegmentOffsetBug(false),
314     HasImageStoreD16Bug(false),
315     HasImageGather4D16Bug(false),
316 
317     FeatureDisable(false),
318     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
319     TLInfo(TM, *this),
320     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
321   // clang-format on
322   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
323   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
324   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
325   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
326   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
327   InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
329 }
330 
331 bool GCNSubtarget::enableFlatScratch() const {
332   return flatScratchIsArchitected() ||
333          (EnableFlatScratch && hasFlatScratchInsts());
334 }
335 
336 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
337   if (getGeneration() < GFX10)
338     return 1;
339 
340   switch (Opcode) {
341   case AMDGPU::V_LSHLREV_B64_e64:
342   case AMDGPU::V_LSHLREV_B64_gfx10:
343   case AMDGPU::V_LSHL_B64_e64:
344   case AMDGPU::V_LSHRREV_B64_e64:
345   case AMDGPU::V_LSHRREV_B64_gfx10:
346   case AMDGPU::V_LSHR_B64_e64:
347   case AMDGPU::V_ASHRREV_I64_e64:
348   case AMDGPU::V_ASHRREV_I64_gfx10:
349   case AMDGPU::V_ASHR_I64_e64:
350     return 1;
351   }
352 
353   return 2;
354 }
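
// Illustrative note grounded in the limits above: on GFX10 most VALU opcodes
// get a constant bus limit of 2, so e.g. "v_add_nc_u32_e64 v0, s0, s1" may
// read two scalar operands, while the 64-bit shifts listed above may use at
// most one SGPR or literal operand.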
355 
356 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
357   const Function &F) const {
358   if (NWaves == 1)
359     return getLocalMemorySize();
360   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
361   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
362   if (!WorkGroupsPerCu)
363     return 0;
364   unsigned MaxWaves = getMaxWavesPerEU();
365   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
366 }
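
// Worked example with hypothetical numbers: with 65536 bytes of LDS,
// MaxWaves = 10, WorkGroupsPerCu = 8 and NWaves = 5, the function returns
// 65536 * 10 / 8 / 5 = 16384 bytes.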
367 
368 // FIXME: Should return min,max range.
369 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
370   const Function &F) const {
371   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
372   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
373   if (!MaxWorkGroupsPerCu)
374     return 0;
375 
376   const unsigned WaveSize = getWavefrontSize();
377 
  // FIXME: Do we need to account for the alignment requirement of LDS when
  // rounding the size up?
  // Compute the restriction based on LDS usage.
381   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
382 
383   // This can be queried with more LDS than is possible, so just assume the
384   // worst.
385   if (NumGroups == 0)
386     return 1;
387 
388   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
389 
390   // Round to the number of waves.
391   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
392   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
393 
394   // Clamp to the maximum possible number of waves.
395   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
396 
397   // FIXME: Needs to be a multiple of the group size?
398   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
399 
400   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
401          "computed invalid occupancy");
402   return MaxWaves;
403 }
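
// Worked example with hypothetical numbers: with 65536 bytes of LDS,
// Bytes = 20000, MaxWorkGroupSize = 256 and a wave size of 64:
// NumGroups = 65536 / 20000 = 3 and MaxGroupNumWaves = (256 + 63) / 64 = 4,
// so MaxWaves = 12 before clamping to getMaxWavesPerEU().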
404 
405 unsigned
406 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
407   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
408   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
409 }
410 
411 std::pair<unsigned, unsigned>
412 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
413   switch (CC) {
414   case CallingConv::AMDGPU_VS:
415   case CallingConv::AMDGPU_LS:
416   case CallingConv::AMDGPU_HS:
417   case CallingConv::AMDGPU_ES:
418   case CallingConv::AMDGPU_GS:
419   case CallingConv::AMDGPU_PS:
420     return std::make_pair(1, getWavefrontSize());
421   default:
422     return std::make_pair(1u, getMaxFlatWorkGroupSize());
423   }
424 }
425 
426 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
427   const Function &F) const {
428   // Default minimum/maximum flat work group sizes.
429   std::pair<unsigned, unsigned> Default =
430     getDefaultFlatWorkGroupSize(F.getCallingConv());
431 
432   // Requested minimum/maximum flat work group sizes.
433   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
434     F, "amdgpu-flat-work-group-size", Default);
435 
  // Make sure the requested minimum does not exceed the requested maximum.
437   if (Requested.first > Requested.second)
438     return Default;
439 
440   // Make sure requested values do not violate subtarget's specifications.
441   if (Requested.first < getMinFlatWorkGroupSize())
442     return Default;
443   if (Requested.second > getMaxFlatWorkGroupSize())
444     return Default;
445 
446   return Requested;
447 }
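
// For reference, the attribute consumed above is written in IR as, e.g.:
//   attributes #0 = { "amdgpu-flat-work-group-size"="128,256" }
// (illustrative values). A request with min > max, or one outside the
// subtarget's limits, falls back to the default range.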
448 
449 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
450   const Function &F) const {
451   // Default minimum/maximum number of waves per execution unit.
452   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
453 
454   // Default/requested minimum/maximum flat work group sizes.
455   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
456 
457   // If minimum/maximum flat work group sizes were explicitly requested using
458   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
459   // number of waves per execution unit to values implied by requested
460   // minimum/maximum flat work group sizes.
461   unsigned MinImpliedByFlatWorkGroupSize =
462     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
463   Default.first = MinImpliedByFlatWorkGroupSize;
464   bool RequestedFlatWorkGroupSize =
465       F.hasFnAttribute("amdgpu-flat-work-group-size");
466 
467   // Requested minimum/maximum number of waves per execution unit.
468   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
469     F, "amdgpu-waves-per-eu", Default, true);
470 
  // Make sure the requested minimum does not exceed the requested maximum.
472   if (Requested.second && Requested.first > Requested.second)
473     return Default;
474 
475   // Make sure requested values do not violate subtarget's specifications.
476   if (Requested.first < getMinWavesPerEU() ||
477       Requested.second > getMaxWavesPerEU())
478     return Default;
479 
480   // Make sure requested values are compatible with values implied by requested
481   // minimum/maximum flat work group sizes.
482   if (RequestedFlatWorkGroupSize &&
483       Requested.first < MinImpliedByFlatWorkGroupSize)
484     return Default;
485 
486   return Requested;
487 }
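
// For reference, an explicit request looks like (illustrative values):
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4" }
// The second value may be omitted. Requests that contradict the subtarget
// limits or the bounds implied by the flat work group size fall back to the
// default.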
488 
489 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
490   auto Node = Kernel.getMetadata("reqd_work_group_size");
491   if (Node && Node->getNumOperands() == 3)
492     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
493   return std::numeric_limits<unsigned>::max();
494 }
495 
496 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
497   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
498 }
499 
500 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
501                                            unsigned Dimension) const {
502   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
503   if (ReqdSize != std::numeric_limits<unsigned>::max())
504     return ReqdSize - 1;
505   return getFlatWorkGroupSizes(Kernel).second - 1;
506 }
507 
508 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
509   Function *Kernel = I->getParent()->getParent();
510   unsigned MinSize = 0;
511   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
512   bool IdQuery = false;
513 
  // If reqd_work_group_size is present, it narrows the known range down.
515   if (auto *CI = dyn_cast<CallInst>(I)) {
516     const Function *F = CI->getCalledFunction();
517     if (F) {
518       unsigned Dim = UINT_MAX;
519       switch (F->getIntrinsicID()) {
520       case Intrinsic::amdgcn_workitem_id_x:
521       case Intrinsic::r600_read_tidig_x:
522         IdQuery = true;
523         LLVM_FALLTHROUGH;
524       case Intrinsic::r600_read_local_size_x:
525         Dim = 0;
526         break;
527       case Intrinsic::amdgcn_workitem_id_y:
528       case Intrinsic::r600_read_tidig_y:
529         IdQuery = true;
530         LLVM_FALLTHROUGH;
531       case Intrinsic::r600_read_local_size_y:
532         Dim = 1;
533         break;
534       case Intrinsic::amdgcn_workitem_id_z:
535       case Intrinsic::r600_read_tidig_z:
536         IdQuery = true;
537         LLVM_FALLTHROUGH;
538       case Intrinsic::r600_read_local_size_z:
539         Dim = 2;
540         break;
541       default:
542         break;
543       }
544 
545       if (Dim <= 3) {
546         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
547         if (ReqdSize != std::numeric_limits<unsigned>::max())
548           MinSize = MaxSize = ReqdSize;
549       }
550     }
551   }
552 
553   if (!MaxSize)
554     return false;
555 
556   // Range metadata is [Lo, Hi). For ID query we need to pass max size
557   // as Hi. For size query we need to pass Hi + 1.
558   if (IdQuery)
559     MinSize = 0;
560   else
561     ++MaxSize;
562 
563   MDBuilder MDB(I->getContext());
564   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
565                                                   APInt(32, MaxSize));
566   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
567   return true;
568 }
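
// Illustrative result, assuming a kernel with reqd_work_group_size = {64,1,1}:
//   %id = call i32 @llvm.amdgcn.workitem.id.x(), !range !0
//   !0 = !{i32 0, i32 64}   ; ID query: [0, MaxSize)
// A size query (e.g. llvm.r600.read.local.size.x) would instead get
// !{i32 64, i32 65}, i.e. [ReqdSize, ReqdSize + 1).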
569 
570 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
571   if (isMesaKernel(F))
572     return 16;
573   return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
574 }
575 
576 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
577                                                  Align &MaxAlign) const {
578   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
579          F.getCallingConv() == CallingConv::SPIR_KERNEL);
580 
581   const DataLayout &DL = F.getParent()->getDataLayout();
582   uint64_t ExplicitArgBytes = 0;
583   MaxAlign = Align(1);
584 
585   for (const Argument &Arg : F.args()) {
586     const bool IsByRef = Arg.hasByRefAttr();
587     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
588     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
589     if (!Alignment)
590       Alignment = DL.getABITypeAlign(ArgTy);
591 
592     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
593     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
594     MaxAlign = max(MaxAlign, Alignment);
595   }
596 
597   return ExplicitArgBytes;
598 }
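
// Worked example for a hypothetical signature kernel(float %a, <2 x i32> %b):
// %a occupies bytes [0, 4) and %b is aligned to 8 and occupies [8, 16), so the
// function returns ExplicitArgBytes = 16 with MaxAlign = 8.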
599 
600 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
601                                                 Align &MaxAlign) const {
602   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
603 
604   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
605 
606   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
607   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
608   if (ImplicitBytes != 0) {
609     const Align Alignment = getAlignmentForImplicitArgPtr();
610     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
611   }
612 
613   // Being able to dereference past the end is useful for emitting scalar loads.
614   return alignTo(TotalSize, 4);
615 }
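
// Continuing the hypothetical example above (ExplicitArgBytes = 16): on amdhsa
// the explicit offset is 0, so with no implicit arguments the segment size is
// alignTo(0 + 16, 4) = 16. A Mesa kernel adds 16 implicit bytes after aligning
// to getAlignmentForImplicitArgPtr() (assumed to be 4 here), giving
// alignTo(16, 4) + 16 = 32.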
616 
617 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
618   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
619                                   : AMDGPUDwarfFlavour::Wave64;
620 }
621 
622 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
623                              const TargetMachine &TM) :
624   R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
625   AMDGPUSubtarget(TT),
626   InstrInfo(*this),
627   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
628   FMA(false),
629   CaymanISA(false),
630   CFALUBug(false),
631   HasVertexCache(false),
632   R600ALUInst(false),
633   FP64(false),
634   TexVTXClauseSize(0),
635   Gen(R600),
636   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
637   InstrItins(getInstrItineraryForCPU(GPU)) { }
638 
639 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
640                                       unsigned NumRegionInstrs) const {
641   // Track register pressure so the scheduler can try to decrease
642   // pressure once register usage is above the threshold defined by
643   // SIRegisterInfo::getRegPressureSetLimit()
644   Policy.ShouldTrackPressure = true;
645 
  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
648   Policy.OnlyTopDown = false;
649   Policy.OnlyBottomUp = false;
650 
651   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
652   if (!enableSIScheduler())
653     Policy.ShouldTrackLaneMasks = true;
654 }
655 
656 bool GCNSubtarget::hasMadF16() const {
657   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
658 }
659 
660 bool GCNSubtarget::useVGPRIndexMode() const {
661   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
662 }
663 
664 bool GCNSubtarget::useAA() const { return UseAA; }
665 
666 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
667   if (getGeneration() >= AMDGPUSubtarget::GFX10)
668     return getMaxWavesPerEU();
669 
670   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
671     if (SGPRs <= 80)
672       return 10;
673     if (SGPRs <= 88)
674       return 9;
675     if (SGPRs <= 100)
676       return 8;
677     return 7;
678   }
679   if (SGPRs <= 48)
680     return 10;
681   if (SGPRs <= 56)
682     return 9;
683   if (SGPRs <= 64)
684     return 8;
685   if (SGPRs <= 72)
686     return 7;
687   if (SGPRs <= 80)
688     return 6;
689   return 5;
690 }
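
// For example (hypothetical counts): on a Volcanic Islands subtarget a
// function using 96 SGPRs gets an occupancy of 8 (96 <= 100), while the same
// count on an earlier generation falls through to the final return of 5.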
691 
692 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
693   unsigned MaxWaves = getMaxWavesPerEU();
694   unsigned Granule = getVGPRAllocGranule();
695   if (VGPRs < Granule)
696     return MaxWaves;
697   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
698   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
699 }
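
// For example, with a hypothetical granule of 4 and 256 total VGPRs, a
// function using 70 VGPRs is rounded up to 72, giving 256 / 72 = 3 waves;
// usage below one granule always yields the maximum wave count.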
700 
701 unsigned
702 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
703   if (getGeneration() >= AMDGPUSubtarget::GFX10)
704     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
705 
706   if (HasFlatScratchInit) {
707     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
708       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
709     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
710       return 4; // FLAT_SCRATCH, VCC (in that order).
711   }
712 
713   if (isXNACKEnabled())
714     return 4; // XNACK, VCC (in that order).
715   return 2; // VCC.
716 }
717 
718 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
719   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
720   return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
721 }
722 
723 unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect whether the function has flat scratch init is the same
  // as the one SIMachineFunctionInfo uses to derive it.
726   bool FunctionHasFlatScratchInit = false;
727   bool HasCalls = F.hasFnAttribute("amdgpu-calls");
728   bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
729   if (hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(F.getCallingConv()) &&
730       (isAmdHsaOrMesa(F) || enableFlatScratch()) &&
731       !flatScratchIsArchitected()) {
732     if (HasCalls || HasStackObjects || enableFlatScratch())
733       FunctionHasFlatScratchInit = true;
734   }
735   return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
736 }
737 
738 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
739                                         unsigned NumSGPRs,
740                                         unsigned NumVGPRs) const {
741   unsigned Occupancy =
742     std::min(getMaxWavesPerEU(),
743              getOccupancyWithLocalMemSize(LDSSize, F));
744   if (NumSGPRs)
745     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
746   if (NumVGPRs)
747     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
748   return Occupancy;
749 }
750 
751 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
752     const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
753     unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
756   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
757   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
758 
759   // Check if maximum number of SGPRs was explicitly requested using
760   // "amdgpu-num-sgpr" attribute.
761   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
762     unsigned Requested = AMDGPU::getIntegerAttribute(
763       F, "amdgpu-num-sgpr", MaxNumSGPRs);
764 
765     // Make sure requested value does not violate subtarget's specifications.
766     if (Requested && (Requested <= ReservedNumSGPRs))
767       Requested = 0;
768 
769     // If more SGPRs are required to support the input user/system SGPRs,
770     // increase to accommodate them.
771     //
772     // FIXME: This really ends up using the requested number of SGPRs + number
773     // of reserved special registers in total. Theoretically you could re-use
774     // the last input registers for these special registers, but this would
775     // require a lot of complexity to deal with the weird aliasing.
776     unsigned InputNumSGPRs = PreloadedSGPRs;
777     if (Requested && Requested < InputNumSGPRs)
778       Requested = InputNumSGPRs;
779 
780     // Make sure requested value is compatible with values implied by
781     // default/requested minimum/maximum number of waves per execution unit.
782     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
783       Requested = 0;
784     if (WavesPerEU.second &&
785         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
786       Requested = 0;
787 
788     if (Requested)
789       MaxNumSGPRs = Requested;
790   }
791 
792   if (hasSGPRInitBug())
793     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
794 
795   return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
796 }
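
// For reference, the explicit request consumed above is written in IR as,
// e.g.:
//   attributes #0 = { "amdgpu-num-sgpr"="64" }
// (illustrative value). Requests that do not leave room for the reserved
// SGPRs, or that conflict with the waves-per-EU bounds, are ignored.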
797 
798 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
799   const Function &F = MF.getFunction();
800   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
801   return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
802                             getReservedNumSGPRs(MF));
803 }
804 
805 static unsigned getMaxNumPreloadedSGPRs() {
806   // Max number of user SGPRs
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // dispatch ptr
                          2 + // queue ptr
                          2 + // kernarg segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // implicit buffer ptr
814   // Max number of system SGPRs
815   unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
816                             1 + // WorkGroupIDY
817                             1 + // WorkGroupIDZ
818                             1 + // WorkGroupInfo
819                             1;  // private segment wave byte offset
820   return MaxUserSGPRs + MaxSystemSGPRs;
821 }
822 
823 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
824   return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
825                             getReservedNumSGPRs(F));
826 }
827 
828 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
829     const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
832   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
833 
834   // Check if maximum number of VGPRs was explicitly requested using
835   // "amdgpu-num-vgpr" attribute.
836   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
837     unsigned Requested = AMDGPU::getIntegerAttribute(
838       F, "amdgpu-num-vgpr", MaxNumVGPRs);
839 
840     if (hasGFX90AInsts())
841       Requested *= 2;
842 
843     // Make sure requested value is compatible with values implied by
844     // default/requested minimum/maximum number of waves per execution unit.
845     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
846       Requested = 0;
847     if (WavesPerEU.second &&
848         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
849       Requested = 0;
850 
851     if (Requested)
852       MaxNumVGPRs = Requested;
853   }
854 
855   return MaxNumVGPRs;
856 }
857 
858 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
859   return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
860 }
861 
862 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
863   const Function &F = MF.getFunction();
864   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
865   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
866 }
867 
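// When one end of a data dependence is a bundle, approximate the latency by
// walking the bundled instructions: for a bundled def, latency is taken from
// the bundled instruction that writes the register and decremented for each
// following instruction in the bundle; for a bundled use, the def's latency is
// decremented until the first bundled reader of the register is reached.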
868 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
869                                          int UseOpIdx, SDep &Dep) const {
870   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
871       !Def->isInstr() || !Use->isInstr())
872     return;
873 
874   MachineInstr *DefI = Def->getInstr();
875   MachineInstr *UseI = Use->getInstr();
876 
877   if (DefI->isBundle()) {
878     const SIRegisterInfo *TRI = getRegisterInfo();
879     auto Reg = Dep.getReg();
880     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
881     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
882     unsigned Lat = 0;
883     for (++I; I != E && I->isBundledWithPred(); ++I) {
884       if (I->modifiesRegister(Reg, TRI))
885         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
886       else if (Lat)
887         --Lat;
888     }
889     Dep.setLatency(Lat);
890   } else if (UseI->isBundle()) {
891     const SIRegisterInfo *TRI = getRegisterInfo();
892     auto Reg = Dep.getReg();
893     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
894     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
895     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
896     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
897       if (I->readsRegister(Reg, TRI))
898         break;
899       --Lat;
900     }
901     Dep.setLatency(Lat);
902   }
903 }
904 
905 namespace {
906 struct FillMFMAShadowMutation : ScheduleDAGMutation {
907   const SIInstrInfo *TII;
908 
909   ScheduleDAGMI *DAG;
910 
911   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
912 
913   bool isSALU(const SUnit *SU) const {
914     const MachineInstr *MI = SU->getInstr();
915     return MI && TII->isSALU(*MI) && !MI->isTerminator();
916   }
917 
918   bool isVALU(const SUnit *SU) const {
919     const MachineInstr *MI = SU->getInstr();
920     return MI && TII->isVALU(*MI);
921   }
922 
923   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
924     if (Pred->NodeNum < Succ->NodeNum)
925       return true;
926 
927     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
928 
929     for (unsigned I = 0; I < Succs.size(); ++I) {
930       for (const SDep &SI : Succs[I]->Succs) {
931         const SUnit *SU = SI.getSUnit();
932         if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
933           Succs.push_back(SU);
934       }
935     }
936 
937     SmallPtrSet<const SUnit*, 32> Visited;
938     while (!Preds.empty()) {
939       const SUnit *SU = Preds.pop_back_val();
940       if (llvm::is_contained(Succs, SU))
941         return false;
942       Visited.insert(SU);
943       for (const SDep &SI : SU->Preds)
944         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
945           Preds.push_back(SI.getSUnit());
946     }
947 
948     return true;
949   }
950 
  // Link as many SALU instructions in a chain as possible. Return the size of
  // the chain. Links up to MaxChain instructions.
953   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
954                          SmallPtrSetImpl<SUnit *> &Visited) const {
955     SmallVector<SUnit *, 8> Worklist({To});
956     unsigned Linked = 0;
957 
958     while (!Worklist.empty() && MaxChain-- > 0) {
959       SUnit *SU = Worklist.pop_back_val();
960       if (!Visited.insert(SU).second)
961         continue;
962 
      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
964                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
965 
966       if (SU->addPred(SDep(From, SDep::Artificial), false))
967         ++Linked;
968 
969       for (SDep &SI : From->Succs) {
970         SUnit *SUv = SI.getSUnit();
971         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
972           SUv->addPred(SDep(SU, SDep::Artificial), false);
973       }
974 
975       for (SDep &SI : SU->Succs) {
976         SUnit *Succ = SI.getSUnit();
977         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
978           Worklist.push_back(Succ);
979       }
980     }
981 
982     return Linked;
983   }
984 
985   void apply(ScheduleDAGInstrs *DAGInstrs) override {
986     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
987     if (!ST.hasMAIInsts() || DisablePowerSched)
988       return;
989     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
990     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
991     if (!TSchedModel || DAG->SUnits.empty())
992       return;
993 
    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that they can fill the MFMA shadow. It is
    // preferable to fill the shadow with SALU rather than VALU instructions to
    // prevent power consumption bursts and throttling.
998     auto LastSALU = DAG->SUnits.begin();
999     auto E = DAG->SUnits.end();
1000     SmallPtrSet<SUnit*, 32> Visited;
1001     for (SUnit &SU : DAG->SUnits) {
1002       MachineInstr &MAI = *SU.getInstr();
1003       if (!TII->isMAI(MAI) ||
1004            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
1005            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
1006         continue;
1007 
1008       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
1009 
1010       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
1011                  dbgs() << "Need " << Lat
1012                         << " instructions to cover latency.\n");
1013 
1014       // Find up to Lat independent scalar instructions as early as
1015       // possible such that they can be scheduled after this MFMA.
1016       for ( ; Lat && LastSALU != E; ++LastSALU) {
1017         if (Visited.count(&*LastSALU))
1018           continue;
1019 
1020         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
1021           continue;
1022 
1023         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
1024       }
1025     }
1026   }
1027 };
1028 } // namespace
1029 
1030 void GCNSubtarget::getPostRAMutations(
1031     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1032   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1033 }
1034 
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(
      MF.getSubtarget<R600Subtarget>());
}
1041 
const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}
1048