1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Implements the AMDGPU-specific subclass of TargetSubtargetInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUCallLowering.h"
17 #include "AMDGPUInstructionSelector.h"
18 #include "AMDGPULegalizerInfo.h"
19 #include "AMDGPURegisterBankInfo.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32 
33 using namespace llvm;
34 
35 #define DEBUG_TYPE "amdgpu-subtarget"
36 
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #define GET_SUBTARGETINFO_TARGET_DESC
42 #define GET_SUBTARGETINFO_CTOR
43 #undef AMDGPUSubtarget
44 #include "R600GenSubtargetInfo.inc"
45 
46 static cl::opt<bool> DisablePowerSched(
47   "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
49   cl::init(false));
50 
51 static cl::opt<bool> EnableVGPRIndexMode(
52   "amdgpu-vgpr-index-mode",
53   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
54   cl::init(false));
55 
56 static cl::opt<bool> EnableFlatScratch(
57   "amdgpu-enable-flat-scratch",
58   cl::desc("Use flat scratch instructions"),
59   cl::init(false));
60 
61 static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
62                            cl::desc("Enable the use of AA during codegen."),
63                            cl::init(true));
64 
65 GCNSubtarget::~GCNSubtarget() = default;
66 
67 R600Subtarget &
68 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
69                                                StringRef GPU, StringRef FS) {
70   SmallString<256> FullFS("+promote-alloca,");
71   FullFS += FS;
72   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
73 
74   HasMulU24 = getGeneration() >= EVERGREEN;
75   HasMulI24 = hasCaymanISA();
76 
77   return *this;
78 }
79 
80 GCNSubtarget &
81 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
82                                               StringRef GPU, StringRef FS) {
83   // Determine default and user-specified characteristics
84   //
85   // We want to be able to turn these off, but making this a subtarget feature
86   // for SI has the unhelpful behavior that it unsets everything else if you
87   // disable it.
88   //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.
91 
92   SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");
93 
  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
95   if (isAmdHsaOS())
96     FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
97 
98   FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
99 
100   // Disable mutually exclusive bits.
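  // If the feature string explicitly requests a wavefront size, clear the
  // other size features here so a later target default cannot conflict with
  // the request.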
101   if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
102     if (FS.find_lower("wavefrontsize16") == StringRef::npos)
103       FullFS += "-wavefrontsize16,";
104     if (FS.find_lower("wavefrontsize32") == StringRef::npos)
105       FullFS += "-wavefrontsize32,";
106     if (FS.find_lower("wavefrontsize64") == StringRef::npos)
107       FullFS += "-wavefrontsize64,";
108   }
109 
110   FullFS += FS;
111 
112   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
113 
  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
118   if (Gen == AMDGPUSubtarget::INVALID) {
119      Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
120                                         : AMDGPUSubtarget::SOUTHERN_ISLANDS;
121   }
122 
  // We don't support FP64 for EG/NI at the moment.
124   assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
125 
  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
129   assert(hasAddr64() || hasFlat());
130   // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
131   // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
134   if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
135     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
136     FlatForGlobal = true;
137   }
138   // Unless +-flat-for-global is specified, use MUBUF instructions for global
139   // address space access if flat operations are not available.
140   if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
141     ToggleFeature(AMDGPU::FeatureFlatForGlobal);
142     FlatForGlobal = false;
143   }
144 
145   // Set defaults if needed.
146   if (MaxPrivateElementSize == 0)
147     MaxPrivateElementSize = 4;
148 
149   if (LDSBankCount == 0)
150     LDSBankCount = 32;
151 
152   if (TT.getArch() == Triple::amdgcn) {
153     if (LocalMemorySize == 0)
154       LocalMemorySize = 32768;
155 
    // Do something sensible for an unspecified target.
157     if (!HasMovrel && !HasVGPRIndexMode)
158       HasMovrel = true;
159   }
160 
  // Don't crash on invalid devices; default to wave64.
162   if (WavefrontSizeLog2 == 0)
163     WavefrontSizeLog2 = 5;
164 
165   HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
166 
167   TargetID.setTargetIDFromFeaturesString(FS);
168 
169   LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
170                     << TargetID.getXnackSetting() << '\n');
171   LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
172                     << TargetID.getSramEccSetting() << '\n');
173 
174   return *this;
175 }
176 
177 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
178   TargetTriple(TT),
179   GCN3Encoding(false),
180   Has16BitInsts(false),
181   HasMadMixInsts(false),
182   HasMadMacF32Insts(false),
183   HasDsSrc2Insts(false),
184   HasSDWA(false),
185   HasVOP3PInsts(false),
186   HasMulI24(true),
187   HasMulU24(true),
188   HasInv2PiInlineImm(false),
189   HasFminFmaxLegacy(true),
190   EnablePromoteAlloca(false),
191   HasTrigReducedRange(false),
192   MaxWavesPerEU(10),
193   LocalMemorySize(0),
194   WavefrontSizeLog2(0)
195   { }
196 
197 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
198                            const GCNTargetMachine &TM)
199     : // clang-format off
200     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
201     AMDGPUSubtarget(TT),
202     TargetTriple(TT),
203     TargetID(*this),
204     Gen(INVALID),
205     InstrItins(getInstrItineraryForCPU(GPU)),
206     LDSBankCount(0),
207     MaxPrivateElementSize(0),
208 
209     FastFMAF32(false),
210     FastDenormalF32(false),
211     HalfRate64Ops(false),
212     FullRate64Ops(false),
213 
214     FlatForGlobal(false),
215     AutoWaitcntBeforeBarrier(false),
216     UnalignedScratchAccess(false),
217     UnalignedAccessMode(false),
218 
219     HasApertureRegs(false),
220     SupportsXNACK(false),
221     EnableXNACK(false),
222     EnableTgSplit(false),
223     EnableCuMode(false),
224     TrapHandler(false),
225 
226     EnableLoadStoreOpt(false),
227     EnableUnsafeDSOffsetFolding(false),
228     EnableSIScheduler(false),
229     EnableDS128(false),
230     EnablePRTStrictNull(false),
231     DumpCode(false),
232 
233     FP64(false),
234     CIInsts(false),
235     GFX8Insts(false),
236     GFX9Insts(false),
237     GFX90AInsts(false),
238     GFX10Insts(false),
239     GFX10_3Insts(false),
240     GFX7GFX8GFX9Insts(false),
241     SGPRInitBug(false),
242     NegativeScratchOffsetBug(false),
243     NegativeUnalignedScratchOffsetBug(false),
244     HasSMemRealTime(false),
245     HasIntClamp(false),
246     HasFmaMixInsts(false),
247     HasMovrel(false),
248     HasVGPRIndexMode(false),
249     HasScalarStores(false),
250     HasScalarAtomics(false),
251     HasSDWAOmod(false),
252     HasSDWAScalar(false),
253     HasSDWASdst(false),
254     HasSDWAMac(false),
255     HasSDWAOutModsVOPC(false),
256     HasDPP(false),
257     HasDPP8(false),
258     Has64BitDPP(false),
259     HasPackedFP32Ops(false),
260     HasExtendedImageInsts(false),
261     HasR128A16(false),
262     HasGFX10A16(false),
263     HasG16(false),
264     HasNSAEncoding(false),
265     GFX10_AEncoding(false),
266     GFX10_BEncoding(false),
267     HasDLInsts(false),
268     HasDot1Insts(false),
269     HasDot2Insts(false),
270     HasDot3Insts(false),
271     HasDot4Insts(false),
272     HasDot5Insts(false),
273     HasDot6Insts(false),
274     HasDot7Insts(false),
275     HasMAIInsts(false),
276     HasPkFmacF16Inst(false),
277     HasAtomicFaddInsts(false),
278     SupportsSRAMECC(false),
279     EnableSRAMECC(false),
280     HasNoSdstCMPX(false),
281     HasVscnt(false),
282     HasGetWaveIdInst(false),
283     HasSMemTimeInst(false),
284     HasShaderCyclesRegister(false),
285     HasRegisterBanking(false),
286     HasVOP3Literal(false),
287     HasNoDataDepHazard(false),
288     FlatAddressSpace(false),
289     FlatInstOffsets(false),
290     FlatGlobalInsts(false),
291     FlatScratchInsts(false),
292     ScalarFlatScratchInsts(false),
293     HasArchitectedFlatScratch(false),
294     AddNoCarryInsts(false),
295     HasUnpackedD16VMem(false),
296     LDSMisalignedBug(false),
297     HasMFMAInlineLiteralBug(false),
298     UnalignedBufferAccess(false),
299     UnalignedDSAccess(false),
300     HasPackedTID(false),
301 
302     ScalarizeGlobal(false),
303 
304     HasVcmpxPermlaneHazard(false),
305     HasVMEMtoScalarWriteHazard(false),
306     HasSMEMtoVectorWriteHazard(false),
307     HasInstFwdPrefetchBug(false),
308     HasVcmpxExecWARHazard(false),
309     HasLdsBranchVmemWARHazard(false),
310     HasNSAtoVMEMBug(false),
311     HasNSAClauseBug(false),
312     HasOffset3fBug(false),
313     HasFlatSegmentOffsetBug(false),
314     HasImageStoreD16Bug(false),
315     HasImageGather4D16Bug(false),
316 
317     FeatureDisable(false),
318     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
319     TLInfo(TM, *this),
320     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
321   // clang-format on
322   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
323   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
324   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
325   Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
326   RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
327   InstSelector.reset(new AMDGPUInstructionSelector(
328   *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
329 }
330 
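// Use flat scratch either when the target architects it or when the user has
// opted in and the subtarget supports the flat scratch instructions.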
331 bool GCNSubtarget::enableFlatScratch() const {
332   return flatScratchIsArchitected() ||
333          (EnableFlatScratch && hasFlatScratchInsts());
334 }
335 
336 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
337   if (getGeneration() < GFX10)
338     return 1;
339 
340   switch (Opcode) {
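  // These 64-bit shift operations are still limited to a single constant bus
  // (scalar or literal) operand, even though most other GFX10 VALU
  // instructions may use two.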
341   case AMDGPU::V_LSHLREV_B64_e64:
342   case AMDGPU::V_LSHLREV_B64_gfx10:
343   case AMDGPU::V_LSHL_B64_e64:
344   case AMDGPU::V_LSHRREV_B64_e64:
345   case AMDGPU::V_LSHRREV_B64_gfx10:
346   case AMDGPU::V_LSHR_B64_e64:
347   case AMDGPU::V_ASHRREV_I64_e64:
348   case AMDGPU::V_ASHRREV_I64_gfx10:
349   case AMDGPU::V_ASHR_I64_e64:
350     return 1;
351   }
352 
353   return 2;
354 }
355 
356 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
357   const Function &F) const {
358   if (NWaves == 1)
359     return getLocalMemorySize();
360   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
361   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
362   if (!WorkGroupsPerCu)
363     return 0;
364   unsigned MaxWaves = getMaxWavesPerEU();
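  // The CU's LDS is shared by the resident workgroups; to sustain NWaves waves
  // per EU it must be split across roughly WorkGroupsPerCu * NWaves / MaxWaves
  // groups, which gives the per-workgroup budget below.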
365   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
366 }
367 
368 // FIXME: Should return min,max range.
369 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
370   const Function &F) const {
371   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
372   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
373   if (!MaxWorkGroupsPerCu)
374     return 0;
375 
376   const unsigned WaveSize = getWavefrontSize();
377 
  // FIXME: Do we need to account for the alignment requirement of LDS by
  // rounding the size up?
  // Compute the restriction based on LDS usage.
381   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
382 
383   // This can be queried with more LDS than is possible, so just assume the
384   // worst.
385   if (NumGroups == 0)
386     return 1;
387 
388   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
389 
390   // Round to the number of waves.
391   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
392   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
393 
394   // Clamp to the maximum possible number of waves.
395   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
396 
397   // FIXME: Needs to be a multiple of the group size?
398   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
399 
400   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
401          "computed invalid occupancy");
402   return MaxWaves;
403 }
404 
405 unsigned
406 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
407   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
408   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
409 }
410 
411 std::pair<unsigned, unsigned>
412 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
413   switch (CC) {
414   case CallingConv::AMDGPU_VS:
415   case CallingConv::AMDGPU_LS:
416   case CallingConv::AMDGPU_HS:
417   case CallingConv::AMDGPU_ES:
418   case CallingConv::AMDGPU_GS:
419   case CallingConv::AMDGPU_PS:
420     return std::make_pair(1, getWavefrontSize());
421   default:
422     return std::make_pair(1u, getMaxFlatWorkGroupSize());
423   }
424 }
425 
426 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
427   const Function &F) const {
428   // Default minimum/maximum flat work group sizes.
429   std::pair<unsigned, unsigned> Default =
430     getDefaultFlatWorkGroupSize(F.getCallingConv());
431 
432   // Requested minimum/maximum flat work group sizes.
433   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
434     F, "amdgpu-flat-work-group-size", Default);
435 
  // Make sure the requested minimum does not exceed the requested maximum.
437   if (Requested.first > Requested.second)
438     return Default;
439 
440   // Make sure requested values do not violate subtarget's specifications.
441   if (Requested.first < getMinFlatWorkGroupSize())
442     return Default;
443   if (Requested.second > getMaxFlatWorkGroupSize())
444     return Default;
445 
446   return Requested;
447 }
448 
449 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
450   const Function &F) const {
451   // Default minimum/maximum number of waves per execution unit.
452   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
453 
454   // Default/requested minimum/maximum flat work group sizes.
455   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
456 
457   // If minimum/maximum flat work group sizes were explicitly requested using
458   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
459   // number of waves per execution unit to values implied by requested
460   // minimum/maximum flat work group sizes.
461   unsigned MinImpliedByFlatWorkGroupSize =
462     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
463   Default.first = MinImpliedByFlatWorkGroupSize;
464   bool RequestedFlatWorkGroupSize =
465       F.hasFnAttribute("amdgpu-flat-work-group-size");
466 
467   // Requested minimum/maximum number of waves per execution unit.
468   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
469     F, "amdgpu-waves-per-eu", Default, true);
470 
  // Make sure the requested minimum does not exceed the requested maximum.
472   if (Requested.second && Requested.first > Requested.second)
473     return Default;
474 
475   // Make sure requested values do not violate subtarget's specifications.
476   if (Requested.first < getMinWavesPerEU() ||
477       Requested.second > getMaxWavesPerEU())
478     return Default;
479 
480   // Make sure requested values are compatible with values implied by requested
481   // minimum/maximum flat work group sizes.
482   if (RequestedFlatWorkGroupSize &&
483       Requested.first < MinImpliedByFlatWorkGroupSize)
484     return Default;
485 
486   return Requested;
487 }
488 
489 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
490   auto Node = Kernel.getMetadata("reqd_work_group_size");
491   if (Node && Node->getNumOperands() == 3)
492     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
493   return std::numeric_limits<unsigned>::max();
494 }
495 
496 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
497   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
498 }
499 
500 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
501                                            unsigned Dimension) const {
502   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
503   if (ReqdSize != std::numeric_limits<unsigned>::max())
504     return ReqdSize - 1;
505   return getFlatWorkGroupSizes(Kernel).second - 1;
506 }
507 
508 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
509   Function *Kernel = I->getParent()->getParent();
510   unsigned MinSize = 0;
511   unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
512   bool IdQuery = false;
513 
  // If reqd_work_group_size is present, it narrows the value down.
515   if (auto *CI = dyn_cast<CallInst>(I)) {
516     const Function *F = CI->getCalledFunction();
517     if (F) {
518       unsigned Dim = UINT_MAX;
519       switch (F->getIntrinsicID()) {
520       case Intrinsic::amdgcn_workitem_id_x:
521       case Intrinsic::r600_read_tidig_x:
522         IdQuery = true;
523         LLVM_FALLTHROUGH;
524       case Intrinsic::r600_read_local_size_x:
525         Dim = 0;
526         break;
527       case Intrinsic::amdgcn_workitem_id_y:
528       case Intrinsic::r600_read_tidig_y:
529         IdQuery = true;
530         LLVM_FALLTHROUGH;
531       case Intrinsic::r600_read_local_size_y:
532         Dim = 1;
533         break;
534       case Intrinsic::amdgcn_workitem_id_z:
535       case Intrinsic::r600_read_tidig_z:
536         IdQuery = true;
537         LLVM_FALLTHROUGH;
538       case Intrinsic::r600_read_local_size_z:
539         Dim = 2;
540         break;
541       default:
542         break;
543       }
544 
545       if (Dim <= 3) {
546         unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
547         if (ReqdSize != std::numeric_limits<unsigned>::max())
548           MinSize = MaxSize = ReqdSize;
549       }
550     }
551   }
552 
553   if (!MaxSize)
554     return false;
555 
556   // Range metadata is [Lo, Hi). For ID query we need to pass max size
557   // as Hi. For size query we need to pass Hi + 1.
558   if (IdQuery)
559     MinSize = 0;
560   else
561     ++MaxSize;
562 
563   MDBuilder MDB(I->getContext());
564   MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
565                                                   APInt(32, MaxSize));
566   I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
567   return true;
568 }
569 
570 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
571   if (isMesaKernel(F))
572     return 16;
573   return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
574 }
575 
576 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
577                                                  Align &MaxAlign) const {
578   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
579          F.getCallingConv() == CallingConv::SPIR_KERNEL);
580 
581   const DataLayout &DL = F.getParent()->getDataLayout();
582   uint64_t ExplicitArgBytes = 0;
583   MaxAlign = Align(1);
584 
585   for (const Argument &Arg : F.args()) {
586     const bool IsByRef = Arg.hasByRefAttr();
587     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
588     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
589     if (!Alignment)
590       Alignment = DL.getABITypeAlign(ArgTy);
591 
592     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
593     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
594     MaxAlign = max(MaxAlign, Alignment);
595   }
596 
597   return ExplicitArgBytes;
598 }
599 
600 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
601                                                 Align &MaxAlign) const {
602   uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
603 
604   unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
605 
606   uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
607   unsigned ImplicitBytes = getImplicitArgNumBytes(F);
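  // Implicit arguments, if present, are appended after the explicit ones,
  // aligned to the implicit argument pointer alignment.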
608   if (ImplicitBytes != 0) {
609     const Align Alignment = getAlignmentForImplicitArgPtr();
610     TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
611   }
612 
613   // Being able to dereference past the end is useful for emitting scalar loads.
614   return alignTo(TotalSize, 4);
615 }
616 
617 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
618   return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
619                                   : AMDGPUDwarfFlavour::Wave64;
620 }
621 
622 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
623                              const TargetMachine &TM) :
624   R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
625   AMDGPUSubtarget(TT),
626   InstrInfo(*this),
627   FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
628   FMA(false),
629   CaymanISA(false),
630   CFALUBug(false),
631   HasVertexCache(false),
632   R600ALUInst(false),
633   FP64(false),
634   TexVTXClauseSize(0),
635   Gen(R600),
636   TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
637   InstrItins(getInstrItineraryForCPU(GPU)) { }
638 
639 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
640                                       unsigned NumRegionInstrs) const {
641   // Track register pressure so the scheduler can try to decrease
642   // pressure once register usage is above the threshold defined by
643   // SIRegisterInfo::getRegPressureSetLimit()
644   Policy.ShouldTrackPressure = true;
645 
646   // Enabling both top down and bottom up scheduling seems to give us less
647   // register spills than just using one of these approaches on its own.
648   Policy.OnlyTopDown = false;
649   Policy.OnlyBottomUp = false;
650 
651   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
652   if (!enableSIScheduler())
653     Policy.ShouldTrackLaneMasks = true;
654 }
655 
656 bool GCNSubtarget::hasMadF16() const {
657   return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
658 }
659 
660 bool GCNSubtarget::useVGPRIndexMode() const {
661   return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
662 }
663 
664 bool GCNSubtarget::useAA() const { return UseAA; }
665 
666 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
667   if (getGeneration() >= AMDGPUSubtarget::GFX10)
668     return getMaxWavesPerEU();
669 
670   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
671     if (SGPRs <= 80)
672       return 10;
673     if (SGPRs <= 88)
674       return 9;
675     if (SGPRs <= 100)
676       return 8;
677     return 7;
678   }
679   if (SGPRs <= 48)
680     return 10;
681   if (SGPRs <= 56)
682     return 9;
683   if (SGPRs <= 64)
684     return 8;
685   if (SGPRs <= 72)
686     return 7;
687   if (SGPRs <= 80)
688     return 6;
689   return 5;
690 }
691 
692 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
693   unsigned MaxWaves = getMaxWavesPerEU();
694   unsigned Granule = getVGPRAllocGranule();
695   if (VGPRs < Granule)
696     return MaxWaves;
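  // Round the request up to the allocation granule and see how many such
  // allocations fit in the register file, clamped to [1, MaxWaves] waves.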
697   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
698   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
699 }
700 
701 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
702   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
703   if (getGeneration() >= AMDGPUSubtarget::GFX10)
704     return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
705 
706   if (MFI.hasFlatScratchInit()) {
707     if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
708       return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
709     if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
710       return 4; // FLAT_SCRATCH, VCC (in that order).
711   }
712 
713   if (isXNACKEnabled())
714     return 4; // XNACK, VCC (in that order).
715   return 2; // VCC.
716 }
717 
718 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
719                                         unsigned NumSGPRs,
720                                         unsigned NumVGPRs) const {
721   unsigned Occupancy =
722     std::min(getMaxWavesPerEU(),
723              getOccupancyWithLocalMemSize(LDSSize, F));
724   if (NumSGPRs)
725     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
726   if (NumVGPRs)
727     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
728   return Occupancy;
729 }
730 
731 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
732   const Function &F = MF.getFunction();
733   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
734 
  // Compute the maximum number of SGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
737   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
738   unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
739   unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
740 
741   // Check if maximum number of SGPRs was explicitly requested using
742   // "amdgpu-num-sgpr" attribute.
743   if (F.hasFnAttribute("amdgpu-num-sgpr")) {
744     unsigned Requested = AMDGPU::getIntegerAttribute(
745       F, "amdgpu-num-sgpr", MaxNumSGPRs);
746 
747     // Make sure requested value does not violate subtarget's specifications.
748     if (Requested && (Requested <= getReservedNumSGPRs(MF)))
749       Requested = 0;
750 
751     // If more SGPRs are required to support the input user/system SGPRs,
752     // increase to accommodate them.
753     //
754     // FIXME: This really ends up using the requested number of SGPRs + number
755     // of reserved special registers in total. Theoretically you could re-use
756     // the last input registers for these special registers, but this would
757     // require a lot of complexity to deal with the weird aliasing.
758     unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
759     if (Requested && Requested < InputNumSGPRs)
760       Requested = InputNumSGPRs;
761 
762     // Make sure requested value is compatible with values implied by
763     // default/requested minimum/maximum number of waves per execution unit.
764     if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
765       Requested = 0;
766     if (WavesPerEU.second &&
767         Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
768       Requested = 0;
769 
770     if (Requested)
771       MaxNumSGPRs = Requested;
772   }
773 
774   if (hasSGPRInitBug())
775     MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
776 
777   return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
778                   MaxAddressableNumSGPRs);
779 }
780 
781 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
782   const Function &F = MF.getFunction();
783   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
784 
  // Compute the maximum number of VGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
787   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
788   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
789 
790   // Check if maximum number of VGPRs was explicitly requested using
791   // "amdgpu-num-vgpr" attribute.
792   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
793     unsigned Requested = AMDGPU::getIntegerAttribute(
794       F, "amdgpu-num-vgpr", MaxNumVGPRs);
795 
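    // On GFX90A ArchVGPRs and AGPRs are allocated from a unified register
    // file, so the requested count is presumably doubled here to make it
    // comparable with the unified budget used below.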
796     if (hasGFX90AInsts())
797       Requested *= 2;
798 
799     // Make sure requested value is compatible with values implied by
800     // default/requested minimum/maximum number of waves per execution unit.
801     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
802       Requested = 0;
803     if (WavesPerEU.second &&
804         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
805       Requested = 0;
806 
807     if (Requested)
808       MaxNumVGPRs = Requested;
809   }
810 
811   return MaxNumVGPRs;
812 }
813 
814 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
815                                          int UseOpIdx, SDep &Dep) const {
816   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
817       !Def->isInstr() || !Use->isInstr())
818     return;
819 
820   MachineInstr *DefI = Def->getInstr();
821   MachineInstr *UseI = Use->getInstr();
822 
823   if (DefI->isBundle()) {
824     const SIRegisterInfo *TRI = getRegisterInfo();
825     auto Reg = Dep.getReg();
826     MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
827     MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
828     unsigned Lat = 0;
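    // Latency from the last write of Reg inside the bundle to the end of the
    // bundle: take that instruction's latency and subtract one for each
    // bundled instruction issued after it.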
829     for (++I; I != E && I->isBundledWithPred(); ++I) {
830       if (I->modifiesRegister(Reg, TRI))
831         Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
832       else if (Lat)
833         --Lat;
834     }
835     Dep.setLatency(Lat);
836   } else if (UseI->isBundle()) {
837     const SIRegisterInfo *TRI = getRegisterInfo();
838     auto Reg = Dep.getReg();
839     MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
840     MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
841     unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
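    // Latency from DefI to the first bundled instruction that reads Reg: start
    // from DefI's full latency and subtract one for each bundled instruction
    // scanned before the reader is found.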
842     for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
843       if (I->readsRegister(Reg, TRI))
844         break;
845       --Lat;
846     }
847     Dep.setLatency(Lat);
848   }
849 }
850 
851 namespace {
852 struct FillMFMAShadowMutation : ScheduleDAGMutation {
853   const SIInstrInfo *TII;
854 
855   ScheduleDAGMI *DAG;
856 
857   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
858 
859   bool isSALU(const SUnit *SU) const {
860     const MachineInstr *MI = SU->getInstr();
861     return MI && TII->isSALU(*MI) && !MI->isTerminator();
862   }
863 
864   bool isVALU(const SUnit *SU) const {
865     const MachineInstr *MI = SU->getInstr();
866     return MI && TII->isVALU(*MI);
867   }
868 
869   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
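    // Adding an edge from Pred to Succ is only safe if it cannot create a
    // cycle, i.e. if Succ does not already reach Pred. SUnits are numbered in
    // instruction order and dependencies normally point forward, so an edge
    // from an earlier node to a later one is assumed safe; otherwise do an
    // explicit reachability check below.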
870     if (Pred->NodeNum < Succ->NodeNum)
871       return true;
872 
873     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
874 
875     for (unsigned I = 0; I < Succs.size(); ++I) {
876       for (const SDep &SI : Succs[I]->Succs) {
877         const SUnit *SU = SI.getSUnit();
878         if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
879           Succs.push_back(SU);
880       }
881     }
882 
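    // Walk Pred's transitive predecessors; if any of them is already reachable
    // from Succ, the new edge would close a cycle.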
883     SmallPtrSet<const SUnit*, 32> Visited;
884     while (!Preds.empty()) {
885       const SUnit *SU = Preds.pop_back_val();
886       if (llvm::is_contained(Succs, SU))
887         return false;
888       Visited.insert(SU);
889       for (const SDep &SI : SU->Preds)
890         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
891           Preds.push_back(SI.getSUnit());
892     }
893 
894     return true;
895   }
896 
  // Link as many SALU instructions in a chain as possible. Return the size of
  // the chain. Links up to MaxChain instructions.
899   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
900                          SmallPtrSetImpl<SUnit *> &Visited) const {
901     SmallVector<SUnit *, 8> Worklist({To});
902     unsigned Linked = 0;
903 
904     while (!Worklist.empty() && MaxChain-- > 0) {
905       SUnit *SU = Worklist.pop_back_val();
906       if (!Visited.insert(SU).second)
907         continue;
908 
909       LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
910                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
911 
912       if (SU->addPred(SDep(From, SDep::Artificial), false))
913         ++Linked;
914 
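      // Also make From's VALU successors depend on the SALU node just linked,
      // so the SALU instruction is pulled into the MFMA's shadow rather than
      // being scheduled after the MFMA's consumers.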
915       for (SDep &SI : From->Succs) {
916         SUnit *SUv = SI.getSUnit();
917         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
918           SUv->addPred(SDep(SU, SDep::Artificial), false);
919       }
920 
921       for (SDep &SI : SU->Succs) {
922         SUnit *Succ = SI.getSUnit();
923         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
924           Worklist.push_back(Succ);
925       }
926     }
927 
928     return Linked;
929   }
930 
931   void apply(ScheduleDAGInstrs *DAGInstrs) override {
932     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
933     if (!ST.hasMAIInsts() || DisablePowerSched)
934       return;
935     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
936     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
937     if (!TSchedModel || DAG->SUnits.empty())
938       return;
939 
    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions to give them a chance to fill the MFMA
    // shadow. It is desirable to fill the shadow with SALU rather than VALU
    // instructions to prevent power consumption bursts and throttling.
944     auto LastSALU = DAG->SUnits.begin();
945     auto E = DAG->SUnits.end();
946     SmallPtrSet<SUnit*, 32> Visited;
947     for (SUnit &SU : DAG->SUnits) {
948       MachineInstr &MAI = *SU.getInstr();
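      // Skip anything that is not an MFMA proper: the ACCVGPR read/write forms
      // are MAI-encoded but are simple moves, not long-latency operations.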
949       if (!TII->isMAI(MAI) ||
950            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
951            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
952         continue;
953 
954       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
955 
956       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
957                  dbgs() << "Need " << Lat
958                         << " instructions to cover latency.\n");
959 
960       // Find up to Lat independent scalar instructions as early as
961       // possible such that they can be scheduled after this MFMA.
962       for ( ; Lat && LastSALU != E; ++LastSALU) {
963         if (Visited.count(&*LastSALU))
964           continue;
965 
966         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
967           continue;
968 
969         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
970       }
971     }
972   }
973 };
974 } // namespace
975 
976 void GCNSubtarget::getPostRAMutations(
977     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
978   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
979 }
980 
981 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
982   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
983     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
984   else
985     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
986 }
987 
988 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
989   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
990     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
991   else
992     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
993 }
994