//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing; other OSes
  // default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
     Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                        : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +/-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +/-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

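  // GFX10 generally allows two constant bus uses per instruction, but the
  // 64-bit shift opcodes below are still limited to a single one.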
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
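  // Partition the LDS among the workgroups that must be resident to sustain
  // NWaves waves per EU: scale the total LDS by MaxWaves / NWaves and divide
  // by the number of workgroups that can share a CU.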
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS rounding
  // the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round the work group size up to a whole number of waves per group.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
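  // For example, assuming 64 KiB of LDS, a 16 KiB allocation, a 256-lane flat
  // workgroup size, and wave64: 4 groups fit, each needing 4 waves, so up to
  // 16 waves remain schedulable before the clamp below.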

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
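  // Graphics shader calling conventions default to at most a single wavefront
  // per workgroup; compute defaults to the subtarget's full flat workgroup
  // size range.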
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

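// "reqd_work_group_size" metadata (emitted for OpenCL's reqd_work_group_size
// kernel attribute) carries three i32 operands giving the fixed X, Y and Z
// workgroup dimensions; UINT_MAX signals that no such requirement exists.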
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

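  // For example, with reqd_work_group_size = {64, 1, 1}, a workitem.id.x call
  // gets !range [0, 64) while a local_size.x query gets !range [64, 65).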
  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

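// Mesa kernels reserve a fixed 16 bytes of implicit arguments; otherwise the
// size comes from the "amdgpu-implicitarg-num-bytes" attribute (0 if absent).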
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

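  // Walk the explicit arguments, aligning each one to its byref or ABI
  // alignment before adding its alloc size. E.g. (i32, i64) occupies 16 bytes:
  // 4 for the i32, 4 bytes of padding, then 8 for the i64.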
  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

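// Occupancy as limited by SGPR usage. The per-generation breakpoints below
// reflect the hardware's SGPR allocation granularity; e.g. on VI, a wave
// using up to 80 SGPRs still permits the full 10 waves per EU.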
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
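  // E.g. assuming 256 allocatable VGPRs and a granule of 4, a wave using
  // 65 VGPRs is charged for 68, allowing 256 / 68 = 3 concurrent waves.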
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

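    // On subtargets with GFX90A instructions the VGPR budget also covers the
    // matching AGPRs in the unified register file, so the requested count is
    // doubled here (presumably to account for that second half).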
    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

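  // Bundles are scheduled as a single unit, so adjust the data-dependence
  // latency by hand: when the def lives inside a bundle, start from the
  // defining instruction's latency and subtract one per bundled instruction
  // issued after it; when the use lives inside a bundle, subtract one per
  // bundled instruction issued before the first reader of the register.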
  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

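  // Conservatively decide whether an artificial edge Pred -> Succ may be added
  // without creating a cycle, i.e. whether Pred is not already reachable from
  // Succ in the scheduling DAG.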
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Returns the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that they can fill the MFMA shadow.
    // Filling that shadow with SALU rather than VALU instructions is desirable
    // to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}