//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
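  // For example, if the incoming FS string explicitly enables
  // "+wavefrontsize32", the other two sizes are cleared here
  // ("-wavefrontsize16,-wavefrontsize64,") before FS itself is appended below.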
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets that do not support it unless it is explicitly
  // requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
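  // For illustration: with the default 32768 bytes of LDS, MaxWaves = 10, and
  // an assumed WorkGroupsPerCu of 4, a request for NWaves = 5 yields
  // 32768 * 10 / 4 / 5 = 16384 bytes.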
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS,
  // rounding the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
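  // For illustration: with the default 32768 bytes of LDS, a 4096-byte request
  // gives NumGroups = 8; assuming MaxWorkGroupsPerCu >= 8, MaxWorkGroupSize =
  // 256, and WaveSize = 64, each group is 4 waves, so MaxWaves = 32 before the
  // clamp below.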

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
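    // For graphics shader calling conventions the default work group is at
    // most a single wavefront.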
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);
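  // The attribute value is a comma separated pair, e.g.
  // "amdgpu-flat-work-group-size"="128,256".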

  // Make sure requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);
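  // The attribute is a comma separated pair, e.g. "amdgpu-waves-per-eu"="2,4";
  // the trailing 'true' (OnlyFirstRequired) allows the maximum to be omitted.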

  // Make sure requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
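  // For example, a workitem id query in a kernel with a reqd_work_group_size
  // of {256, 1, 1} gets !range !{i32 0, i32 256} on the x dimension.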
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

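  // For example, assuming default ABI alignments, an i32 argument followed by
  // a <4 x float> argument gives ExplicitArgBytes = alignTo(4, 16) + 16 = 32
  // and MaxAlign = 16.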
  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
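  // For illustration: with an allocation granule of 4 and 256 total VGPRs, a
  // function using 102 VGPRs rounds up to 104 registers, so it can run at most
  // min(256 / 104, MaxWaves) = 2 waves.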
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
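    // Def is a bundle. Take the latency of the last instruction in the bundle
    // that writes Reg, reduced by the number of bundled instructions issued
    // after that write.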
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
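    // Use is a bundle. Start from the latency of the defining instruction and
    // reduce it by the number of bundled instructions that issue before the
    // first one that reads Reg.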
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

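  // Check whether an artificial edge Pred -> Succ can be added without
  // creating a cycle, i.e. whether any transitive predecessor of Pred is
  // already a transitive successor of Succ.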
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size of
  // the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions so they get a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU rather than VALU instructions
    // helps prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}