1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUCallLowering.h"
17 #include "AMDGPUInstructionSelector.h"
18 #include "AMDGPULegalizerInfo.h"
19 #include "AMDGPURegisterBankInfo.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/ADT/SmallString.h"
23 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/CodeGen/TargetFrameLowering.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/IR/IntrinsicsR600.h"
28 #include "llvm/IR/MDBuilder.h"
29 #include "llvm/MC/MCSubtargetInfo.h"
30 #include <algorithm>
31 
32 using namespace llvm;
33 
34 #define DEBUG_TYPE "amdgpu-subtarget"
35 
36 #define GET_SUBTARGETINFO_TARGET_DESC
37 #define GET_SUBTARGETINFO_CTOR
38 #define AMDGPUSubtarget GCNSubtarget
39 #include "AMDGPUGenSubtargetInfo.inc"
40 #define GET_SUBTARGETINFO_TARGET_DESC
41 #define GET_SUBTARGETINFO_CTOR
42 #undef AMDGPUSubtarget
43 #include "R600GenSubtargetInfo.inc"
44 
// Debug/tuning knob: suppresses the scheduling mutation that spreads out MAI
// (matrix) instructions to avoid power bursts.
static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

// Opt-in to GPR indexing mode for vector indexing when the hardware supports
// it; consulted by GCNSubtarget::useVGPRIndexMode().
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

// Opt-in to flat scratch instructions; only honored when the subtarget has
// them (see GCNSubtarget::enableFlatScratch()).
static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

// Alias analysis in codegen defaults to on; this flag is an escape hatch
// (see GCNSubtarget::useAA()).
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));
63 
// Defined out of line; presumably so the std::unique_ptr-style members reset
// in the constructor (CallLoweringInfo, Legalizer, RegBankInfo, ...) can
// destroy types that are incomplete in the header — TODO confirm.
GCNSubtarget::~GCNSubtarget() = default;
65 
66 R600Subtarget &
67 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
68                                                StringRef GPU, StringRef FS) {
69   SmallString<256> FullFS("+promote-alloca,");
70   FullFS += FS;
71   ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
72 
73   HasMulU24 = getGeneration() >= EVERGREEN;
74   HasMulI24 = hasCaymanISA();
75 
76   return *this;
77 }
78 
// Parse the feature string for \p GPU/\p FS and then fix up interdependent or
// defaulted subtarget properties that TableGen features cannot express
// directly. Returns *this so it can be used in the constructor's init list
// (InstrInfo's initializer) before the dependent members are built.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits: if the user requested any wavefront size
  // explicitly, force the other sizes off so the defaults cannot conflict.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  // User features are appended last so they win over the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
     Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                        : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets where it is not enabled by default unless it is
  // explicitly requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets with d16 loads, but don't support
  // ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}
184 
// Base-class constructor: only establishes conservative defaults. The derived
// GCN/R600 constructors overwrite these from the parsed feature string in
// their respective initializeSubtargetDependencies().
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }
203 
// GCNSubtarget constructor. All feature flags start false/zero here; the
// real values come from initializeSubtargetDependencies(), which runs as part
// of InstrInfo's initializer near the end of the init list — i.e. before
// TLInfo and FrameLowering are constructed.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    // Performance characteristics.
    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    // Memory/access-mode properties.
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    // Optional codegen features.
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    // ISA generation / instruction availability flags.
    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    // Hardware hazard/bug workaround flags.
    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    // Parses features and finalizes all the flags above before the members
    // below are constructed.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  // GlobalISel infrastructure; the instruction selector needs the register
  // bank info created just above.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
321 
322 bool GCNSubtarget::enableFlatScratch() const {
323   return EnableFlatScratch && hasFlatScratchInsts();
324 }
325 
326 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
327   if (getGeneration() < GFX10)
328     return 1;
329 
330   switch (Opcode) {
331   case AMDGPU::V_LSHLREV_B64_e64:
332   case AMDGPU::V_LSHLREV_B64_gfx10:
333   case AMDGPU::V_LSHL_B64_e64:
334   case AMDGPU::V_LSHRREV_B64_e64:
335   case AMDGPU::V_LSHRREV_B64_gfx10:
336   case AMDGPU::V_LSHR_B64_e64:
337   case AMDGPU::V_ASHRREV_I64_e64:
338   case AMDGPU::V_ASHRREV_I64_gfx10:
339   case AMDGPU::V_ASHR_I64_e64:
340     return 1;
341   }
342 
343   return 2;
344 }
345 
346 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
347   const Function &F) const {
348   if (NWaves == 1)
349     return getLocalMemorySize();
350   unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
351   unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
352   if (!WorkGroupsPerCu)
353     return 0;
354   unsigned MaxWaves = getMaxWavesPerEU();
355   return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
356 }
357 
358 // FIXME: Should return min,max range.
359 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
360   const Function &F) const {
361   const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
362   const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
363   if (!MaxWorkGroupsPerCu)
364     return 0;
365 
366   const unsigned WaveSize = getWavefrontSize();
367 
368   // FIXME: Do we need to account for alignment requirement of LDS rounding the
369   // size up?
370   // Compute restriction based on LDS usage
371   unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
372 
373   // This can be queried with more LDS than is possible, so just assume the
374   // worst.
375   if (NumGroups == 0)
376     return 1;
377 
378   NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
379 
380   // Round to the number of waves.
381   const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
382   unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
383 
384   // Clamp to the maximum possible number of waves.
385   MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
386 
387   // FIXME: Needs to be a multiple of the group size?
388   //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
389 
390   assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
391          "computed invalid occupancy");
392   return MaxWaves;
393 }
394 
395 unsigned
396 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
397   const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
398   return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
399 }
400 
401 std::pair<unsigned, unsigned>
402 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
403   switch (CC) {
404   case CallingConv::AMDGPU_VS:
405   case CallingConv::AMDGPU_LS:
406   case CallingConv::AMDGPU_HS:
407   case CallingConv::AMDGPU_ES:
408   case CallingConv::AMDGPU_GS:
409   case CallingConv::AMDGPU_PS:
410     return std::make_pair(1, getWavefrontSize());
411   default:
412     return std::make_pair(1u, getMaxFlatWorkGroupSize());
413   }
414 }
415 
416 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
417   const Function &F) const {
418   // Default minimum/maximum flat work group sizes.
419   std::pair<unsigned, unsigned> Default =
420     getDefaultFlatWorkGroupSize(F.getCallingConv());
421 
422   // Requested minimum/maximum flat work group sizes.
423   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
424     F, "amdgpu-flat-work-group-size", Default);
425 
426   // Make sure requested minimum is less than requested maximum.
427   if (Requested.first > Requested.second)
428     return Default;
429 
430   // Make sure requested values do not violate subtarget's specifications.
431   if (Requested.first < getMinFlatWorkGroupSize())
432     return Default;
433   if (Requested.second > getMaxFlatWorkGroupSize())
434     return Default;
435 
436   return Requested;
437 }
438 
439 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
440   const Function &F) const {
441   // Default minimum/maximum number of waves per execution unit.
442   std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
443 
444   // Default/requested minimum/maximum flat work group sizes.
445   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
446 
447   // If minimum/maximum flat work group sizes were explicitly requested using
448   // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
449   // number of waves per execution unit to values implied by requested
450   // minimum/maximum flat work group sizes.
451   unsigned MinImpliedByFlatWorkGroupSize =
452     getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
453   Default.first = MinImpliedByFlatWorkGroupSize;
454   bool RequestedFlatWorkGroupSize =
455       F.hasFnAttribute("amdgpu-flat-work-group-size");
456 
457   // Requested minimum/maximum number of waves per execution unit.
458   std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
459     F, "amdgpu-waves-per-eu", Default, true);
460 
461   // Make sure requested minimum is less than requested maximum.
462   if (Requested.second && Requested.first > Requested.second)
463     return Default;
464 
465   // Make sure requested values do not violate subtarget's specifications.
466   if (Requested.first < getMinWavesPerEU() ||
467       Requested.second > getMaxWavesPerEU())
468     return Default;
469 
470   // Make sure requested values are compatible with values implied by requested
471   // minimum/maximum flat work group sizes.
472   if (RequestedFlatWorkGroupSize &&
473       Requested.first < MinImpliedByFlatWorkGroupSize)
474     return Default;
475 
476   return Requested;
477 }
478 
479 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
480   auto Node = Kernel.getMetadata("reqd_work_group_size");
481   if (Node && Node->getNumOperands() == 3)
482     return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
483   return std::numeric_limits<unsigned>::max();
484 }
485 
486 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
487   return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
488 }
489 
490 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
491                                            unsigned Dimension) const {
492   unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
493   if (ReqdSize != std::numeric_limits<unsigned>::max())
494     return ReqdSize - 1;
495   return getFlatWorkGroupSizes(Kernel).second - 1;
496 }
497 
// Attach !range metadata to a workitem-id or local-size query \p I, bounded
// by the kernel's flat work group size (narrowed when reqd_work_group_size
// metadata pins the exact size). Returns true if metadata was attached.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      // Map the intrinsic to its dimension. ID queries (workitem_id/tidig)
      // deliberately fall through to the matching local_size case.
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      // Dim is 0..2 when an intrinsic matched (UINT_MAX otherwise).
      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
559 
560 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
561   if (isMesaKernel(F))
562     return 16;
563   return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
564 }
565 
566 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
567                                                  Align &MaxAlign) const {
568   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
569          F.getCallingConv() == CallingConv::SPIR_KERNEL);
570 
571   const DataLayout &DL = F.getParent()->getDataLayout();
572   uint64_t ExplicitArgBytes = 0;
573   MaxAlign = Align(1);
574 
575   for (const Argument &Arg : F.args()) {
576     const bool IsByRef = Arg.hasByRefAttr();
577     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
578     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
579     if (!Alignment)
580       Alignment = DL.getABITypeAlign(ArgTy);
581 
582     uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
583     ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
584     MaxAlign = max(MaxAlign, Alignment);
585   }
586 
587   return ExplicitArgBytes;
588 }
589 
// Total kernarg segment size for \p F: explicit arguments plus any implicit
// argument area, rounded up to 4 bytes.
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    // NOTE(review): this path recomputes TotalSize from ExplicitArgBytes
    // only, dropping ExplicitOffset — looks intentional, but worth confirming
    // against the kernarg ABI.
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
606 
// R600Subtarget constructor. Note the init-list ordering: TLInfo is built
// from the result of initializeSubtargetDependencies(), so the feature string
// has been parsed before the TargetLowering is constructed.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }
623 
624 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
625                                       unsigned NumRegionInstrs) const {
626   // Track register pressure so the scheduler can try to decrease
627   // pressure once register usage is above the threshold defined by
628   // SIRegisterInfo::getRegPressureSetLimit()
629   Policy.ShouldTrackPressure = true;
630 
631   // Enabling both top down and bottom up scheduling seems to give us less
632   // register spills than just using one of these approaches on its own.
633   Policy.OnlyTopDown = false;
634   Policy.OnlyBottomUp = false;
635 
636   // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
637   if (!enableSIScheduler())
638     Policy.ShouldTrackLaneMasks = true;
639 }
640 
// V_MAD_F16 is available iff its pseudo maps to a real MC opcode on this
// subtarget.
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}
644 
// Use GPR indexing mode for vector indexing when movrel is unavailable, or
// when the user opted in via -amdgpu-vgpr-index-mode and the hardware
// supports it.
bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}
648 
// Controlled by -amdgpu-use-aa-in-codegen (defaults to on).
bool GCNSubtarget::useAA() const { return UseAA; }
650 
651 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
652   if (getGeneration() >= AMDGPUSubtarget::GFX10)
653     return getMaxWavesPerEU();
654 
655   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
656     if (SGPRs <= 80)
657       return 10;
658     if (SGPRs <= 88)
659       return 9;
660     if (SGPRs <= 100)
661       return 8;
662     return 7;
663   }
664   if (SGPRs <= 48)
665     return 10;
666   if (SGPRs <= 56)
667     return 9;
668   if (SGPRs <= 64)
669     return 8;
670   if (SGPRs <= 72)
671     return 7;
672   if (SGPRs <= 80)
673     return 6;
674   return 5;
675 }
676 
677 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
678   unsigned MaxWaves = getMaxWavesPerEU();
679   unsigned Granule = getVGPRAllocGranule();
680   if (VGPRs < Granule)
681     return MaxWaves;
682   unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
683   return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
684 }
685 
// Number of SGPRs reserved at the top of the allocation for special registers
// (VCC, FLAT_SCRATCH, XNACK), depending on generation and function needs.
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
702 
703 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
704                                         unsigned NumSGPRs,
705                                         unsigned NumVGPRs) const {
706   unsigned Occupancy =
707     std::min(getMaxWavesPerEU(),
708              getOccupancyWithLocalMemSize(LDSSize, F));
709   if (NumSGPRs)
710     Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
711   if (NumVGPRs)
712     Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
713   return Occupancy;
714 }
715 
// Maximum number of SGPRs \p MF may allocate, honoring the waves-per-EU
// constraints, the "amdgpu-num-sgpr" attribute, reserved special registers,
// and the SGPR-init hardware bug.
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    // (Requested == 0 below means "ignore the attribute".)
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  // Hardware bug workaround: affected parts must reserve a fixed SGPR count.
  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  // Subtract the reserved special registers, but never exceed the number of
  // addressable SGPRs.
  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
765 
766 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
767   const Function &F = MF.getFunction();
768   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
769 
770   // Compute maximum number of VGPRs function can use using default/requested
771   // minimum number of waves per execution unit.
772   std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
773   unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
774 
775   // Check if maximum number of VGPRs was explicitly requested using
776   // "amdgpu-num-vgpr" attribute.
777   if (F.hasFnAttribute("amdgpu-num-vgpr")) {
778     unsigned Requested = AMDGPU::getIntegerAttribute(
779       F, "amdgpu-num-vgpr", MaxNumVGPRs);
780 
781     // Make sure requested value is compatible with values implied by
782     // default/requested minimum/maximum number of waves per execution unit.
783     if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
784       Requested = 0;
785     if (WavesPerEU.second &&
786         Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
787       Requested = 0;
788 
789     if (Requested)
790       MaxNumVGPRs = Requested;
791   }
792 
793   return MaxNumVGPRs;
794 }
795 
// Refine the latency of register data dependencies that cross instruction
// bundles: for a def inside a bundle, use the latency of the bundled
// instruction that actually writes the register (discounted by its position);
// for a use inside a bundle, shrink the def's latency by the number of
// bundled instructions issued before the reader.
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  // Only register data dependencies between real instructions are adjusted.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    // Walk the bundle: restart the latency at each writer of Reg, then count
    // down one cycle per subsequent bundled instruction.
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    // Each bundled instruction issued before the first reader of Reg hides
    // one cycle of the def's latency.
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}
832 
833 namespace {
834 struct FillMFMAShadowMutation : ScheduleDAGMutation {
835   const SIInstrInfo *TII;
836 
837   ScheduleDAGMI *DAG;
838 
839   FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
840 
841   bool isSALU(const SUnit *SU) const {
842     const MachineInstr *MI = SU->getInstr();
843     return MI && TII->isSALU(*MI) && !MI->isTerminator();
844   }
845 
846   bool isVALU(const SUnit *SU) const {
847     const MachineInstr *MI = SU->getInstr();
848     return MI && TII->isVALU(*MI);
849   }
850 
851   bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
852     if (Pred->NodeNum < Succ->NodeNum)
853       return true;
854 
855     SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
856 
857     for (unsigned I = 0; I < Succs.size(); ++I) {
858       for (const SDep &SI : Succs[I]->Succs) {
859         const SUnit *SU = SI.getSUnit();
860         if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
861           Succs.push_back(SU);
862       }
863     }
864 
865     SmallPtrSet<const SUnit*, 32> Visited;
866     while (!Preds.empty()) {
867       const SUnit *SU = Preds.pop_back_val();
868       if (llvm::is_contained(Succs, SU))
869         return false;
870       Visited.insert(SU);
871       for (const SDep &SI : SU->Preds)
872         if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
873           Preds.push_back(SI.getSUnit());
874     }
875 
876     return true;
877   }
878 
879   // Link as much SALU intructions in chain as possible. Return the size
880   // of the chain. Links up to MaxChain instructions.
881   unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
882                          SmallPtrSetImpl<SUnit *> &Visited) const {
883     SmallVector<SUnit *, 8> Worklist({To});
884     unsigned Linked = 0;
885 
886     while (!Worklist.empty() && MaxChain-- > 0) {
887       SUnit *SU = Worklist.pop_back_val();
888       if (!Visited.insert(SU).second)
889         continue;
890 
891       LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
892                  dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
893 
894       if (SU->addPred(SDep(From, SDep::Artificial), false))
895         ++Linked;
896 
897       for (SDep &SI : From->Succs) {
898         SUnit *SUv = SI.getSUnit();
899         if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
900           SUv->addPred(SDep(SU, SDep::Artificial), false);
901       }
902 
903       for (SDep &SI : SU->Succs) {
904         SUnit *Succ = SI.getSUnit();
905         if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
906           Worklist.push_back(Succ);
907       }
908     }
909 
910     return Linked;
911   }
912 
913   void apply(ScheduleDAGInstrs *DAGInstrs) override {
914     const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
915     if (!ST.hasMAIInsts() || DisablePowerSched)
916       return;
917     DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
918     const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
919     if (!TSchedModel || DAG->SUnits.empty())
920       return;
921 
922     // Scan for MFMA long latency instructions and try to add a dependency
923     // of available SALU instructions to give them a chance to fill MFMA
924     // shadow. That is desirable to fill MFMA shadow with SALU instructions
925     // rather than VALU to prevent power consumption bursts and throttle.
926     auto LastSALU = DAG->SUnits.begin();
927     auto E = DAG->SUnits.end();
928     SmallPtrSet<SUnit*, 32> Visited;
929     for (SUnit &SU : DAG->SUnits) {
930       MachineInstr &MAI = *SU.getInstr();
931       if (!TII->isMAI(MAI) ||
932            MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
933            MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
934         continue;
935 
936       unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
937 
938       LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
939                  dbgs() << "Need " << Lat
940                         << " instructions to cover latency.\n");
941 
942       // Find up to Lat independent scalar instructions as early as
943       // possible such that they can be scheduled after this MFMA.
944       for ( ; Lat && LastSALU != E; ++LastSALU) {
945         if (Visited.count(&*LastSALU))
946           continue;
947 
948         if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
949           continue;
950 
951         Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
952       }
953     }
954   }
955 };
956 } // namespace
957 
958 void GCNSubtarget::getPostRAMutations(
959     std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
960   Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
961 }
962 
963 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
964   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
965     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
966   else
967     return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
968 }
969 
970 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
971   if (TM.getTargetTriple().getArch() == Triple::amdgcn)
972     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
973   else
974     return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
975 }
976