//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal
  // by default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). The HSA OS defaults
  // to the first amdgcn target that supports flat addressing. Other OSes
  // default to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

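// Returns how many scalar operands (SGPRs or literal constants) a single VALU
// instruction may read through the constant bus: one before GFX10, and two on
// GFX10 except for the 64-bit shifts listed below, which remain limited to
// one.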
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
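  // Illustrative example (assumed figures): with 65536 bytes of LDS,
  // MaxWaves = 10, WorkGroupsPerCu = 4 and NWaves = 5, a single work group
  // may use 65536 * 10 / 4 / 5 = 32768 bytes.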
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
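  // Illustrative example (assumed figures): 65536 bytes of LDS and
  // Bytes = 16384 give NumGroups = 4; a 256-item group at wave size 64 needs
  // MaxGroupNumWaves = 4, so MaxWaves = 16 before the clamp below.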

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);
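  // The attribute is a comma-separated "min,max" pair; e.g. a kernel carrying
  // "amdgpu-flat-work-group-size"="128,256" requests {128, 256}.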

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);
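  // Also a comma-separated pair; e.g. "amdgpu-waves-per-eu"="2,4" requests
  // between 2 and 4 waves per EU, honored only if it passes the checks below.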

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
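  // The metadata is a triple of i32 constants, e.g. !{i32 64, i32 1, i32 1}
  // for a kernel declared with reqd_work_group_size(64, 1, 1).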
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi; for a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

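  // E.g. with a required work group size of 256 this attaches
  // !range !{i32 0, i32 256} to an ID query and !range !{i32 256, i32 257}
  // to a size query.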
  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

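  // Illustrative layout (assumed signature) for kernel(global float *p,
  // char c, int4 v): p occupies bytes 0-7, c byte 8, and v is aligned up to
  // byte 16 and occupies 16 bytes, giving ExplicitArgBytes = 32 and
  // MaxAlign = 16.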
  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

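  // E.g. (assumed values): an ExplicitOffset of 36 on a non-HSA target plus
  // 12 bytes of explicit arguments and no implicit arguments gives
  // TotalSize = 48 and a returned size of alignTo(48, 4) = 48.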
  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease pressure
  // once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

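// The SGPR cut-offs below reflect (roughly) the per-SIMD SGPR register file
// divided across the given number of waves; the file is larger on
// VOLCANIC_ISLANDS and later generations than on earlier ones.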
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
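  // Illustrative example (assumed figures): with 256 total VGPRs, a granule
  // of 4 and VGPRs = 70, RoundedRegs = 72 and the result is
  // min(max(256 / 72, 1), MaxWaves) = min(3, MaxWaves).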
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
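    // Use the latency of the last def of Reg within the bundle, reduced by
    // one for each bundled instruction issued after that def, since those
    // issue slots already cover part of the latency.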
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

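  // Conservatively check whether an artificial edge Pred -> Succ can be
  // added without creating a cycle: collect all transitive successors of
  // Succ, then walk the transitive predecessors of Pred and fail if any of
  // them (including Pred itself) appears in that set.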
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}