//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }

  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // Disable XNACK on targets that do not support it unless it is explicitly
  // requested.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

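// Scratch is lowered through flat instructions only when explicitly requested
// via -amdgpu-enable-flat-scratch and the subtarget has flat scratch
// instructions.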
bool GCNSubtarget::enableFlatScratch() const {
  return EnableFlatScratch && hasFlatScratchInsts();
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

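  // 64-bit shift instructions can use only one scalar value operand.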
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
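  // Partition the CU's LDS among the workgroups that can run concurrently at
  // full occupancy, then scale that share up by MaxWaves / NWaves, since a
  // lower wave count leaves more LDS available per workgroup.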
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS when
  // rounding the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
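  // Shader calling conventions default to a single wavefront per work group;
  // compute and other callers default to the full flat work group size range.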
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

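// Return the fixed work group size along dimension \p Dim from the kernel's
// reqd_work_group_size metadata, or UINT_MAX if the metadata is absent.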
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). IDs are in [0, MaxSize), so for an ID query we
  // pass 0 and the maximum group size. For a size query the value lies in
  // [MinSize, MaxSize], so Hi needs to be MaxSize + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

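  // byref arguments are passed in the kernarg segment, so their size and
  // alignment come from the pointee type and any explicit parameter alignment.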
  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
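  // If there are implicit kernel arguments, they are appended after the
  // explicit argument block, aligned to the implicit argument pointer
  // alignment.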
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

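// Use VGPR indexing mode rather than movrel when movrel is not available, or
// when it was explicitly requested via -amdgpu-vgpr-index-mode.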
bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

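// Map an SGPR count to the number of waves that can be resident on an EU.
// GFX10+ allocates a fixed pool of SGPRs per wave, so occupancy there is never
// SGPR-limited; earlier generations use the break points below.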
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
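  // VGPRs are allocated in granules, so round the request up to a whole
  // granule before dividing up the register file.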
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
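    // The def is inside a bundle: take the latency of the bundled instruction
    // that actually writes the register, reduced by one for each instruction
    // issued after it within the bundle.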
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
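    // The use is inside a bundle: start from the def's latency and subtract
    // one for each bundled instruction issued before the one that reads the
    // register.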
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

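  // An artificial edge Pred -> Succ may only be added if it does not create a
  // cycle, i.e. if there is no existing path from Succ back to Pred. Walk the
  // transitive successors of Succ and the transitive predecessors of Pred and
  // check for an intersection.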
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so that they can fill the MFMA shadow.
    // Filling the shadow with SALU rather than VALU instructions avoids power
    // consumption bursts and the resulting throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}