//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize MAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +/-flat-for-global is explicitly specified, turn on FlatForGlobal
  // for all OS-es on VI and newer hardware to avoid assertion failures due to
  // missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // XNACK was force-enabled above as the conservative default; turn it back
  // off for targets that do not support it, unless it was explicitly
  // requested in the feature string.
  if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

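// Editorial note: the limits below mirror the hardware constant bus rules as
// encoded in this function. Before GFX10 a VALU instruction may use at most
// one constant bus source (an SGPR or a literal); GFX10 raises the limit to
// two for most opcodes, except for the 64-bit shift instructions listed in
// the switch, which remain limited to one.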
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

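// Returns an LDS budget (in bytes) per workgroup if the kernel is to run with
// at least NWaves waves per execution unit. With a single wave the whole
// local memory is available; otherwise the budget is scaled by how many
// workgroups must share a compute unit.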
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

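// Worked example (illustrative numbers, not from the source): with 64 KiB of
// LDS, a wave size of 64, a max flat workgroup size of 256, and at least 4
// workgroups allowed per CU, a kernel using 16 KiB of LDS gives
// NumGroups = 65536 / 16384 = 4 and MaxWaves = 4 * ceil(256 / 64) = 16,
// which is then clamped to getMaxWavesPerEU().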
// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for alignment requirement of LDS rounding the
  // size up?
  // Compute restriction based on LDS usage
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

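// The requested range comes from the "amdgpu-flat-work-group-size" function
// attribute, whose value is a "min,max" pair, e.g.
// "amdgpu-flat-work-group-size"="128,256". Values outside the subtarget's
// supported range fall back to the default range.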
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not larger than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

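// Attach !range metadata to a workitem-id or local-size query so later
// optimizations know the result is bounded by the kernel's flat workgroup
// size (or by reqd_work_group_size when present). Returns false if no useful
// upper bound is known.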
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

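// Sum the in-memory sizes of the explicit kernel arguments, honoring each
// argument's ABI alignment, and report the largest alignment seen via
// MaxAlign.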
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

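// Illustrative example (numbers assumed, not taken from the source): with
// 12 bytes of explicit arguments, 56 implicit bytes, and an 8-byte
// implicit-arg alignment, the segment size below would be
// alignTo(alignTo(12, 8) + 56, 4) = 72 bytes. Note that when implicit
// arguments are present the explicit offset is not added to the total, which
// mirrors the computation in the function body.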
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

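// Occupancy limited by VGPR usage: registers are allocated in granules, so
// the request is rounded up to a granule multiple first. Illustrative
// example (assuming a granule of 4 and 256 total VGPRs): a request for
// 96 VGPRs rounds to 96 and allows 256 / 96 = 2 waves.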
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

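// The final occupancy is the minimum of the LDS-, SGPR-, and VGPR-limited
// occupancies; the SGPR and VGPR limits are only applied when a non-zero
// count is passed in.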
unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

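// Refine the latency of data dependencies that involve instruction bundles.
// When the producer is a bundle, take the latency of the last bundled
// instruction that defines the register, reduced by the number of bundled
// instructions that follow it. When the consumer is a bundle, start from the
// producer's latency and reduce it by the number of bundled instructions
// executed before the first one that reads the register.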
void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
                                         SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcI = Src->getInstr();
  MachineInstr *DstI = Dst->getInstr();

  if (SrcI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
    MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (DstI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

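  // Returns true if an artificial edge Pred -> Succ can be added without
  // creating a cycle, i.e. there is no existing path from Succ back to Pred.
  // The check collects the transitive successors of Succ and walks the
  // transitive predecessors of Pred looking for an intersection.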
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions so they can be scheduled into the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions
    // helps avoid power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}