//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU-specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
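  // Scale the per-CU local memory size by the maximum number of waves per EU,
  // then divide by the number of workgroups that fit on a CU at this workgroup
  // size and by the requested wave count.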
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
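  // Compute how many waves the given LDS usage allows and clamp the result to
  // the valid occupancy range [1, MaxWaves].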
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;
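  // IdQuery distinguishes workitem-id queries, whose result lies in
  // [0, group size), from local-size queries, whose result is the group size
  // itself; the range metadata emitted below differs accordingly.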

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

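    // Place each argument at the next offset aligned to its ABI alignment and
    // track the largest alignment seen for the caller.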
    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
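  // If the target appends hidden (implicit) arguments, account for them after
  // the explicit arguments, aligned to the implicit argument pointer alignment.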
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
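  // Occupancy (waves per EU) as limited by SGPR usage; the thresholds differ
  // on VI and newer targets because their SGPR allocation limits differ from
  // SI/CI.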
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
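  // Occupancy as limited by VGPR usage; the VGPR file on each SIMD is shared
  // by all waves resident on that SIMD.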
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

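        // Propagate the pair's dependencies: predecessors of the second
        // instruction become artificial predecessors of the first, and
        // successors of the first become successors of the second, so that
        // nothing can be scheduled between the two.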
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}