//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDotInsts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

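// Estimate how many bytes of LDS a workgroup may use while still allowing
// NWaves waves per execution unit: with a single wave the entire local memory
// is available, otherwise the budget is scaled by the maximum waves per EU and
// by the number of workgroups that fit on a compute unit for F's maximum flat
// workgroup size.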
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

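// Inverse of getMaxLocalMemSizeWithWaveCount: given the number of LDS bytes a
// workgroup uses, estimate how many waves can be resident per execution unit,
// clamped to the range [1, getMaxWavesPerEU()].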
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

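// Default flat workgroup size range when no explicit attribute is present:
// compute-like calling conventions default to 2-4 wavefronts, graphics shader
// stages to a single wavefront, and everything else to up to 16 wavefronts.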
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

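// Determine the minimum/maximum flat workgroup size for F. Start from the
// calling-convention default, honor the legacy "amdgpu-max-work-group-size"
// attribute, then apply "amdgpu-flat-work-group-size" if present. Fall back to
// the default whenever the requested range is inverted or violates the
// subtarget's limits.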
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is not larger than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

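// Determine the minimum/maximum number of waves per execution unit for F. The
// default maximum comes from the subtarget; if a flat workgroup size was
// requested, the default minimum is raised to the value it implies. An
// explicit "amdgpu-waves-per-eu" request is honored only if it is well-formed,
// within the subtarget's limits, and compatible with the requested flat
// workgroup size.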
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is not larger than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

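// Attach !range metadata to a local-ID or local-size query so later passes can
// assume the result is bounded by the kernel's workgroup size. If the call is
// one of the workitem-id / local-size intrinsics and the kernel carries
// reqd_work_group_size metadata, the range is narrowed to that exact
// dimension. Returns true if metadata was added.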
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

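// Total size in bytes of the explicit kernel arguments of F, with each
// argument placed at its ABI type alignment. The largest alignment seen is
// reported through MaxAlign.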
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

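// Compute the kernarg segment size for F: the explicit arguments together with
// the target-specific offset they start at, extended to cover the implicit
// argument area when the target uses one, and rounded up to a 4-byte multiple
// so scalar loads may safely dereference slightly past the end.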
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

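// Map an SGPR count to the number of waves per execution unit it permits; the
// break points differ between pre-VI and VI+ subtargets.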
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

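// Map a VGPR count to the number of waves per execution unit it permits, from
// 10 waves at 24 or fewer VGPRs down to a single wave above 128 VGPRs.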
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
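// Scheduling DAG mutation, added for post-RA scheduling, that links
// consecutive memory operations of the same kind (VMEM, FLAT, SMRD or DS)
// with artificial edges so the scheduler keeps them together.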
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

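// Return the common AMDGPUSubtarget view of the subtarget for a machine
// function or an IR function, dispatching on the target triple between
// GCNSubtarget and R600Subtarget.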
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}