//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

// The TableGen'erated *.inc files #undef the GET_SUBTARGETINFO_* guard macros
// at the end, so they must be redefined before pulling in the R600 variant.
// AMDGPUSubtarget is temporarily remapped to GCNSubtarget so the generated
// GCN code attaches to the right class.
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}
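
// Feature strings are applied left to right, so the user-provided FS appended
// last wins over the defaults assembled above. A minimal sketch, assuming a
// hypothetical frontend-supplied FS:
//
//   FullFS == "+promote-alloca,...,+enable-prt-strict-null,-promote-alloca"
//
// Here ParseSubtargetFeatures sees "-promote-alloca" after the default
// "+promote-alloca", so promote-alloca ends up disabled; this is exactly the
// override mechanism the comments above rely on.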

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
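
// Worked example for the formula above, with illustrative (not authoritative)
// numbers: getLocalMemorySize() = 65536, MaxWaves = 10, WorkGroupsPerCu = 5,
// NWaves = 4:
//   65536 * 10 / 5 / 4 = 32768 bytes of LDS per work group.
// With NWaves == 1 the full LDS size is returned unconditionally.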

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
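
// Worked example, with the same illustrative numbers as above:
//   Limit = 65536 * 10 / 5 = 131072.
// A kernel using Bytes = 32768 of LDS gets 131072 / 32768 = 4 waves, which is
// already inside [1, MaxWaves], so 4 is returned. Bytes == 0 is substituted
// with 1 so the division cannot fault.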

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}
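
// For example, with a wavefront size of 64 this yields:
//   compute (AMDGPU_CS, AMDGPU_KERNEL, SPIR_KERNEL): (128, 256)
//   graphics (VS, LS, HS, ES, GS, PS):               (1, 64)
//   anything else:                                   (1, 1024)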

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
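
// Example of the attribute handling above, assuming a subtarget whose legal
// flat work group size range is [1, 2048]:
//   "amdgpu-flat-work-group-size"="64,256"  -> (64, 256)
//   "amdgpu-flat-work-group-size"="256,64"  -> min > max, Default returned
//   "amdgpu-flat-work-group-size"="1,4096"  -> above the subtarget maximum,
//                                              Default returned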

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
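
// Example, assuming getMinWavesPerEU() == 1 and getMaxWavesPerEU() == 10:
//   "amdgpu-waves-per-eu"="2,4" -> (2, 4)
//   "amdgpu-waves-per-eu"="4,2" -> min > max, Default returned
//   "amdgpu-waves-per-eu"="2"   -> (2, Default.second); only the first value
//                                  is required because the last argument to
//                                  getIntegerPairAttribute is true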

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi; for a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
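
// Example of the metadata this emits, for a hypothetical kernel whose maximum
// flat work group size is 256 and which has no reqd_work_group_size:
//   %id = call i32 @llvm.amdgcn.workitem.id.x(), !range !0
//   !0 = !{i32 0, i32 256}   ; IDs lie in [0, 256)
// A local-size query instead gets !{i32 0, i32 257}, since the size itself
// can be as large as 256 and range metadata is half-open.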

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}
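
// Worked example of the running layout computation, for a hypothetical kernel
// signature (float, <4 x float>) under typical ABI alignments:
//   float:       align 4,  size 4  -> ExplicitArgBytes = 0 + 4 = 4
//   <4 x float>: align 16, size 16 -> ExplicitArgBytes = alignTo(4, 16) + 16
//                                                      = 32
// so the function returns 32 and sets MaxAlign = 16.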

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}
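
// Continuing the example above with illustrative values: ExplicitOffset = 0
// (as on HSA), ExplicitArgBytes = 32, ImplicitBytes = 56 with an 8-byte
// implicit-pointer alignment:
//   TotalSize = alignTo(32, 8) + 56 = 88, and alignTo(88, 4) = 88.
// Note that when implicit arguments are present, TotalSize is rebuilt from
// ExplicitArgBytes alone, so ExplicitOffset does not contribute to that sum.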

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
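
// Illustrative walk through the clamping above: a function carrying
// "amdgpu-num-sgpr"="48" on a subtarget where getReservedNumSGPRs(MF) == 6
// and getNumPreloadedSGPRs() == 16 keeps Requested = 48 (it exceeds both
// lower bounds); assuming it also satisfies the waves-per-EU bounds, the
// final budget is min(48 - 6, MaxAddressableNumSGPRs) = 42.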

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace
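
// Illustrative effect of the mutation above on three consecutive DS
// instructions A, B, C (SUnits visited in original instruction order):
//   - visiting B: B gets a barrier edge on A; B's other predecessors are
//     also made (artificial) predecessors of A, and A's other successors are
//     made successors of B, so nothing can be scheduled between A and B;
//   - visiting C: the same linking is then applied between B and C.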

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}