//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

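// The TableGen-generated subtarget info names its class "AMDGPUSubtarget".
// Temporarily #define that name to GCNSubtarget for the first include (and
// #undef it again before the R600 include) so the generated target
// descriptions and constructor bodies land in the intended subtarget classes.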
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is explicitly specified (with + or -), turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

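// Returns the largest LDS allocation (in bytes) a kernel can make while still
// allowing NWaves waves to be resident, derived from the per-CU LDS size, the
// maximum waves per EU, and the workgroup count implied by the function's
// flat workgroup size. A single wave may use all of LDS.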
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

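// Estimates the occupancy (waves per EU) achievable when a kernel uses the
// given number of bytes of LDS: the per-CU LDS budget scaled by the maximum
// waves per EU is divided by the requested size, then clamped to
// [1, MaxWaves].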
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

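// Default flat workgroup size range by calling convention. For example, with
// a 64-lane wavefront, compute kernels default to [128, 256], graphics
// shaders to [1, 64], and everything else to [1, 1024].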
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

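// Computes the minimum/maximum number of waves per EU for a function: start
// from the subtarget defaults, tighten the minimum when a flat workgroup size
// is requested, and honor "amdgpu-waves-per-eu" only when it is consistent
// with both the subtarget limits and the flat workgroup size.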
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

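// Attaches !range metadata to workitem-id / local-size queries, bounding them
// by the kernel's flat workgroup size (or by reqd_work_group_size when that
// metadata is present). Returns false if no useful bound is known.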
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the range.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

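// Sums the in-memory size of the explicit kernel arguments, laying each one
// out at its ABI alignment. For example (assuming 64-bit pointers), a kernel
// taking (float *, i32) occupies 8 bytes at offset 0 plus 4 bytes at offset 8,
// giving 12 bytes with MaxAlign = 8.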
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}

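// Maps an SGPR count to the occupancy (waves per EU) it allows. For example,
// per the thresholds below, 90 SGPRs on VI and newer gives 8 waves, while
// 60 SGPRs on SI/CI also gives 8.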
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

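// Maps a VGPR count to the occupancy it allows. For example, per the
// thresholds below, 33 VGPRs give 7 waves and 65 VGPRs give 3.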
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
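// Post-RA DAG mutation that adds artificial edges between consecutive memory
// operations of the same kind (VMEM, FLAT, SMRD, or DS) so the scheduler
// keeps them clustered instead of pulling them apart.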
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

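// Both GCNSubtarget and R600Subtarget derive from AMDGPUSubtarget; these
// helpers pick the right one based on the target triple so common queries can
// be made without knowing which backend is in use.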
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}