//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
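// The generated *GenSubtargetInfo.inc files #undef the GET_SUBTARGETINFO_*
// guard macros, so redefine them before including the R600 tables.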
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on flat addressing and related HSA defaults.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is explicitly enabled or disabled in the feature
  // string, turn on FlatForGlobal for all OSes on VI and newer hardware to
  // avoid assertion failures due to missing ADDR64 variants of MUBUF
  // instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
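  // LDS is allocated per workgroup, so one group's budget is its share of LDS
  // (LocalMemorySize / WorkGroupsPerCu) scaled by how far below the maximum
  // wave count the caller is willing to run (MaxWaves / NWaves).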
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
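  // Invert the budget from getMaxLocalMemSizeWithWaveCount: Limit / Bytes is
  // (LocalMemorySize / Bytes) * MaxWaves / WorkGroupsPerCu, i.e. the fraction
  // of the maximum workgroup count that fits in LDS, expressed in waves.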
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
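  // Compute kernels default to a workgroup of 2-4 waves, graphics shaders to
  // at most a single wave, and everything else to at most 16 waves.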
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: The default maximum should be 1024 for non-kernel functions.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query the result lies in
  // [0, MaxSize), so MaxSize is already a valid exclusive upper bound. For a
  // size query the result can equal MaxSize, so Hi must be MaxSize + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
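    // Place each explicit argument at its ABI alignment and accumulate the
    // total size and the largest alignment seen.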
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
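  // If the target appends implicit kernel arguments, they are laid out after
  // the explicit arguments at the implicit argument pointer's alignment.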
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
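  // Map the per-wave SGPR count to the number of waves the SGPR file can
  // sustain. VI and newer chips have a larger physical SGPR file per SIMD, so
  // they keep full occupancy at higher per-wave SGPR counts.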
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
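  // Map the per-wave VGPR count to occupancy. The breakpoints follow the
  // hardware's VGPR allocation granularity; e.g. a wave may use at most
  // 24 VGPRs to sustain the maximum of 10 waves per EU.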
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
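  // Registers are reserved in pairs: FLAT_SCRATCH and XNACK each take two
  // SGPRs when needed, and VCC always takes two.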
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
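// Post-RA DAG mutation that keeps back-to-back memory operations of the same
// kind (VMEM, FLAT, SMRD or DS) clustered so the scheduler cannot split them.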
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing the SUnits are still in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
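        // Same kind of memory operation: add artificial edges around the pair
        // so nothing can be scheduled between SUa and SU.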
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}