//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OSes
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDotInsts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
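  // The fewer waves requested, the more LDS each one may use: scale the
  // per-CU LDS size by MaxWaves/NWaves and split it across the work groups
  // sharing the CU.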
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
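  // Invert the calculation above: given Bytes of LDS per work group, see how
  // many waves still fit, clamped to the valid [1, MaxWaves] range below.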
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

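    // Each argument starts at the next offset aligned to its ABI type
    // alignment; remember the largest alignment seen for the caller.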
    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                      unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

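// Map an SGPR count to the number of waves that can be resident on an
// execution unit, based on the per-generation SGPR allocation limits.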
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

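        // Copy SU's other predecessors onto SUa and make SUa's other
        // successors also depend on SU, so nothing can be scheduled between
        // the two memory operations.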
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

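// Also cluster back-to-back memory operations in the post-RA scheduler by
// reusing the mutation above.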
void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}