//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is explicitly enabled or disabled in the feature
  // string, turn on FlatForGlobal for all OS-es on VI and newer hardware to
  // avoid assertion failures due to missing ADDR64 variants of MUBUF
  // instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

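// Number of SGPR/constant-bus operands a single VALU instruction may read.
// Targets before GFX10 allow only one; GFX10 allows two, except for the
// 64-bit shift opcodes listed below, which are still limited to one.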
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

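// Estimate how many bytes of LDS a single workgroup can use while still
// sustaining an occupancy of NWaves waves.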
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

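// Inverse of the above: estimate the occupancy (in waves) achievable when
// each workgroup uses Bytes bytes of LDS.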
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

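// Default flat workgroup size range implied by the calling convention when a
// function carries no explicit work group size attributes.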
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

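// Attach !range metadata to a call to a workitem-id or local-size intrinsic,
// bounding the result by the work group size known for the enclosing kernel
// (from reqd_work_group_size metadata or the flat work group size bounds).
// Returns false if no useful bound is known.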
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down further.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we pass the maximum size as
  // Hi; for a size query we pass MaxSize + 1 as Hi.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

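// Size in bytes of the explicit kernel arguments, laid out with their ABI
// alignment; MaxAlign is set to the largest argument alignment seen.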
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

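// Total size of the kernarg segment: the explicit arguments plus any implicit
// arguments appended after them.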
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

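// Return true if the target has a real encoding for v_mad_f16, determined by
// whether the pseudo instruction maps to an MC opcode on this subtarget.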
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

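// Occupancy (waves) achievable for a given SGPR count. The break points
// differ between pre-VI and VI+ hardware; on GFX10 SGPRs no longer limit
// occupancy.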
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

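// Occupancy (waves) achievable for a given VGPR count.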
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

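// Number of SGPRs that must be set aside for VCC and, where applicable,
// FLAT_SCRATCH and XNACK, and are therefore unavailable to the function.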
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

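// Maximum number of SGPRs the function may use, taking into account the
// waves-per-EU request, the "amdgpu-num-sgpr" attribute, the number of
// preloaded input SGPRs, and the SGPR init hardware bug.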
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

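// Maximum number of VGPRs the function may use, honoring the
// "amdgpu-num-vgpr" attribute when it is compatible with the requested
// waves per execution unit.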
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
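// Post-RA DAG mutation that keeps back-to-back memory operations of the same
// kind (VMEM, FLAT, SMRD, or DS) together by adding artificial edges, so the
// scheduler does not move them apart.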
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing the SUnits are still in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}