//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclasses of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FP64 and FP16 denormals default to on for SI and newer (see the comment
  // above); otherwise explicitly disable FP32 denormals.
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it.
  // This matters for the gfx9 targets that have d16 loads but don't support
  // ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

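  // Note: 64-bit shift instructions can only take a single scalar (constant
  // bus) operand, so they keep the pre-GFX10 limit of one even on GFX10.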
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
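  // This is the inverse of getOccupancyWithLocalMemSize(): scale the CU's LDS
  // by MaxWaves / WorkGroupsPerCu and divide by the requested wave count.
  // Purely illustrative example: with 64 KiB of LDS, MaxWaves = 10,
  // WorkGroupsPerCu = 10 and NWaves = 4, a workgroup could use
  // 65536 * 10 / 10 / 4 = 16384 bytes.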
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
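  // Estimate how many waves the requested LDS footprint leaves room for, then
  // clamp the result to the hardware range [1, MaxWaves].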
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
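  // Compute kernels default to a flat workgroup size range of [2 waves,
  // max(4 waves, 256)] work-items, graphics shaders to at most one wave, and
  // everything else to at most 16 waves' worth of work-items.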
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: The default maximum should probably be 1024 when this is a
  // (non-kernel) function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query the maximum size is already
  // the exclusive upper bound; for a size query the upper bound must be
  // MaxSize + 1.
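  // For example, with a reqd_work_group_size of 256, a workitem-id query gets
  // the range [0, 256) while a local-size query gets [256, 257).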
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

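  // Lay out each explicit argument at its ABI alignment and accumulate the
  // total size, remembering the largest alignment seen.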
  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
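  // Implicit (hidden) arguments, if any, are appended after the explicit
  // arguments at the implicit-argument pointer alignment.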
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than using just one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
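  // pseudoToMCOpcode() returns -1 when the pseudo has no real encoding on this
  // subtarget, so this effectively asks whether V_MAD_F16 exists here.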
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

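// These cutoffs correspond to a 256-entry VGPR file divided by the wave count
// and rounded down to the VGPR allocation granularity of 4, e.g.
// 256 / 7 = 36.6 -> 36 and 256 / 3 = 85.3 -> 84.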
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs this function can use, given the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them to prevent
    // the scheduler from moving them apart. In this DAG pre-processing step,
    // the SUnits still appear in the original order of the instructions,
    // before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

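        // Copy edges so that nothing can be scheduled between the pair: SU's
        // other predecessors become artificial predecessors of SUa, and SUa's
        // other successors gain SU as an artificial predecessor.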
        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}