//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
// The TableGen'd subtarget code is emitted for a class named AMDGPUSubtarget;
// remap that name onto GCNSubtarget for the amdgcn target, then undo the
// remap before pulling in the R600 variant.
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and run at the double
  // precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly given, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but do not
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
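  // Worked example (assumed numbers, not from any specific target): with
  // LocalMemorySize = 65536, MaxWaves = 10, WorkGroupsPerCu = 4 and
  // NWaves = 8, each work group may use 65536 * 10 / 4 / 8 = 20480 bytes of
  // LDS while still fitting 8 waves per EU.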
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
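  // Example with assumed numbers: LocalMemorySize = 32768, MaxWaves = 10 and
  // one work group per CU give Limit = 327680, so a kernel using
  // Bytes = 65536 of LDS lands at NumWaves = 5.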
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
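    // With a 64-wide wavefront this evaluates to (128, 256); with a 32-wide
    // wavefront, (64, 256).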
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process the "amdgpu-max-work-group-size" attribute once mesa
  // starts using the "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);
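  // For example, a kernel carrying the IR attribute
  // "amdgpu-flat-work-group-size"="128,256" requests a minimum of 128 and a
  // maximum of 256 work items per work group.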

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default
  // minimum/maximum number of waves per execution unit to the values implied
  // by the requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process the "amdgpu-max-work-group-size" attribute once mesa
  // starts using the "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);
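  // E.g. "amdgpu-waves-per-eu"="2,4" requests between 2 and 4 waves. The
  // trailing 'true' marks the second value as optional, so a bare
  // "amdgpu-waves-per-eu"="2" requests only a minimum and leaves the maximum
  // at its default.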

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate the subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with the values implied by the
  // requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 2) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;
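  // E.g. with reqd_work_group_size = {64, 1, 1}, a workitem.id.x call gets
  // !range !{i32 0, i32 64} while a local.size.x call gets
  // !range !{i32 64, i32 65}.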

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }
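  // Example: arguments (i32, double) give 4 bytes for the i32, then
  // alignTo(4, 8) + 8 = 16 bytes total, with MaxAlign = 8.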

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }
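  // Assumed-numbers example: 16 explicit argument bytes with 48 implicit
  // bytes aligned to 8 give TotalSize = alignTo(16, 8) + 48 = 64, which is
  // already a multiple of 4.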

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease pressure
  // once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than using either approach on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);
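    // E.g. "amdgpu-num-sgpr"="32" asks for a 32-SGPR budget; the checks below
    // drop the request (Requested = 0) if it turns out to be infeasible.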

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them to prevent
    // the scheduler from moving them apart. In DAG pre-processing, SUnits
    // are in the original order of the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}