//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless flat-for-global is explicitly enabled or disabled in the feature
  // string, turn on FlatForGlobal for all OSes on VI and newer hardware to
  // avoid assertion failures due to missing ADDR64 variants of MUBUF
  // instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    VIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasR128A16(false),
    HasDLInsts(false),
    HasDotInsts(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
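  // Construct the GlobalISel support objects: call lowering, legalizer,
  // register bank info, and the instruction selector.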
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

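// Estimate the largest LDS allocation (in bytes) a work group can make while
// still allowing NWaves waves per execution unit: the CU's local memory,
// scaled by the maximum wave count, divided among the work groups per CU and
// the requested number of waves.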
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

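// Estimate the achievable occupancy (waves per execution unit) for a work
// group that uses Bytes of LDS, clamped to the range [1, getMaxWavesPerEU()].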
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

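// Default flat work group size range, in work items, by calling convention:
// compute kernels get two to four wavefronts, graphics shader stages at most
// one wavefront, and anything else at most 16 wavefronts.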
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

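// Determine the [min, max] flat work group size for F: start from the
// calling-convention default, apply the function attributes, and fall back to
// the default if the requested range is inconsistent or outside the
// subtarget's limits.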
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

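// Attach !range metadata to a workitem-id or local-size query so later passes
// know the value is bounded by the kernel's (reqd_)work_group_size. Returns
// true if metadata was added.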
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

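// Sum the ABI-aligned sizes of the kernel's explicit arguments, reporting the
// largest argument alignment through MaxAlign.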
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

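// Total kernarg segment size: the explicit arguments at their ABI offset plus
// any implicit (hidden) arguments, rounded up to a multiple of 4 bytes.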
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

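// Map a per-wave SGPR count to the number of waves that can be resident on an
// execution unit; the breakpoints are generation dependent.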
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

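// Map a per-wave VGPR count to the number of waves that can be resident on an
// execution unit.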
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
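// DAG mutation that adds artificial edges between consecutive memory
// operations of the same kind (VMEM, FLAT, SMRD, or DS) so the post-RA
// scheduler keeps them together.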
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consequent memory operations and link them
    // to prevent scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

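// Return the common AMDGPUSubtarget view, dispatching to GCNSubtarget or
// R600Subtarget based on the target triple.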
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}