//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSchedStrategy.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to enable the load/store vectorizer for tests; it is off by default.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(false),
  cl::Hidden);

extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
  RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIFixControlFlowLiveIntervalsPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new ScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
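  // Add a DAG mutation that clusters loads from nearby addresses so the
  // scheduler keeps them together.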
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
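  // In the string below that corresponds to the backend's address-space
  // numbering: 0 = private, 1 = global, 2 = constant, 3 = local, 4 = flat,
  // 5 = region.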
  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // HSA only supports CI+, so change the default GPU to a CI for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         CodeModel::Model CM,
                                         CodeGenOpt::Level OptLevel)
  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                      FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
    TLOF(createTLOF(getTargetTriple())),
    IntrinsicInfo() {
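  // Control flow on these targets is lowered via structurization passes
  // (StructurizeCFG / SIAnnotateControlFlow / AMDGPUCFGStructurizer below),
  // so codegen must keep the CFG structured.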
  setRequiresStructuredCFG(true);
  initAsmInfo();
}

AMDGPUTargetMachine::~AMDGPUTargetMachine() { }

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
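  // hasAttribute(Attribute::None) means the function carries no explicit
  // "target-cpu" attribute, in which case we fall back to the
  // TargetMachine's default CPU.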
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

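  // The concatenated GPU name and feature string key the subtarget cache, so
  // functions with identical attributes share one subtarget instance.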
  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {
struct SIGISelActualAccessor : public GISelAccessor {
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  const AMDGPUCallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
};
} // End anonymous namespace.
#endif

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);

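    // Attach a GlobalISel accessor: a plain stub when GlobalISel is not built
    // in, or one that provides the AMDGPU call lowering when it is.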
#ifndef LLVM_BUILD_GLOBAL_ISEL
    GISelAccessor *GISel = new GISelAccessor();
#else
    SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
    GISel->CallLoweringInfo.reset(
      new AMDGPUCallLowering(*I->getTargetLowering()));
#endif

    I->setGISelAccessor(*GISel);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {

    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  void addIRPasses() override;
  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
#endif
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // End of anonymous namespace

TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(AMDGPUTTIImpl(this, F));
  });
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
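  // GVN is the stronger but more expensive of the two, so reserve it for the
  // most aggressive optimization level.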
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  addPass(createAMDGPUOpenCLImageTypeLoweringPass());

  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca(&TM));

    if (EnableSROA)
      addPass(createSROAPass());
  }

  addStraightLineScalarOptimizationPasses();

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger(*TM));
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(*TM), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(*TM), false);
  addPass(createR600ControlFlowFinalizer(*TM), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
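  // The experimental SI machine scheduler is opt-in per subtarget (see
  // SISubtarget::enableSIScheduler()); the max-occupancy scheduler is the
  // default.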
  const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(&AMDGPUAnnotateKernelFeaturesID);
  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  addPass(createSinkingPass());
  addPass(createSITypeRewriter());
  addPass(createAMDGPUAnnotateUniformValues());
  addPass(createSIAnnotateControlFlowPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies, making it easier to fold the
  // real source operand. We want to eliminate dead instructions afterwards so
  // that we see fewer uses of the copies. We then need to clean up the dead
  // instructions left over after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
}

void GCNPassConfig::addIRPasses() {
  // TODO: May want to move later or split into an early and late one.
  addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));

  AMDGPUPassConfig::addIRPasses();
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
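  // Clean up after selection: lower i1 copies and fix VGPR-to-SGPR copies
  // that are illegal on the hardware.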
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  return false;
}
#endif

void GCNPassConfig::addPreRegAlloc() {
  addPass(createSIShrinkInstructionsPass());
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.
  insertPass(&TwoAddressInstructionPassID, &SILowerControlFlowID, false);

  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  // This needs to be run directly before register allocation because earlier
  // passes might recompute live intervals.
  insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);

  // TODO: It might be better to run this right after phi elimination, but for
  // now that would require not running the verifier.
  insertPass(&RenameIndependentSubregsID, &SILowerControlFlowID);

  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  // The hazard recognizer that runs as part of the post-RA scheduler is not
  // guaranteed to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

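  // SIInsertWaits inserts the s_waitcnt instructions that must execute
  // before the results of outstanding memory operations are consumed.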
  addPass(createSIInsertWaitsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(&SIInsertSkipsPassID);
  addPass(createSIDebuggerInsertNopsPass());
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(this, PM);
}