//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable the vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control scalarization of global loads.
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run the internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all functions early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

// Enable address space based alias analysis.
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run the late CFG structurizer.
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
  "amdgpu-function-calls",
  cl::desc("Enable AMDGPU function call support"),
  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
  cl::init(false),
  cl::Hidden);

// Enable library call simplifications.
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

// Enable atomic optimizations.
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

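// Target registration entry point: registers the R600 and GCN target machines
// and initializes every AMDGPU-specific pass with the pass registry so the
// passes are known by name (e.g. for -stop-after/-print-after).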
extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the targets.
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFixupVectorISelPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUFixFunctionBitcastsPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIFixWWMLivenessPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUInlinerPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return llvm::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

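// Default GCN scheduler: schedules for maximum occupancy, with DAG mutations
// that cluster adjacent loads and stores and apply AMDGPU macro fusion.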
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                   createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);

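// Build the data layout string for the module. Both targets place allocas in
// address space 5 (the "A5" suffix); they differ mainly in pointer widths, as
// noted below.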
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}

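// Return the requested GPU, or the default processor for the triple when no
// CPU was specified ("generic" for amdgcn, "r600" otherwise).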
LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  if (TT.getArch() == Triple::amdgcn)
    return "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
  if (CM)
    return *CM;
  return CodeModel::Small;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

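// Per-function "target-cpu" and "target-features" attributes override the
// values the TargetMachine was created with.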
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());

  return !GV.use_empty();
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;

  if (EnableAMDGPUFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createAMDGPUFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
                                         legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      if (Internalize) {
        PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createGlobalDCEPass());
      }
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

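  // As early as possible: register the AMDGPU alias analysis (when enabled),
  // rewrite selected library calls to their native variants, and run the
  // AMDGPU library-call simplifications when optimizing.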
  const auto &Opt = Options;
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
                                      legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());
  });
}

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);
}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(R600TTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

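// Subtargets are cached per (GPU, feature-string) key, so functions with
// identical target attributes share a single GCNSubtarget.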
const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

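  // Fallback machine scheduler: generic live-interval scheduling with
  // load/store clustering mutations. Both subclasses override this.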
  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph.  We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

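// GVN is stronger but more expensive than EarlyCSE, so reserve it for the
// aggressive optimization level.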
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

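// Scalar optimizations aimed at straight-line code: LICM, constant-offset
// separation from GEPs, speculative execution, and straight-line strength
// reduction, interleaved with CSE/GVN cleanups (ordering constraints are
// noted below).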
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAtomicExpandPass());

  // This must occur before inlining, as the inliner will not look through
  // bitcast calls.
  addPass(createAMDGPUFixFunctionBitcastsPass());

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.

    addPass(createAMDGPUCodeGenPreparePass());
  }

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createLowerSwitchPass());
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

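// Late R600 machine passes: structurize the machine CFG, expand special
// instructions, finalize bundles, packetize, and emit the final control flow.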
void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

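// Use the SI machine scheduler when the subtarget requests it; otherwise fall
// back to the max-occupancy GCN scheduler.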
ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableAtomicOptimizations) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(createAMDGPUAnnotateKernelFeaturesPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  addPass(createSIFixupVectorISelPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

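// Immediately after register allocation: fix up VGPR copies and optimize
// exec mask manipulation sequences, then run the generic post-RA hook.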
void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());
  addPass(createSIShrinkInstructionsPass());

  // The hazard recognizer that runs as part of the post-RA scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  //
  // FIXME: This stand-alone pass will emit individual S_NOP 0 instructions as
  // needed. It would be better for it to emit S_NOP <N> when possible.
  addPass(&PostRAHazardRecognizerID);

  addPass(&SIInsertSkipsPassID);
  addPass(createSIDebuggerInsertNopsPass());
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}