1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// The AMDGPU target machine contains all of the hardware-specific
/// information needed to emit code for R600 and SI GPUs.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUTargetMachine.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUAliasAnalysis.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUExportClustering.h"
20 #include "AMDGPUInstructionSelector.h"
21 #include "AMDGPULegalizerInfo.h"
22 #include "AMDGPUMacroFusion.h"
23 #include "AMDGPUTargetObjectFile.h"
24 #include "AMDGPUTargetTransformInfo.h"
25 #include "GCNIterativeScheduler.h"
26 #include "GCNSchedStrategy.h"
27 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
28 #include "R600MachineScheduler.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "SIMachineScheduler.h"
31 #include "TargetInfo/AMDGPUTargetInfo.h"
32 #include "llvm/Analysis/CGSCCPassManager.h"
33 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
34 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
35 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
36 #include "llvm/CodeGen/GlobalISel/Localizer.h"
37 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
38 #include "llvm/CodeGen/MIRParser/MIParser.h"
39 #include "llvm/CodeGen/Passes.h"
40 #include "llvm/CodeGen/TargetPassConfig.h"
41 #include "llvm/IR/Attributes.h"
42 #include "llvm/IR/Function.h"
43 #include "llvm/IR/LegacyPassManager.h"
44 #include "llvm/IR/PassManager.h"
45 #include "llvm/InitializePasses.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Passes/PassBuilder.h"
48 #include "llvm/Support/CommandLine.h"
49 #include "llvm/Support/Compiler.h"
50 #include "llvm/Support/TargetRegistry.h"
51 #include "llvm/Target/TargetLoweringObjectFile.h"
52 #include "llvm/Transforms/IPO.h"
53 #include "llvm/Transforms/IPO/AlwaysInliner.h"
54 #include "llvm/Transforms/IPO/GlobalDCE.h"
55 #include "llvm/Transforms/IPO/Internalize.h"
56 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
57 #include "llvm/Transforms/Scalar.h"
58 #include "llvm/Transforms/Scalar/GVN.h"
59 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
60 #include "llvm/Transforms/Utils.h"
61 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
62 #include "llvm/Transforms/Vectorize.h"
63 #include <memory>
64 
65 using namespace llvm;
66 
67 static cl::opt<bool> EnableR600StructurizeCFG(
68   "r600-ir-structurize",
69   cl::desc("Use StructurizeCFG IR pass"),
70   cl::init(true));
71 
72 static cl::opt<bool> EnableSROA(
73   "amdgpu-sroa",
74   cl::desc("Run SROA after promote alloca pass"),
75   cl::ReallyHidden,
76   cl::init(true));
77 
78 static cl::opt<bool>
79 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
80                         cl::desc("Run early if-conversion"),
81                         cl::init(false));
82 
83 static cl::opt<bool>
84 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
85             cl::desc("Run pre-RA exec mask optimizations"),
86             cl::init(true));
87 
88 static cl::opt<bool> EnableR600IfConvert(
89   "r600-if-convert",
90   cl::desc("Use if conversion pass"),
91   cl::ReallyHidden,
92   cl::init(true));
93 
// Option to disable the vectorizer for tests.
95 static cl::opt<bool> EnableLoadStoreVectorizer(
96   "amdgpu-load-store-vectorizer",
97   cl::desc("Enable load store vectorizer"),
98   cl::init(true),
99   cl::Hidden);
100 
// Option to control scalarization of global loads.
102 static cl::opt<bool> ScalarizeGlobal(
103   "amdgpu-scalarize-global-loads",
104   cl::desc("Enable global load scalarization"),
105   cl::init(true),
106   cl::Hidden);
107 
108 // Option to run internalize pass.
109 static cl::opt<bool> InternalizeSymbols(
110   "amdgpu-internalize-symbols",
111   cl::desc("Enable elimination of non-kernel functions and unused globals"),
112   cl::init(false),
113   cl::Hidden);
114 
115 // Option to inline all early.
116 static cl::opt<bool> EarlyInlineAll(
117   "amdgpu-early-inline-all",
118   cl::desc("Inline all functions early"),
119   cl::init(false),
120   cl::Hidden);
121 
122 static cl::opt<bool> EnableSDWAPeephole(
123   "amdgpu-sdwa-peephole",
124   cl::desc("Enable SDWA peepholer"),
125   cl::init(true));
126 
127 static cl::opt<bool> EnableDPPCombine(
128   "amdgpu-dpp-combine",
129   cl::desc("Enable DPP combiner"),
130   cl::init(true));
131 
132 // Enable address space based alias analysis
133 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
134   cl::desc("Enable AMDGPU Alias Analysis"),
135   cl::init(true));
136 
137 // Option to run late CFG structurizer
138 static cl::opt<bool, true> LateCFGStructurize(
139   "amdgpu-late-structurize",
140   cl::desc("Enable late CFG structurization"),
141   cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
142   cl::Hidden);
143 
144 static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
145   "amdgpu-function-calls",
146   cl::desc("Enable AMDGPU function call support"),
147   cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
148   cl::init(true),
149   cl::Hidden);
150 
151 static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
152   "amdgpu-fixed-function-abi",
153   cl::desc("Enable all implicit function arguments"),
154   cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI),
155   cl::init(false),
156   cl::Hidden);
157 
// Enable libcall simplifications.
159 static cl::opt<bool> EnableLibCallSimplify(
160   "amdgpu-simplify-libcall",
161   cl::desc("Enable amdgpu library simplifications"),
162   cl::init(true),
163   cl::Hidden);
164 
165 static cl::opt<bool> EnableLowerKernelArguments(
166   "amdgpu-ir-lower-kernel-arguments",
167   cl::desc("Lower kernel argument loads in IR pass"),
168   cl::init(true),
169   cl::Hidden);
170 
171 static cl::opt<bool> EnableRegReassign(
172   "amdgpu-reassign-regs",
173   cl::desc("Enable register reassign optimizations on gfx10+"),
174   cl::init(true),
175   cl::Hidden);
176 
177 // Enable atomic optimization
178 static cl::opt<bool> EnableAtomicOptimizations(
179   "amdgpu-atomic-optimizations",
180   cl::desc("Enable atomic optimizations"),
181   cl::init(false),
182   cl::Hidden);
183 
184 // Enable Mode register optimization
185 static cl::opt<bool> EnableSIModeRegisterPass(
186   "amdgpu-mode-register",
187   cl::desc("Enable mode register pass"),
188   cl::init(true),
189   cl::Hidden);
190 
// Option used in lit tests to prevent dead-code elimination of the inspected
// patterns.
192 static cl::opt<bool>
193 EnableDCEInRA("amdgpu-dce-in-ra",
194     cl::init(true), cl::Hidden,
195     cl::desc("Enable machine DCE inside regalloc"));
196 
197 static cl::opt<bool> EnableScalarIRPasses(
198   "amdgpu-scalar-ir-passes",
199   cl::desc("Enable scalar IR passes"),
200   cl::init(true),
201   cl::Hidden);
202 
203 static cl::opt<bool> EnableStructurizerWorkarounds(
204     "amdgpu-enable-structurizer-workarounds",
205     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
206     cl::Hidden);
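
// Note: the cl::opt flags above are ordinary LLVM command-line options. They
// can be passed directly to tools such as llc, e.g. the hypothetical
// invocation "llc -mtriple=amdgcn -amdgpu-sdwa-peephole=0 ...", or forwarded
// from clang via -mllvm.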
207 
208 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
209   // Register the target
210   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
211   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
212 
213   PassRegistry *PR = PassRegistry::getPassRegistry();
214   initializeR600ClauseMergePassPass(*PR);
215   initializeR600ControlFlowFinalizerPass(*PR);
216   initializeR600PacketizerPass(*PR);
217   initializeR600ExpandSpecialInstrsPassPass(*PR);
218   initializeR600VectorRegMergerPass(*PR);
219   initializeGlobalISel(*PR);
220   initializeAMDGPUDAGToDAGISelPass(*PR);
221   initializeGCNDPPCombinePass(*PR);
222   initializeSILowerI1CopiesPass(*PR);
223   initializeSILowerSGPRSpillsPass(*PR);
224   initializeSIFixSGPRCopiesPass(*PR);
225   initializeSIFixVGPRCopiesPass(*PR);
226   initializeSIFoldOperandsPass(*PR);
227   initializeSIPeepholeSDWAPass(*PR);
228   initializeSIShrinkInstructionsPass(*PR);
229   initializeSIOptimizeExecMaskingPreRAPass(*PR);
230   initializeSILoadStoreOptimizerPass(*PR);
231   initializeAMDGPUFixFunctionBitcastsPass(*PR);
232   initializeAMDGPUAlwaysInlinePass(*PR);
233   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
234   initializeAMDGPUAnnotateUniformValuesPass(*PR);
235   initializeAMDGPUArgumentUsageInfoPass(*PR);
236   initializeAMDGPUAtomicOptimizerPass(*PR);
237   initializeAMDGPULowerKernelArgumentsPass(*PR);
238   initializeAMDGPULowerKernelAttributesPass(*PR);
239   initializeAMDGPULowerIntrinsicsPass(*PR);
240   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
241   initializeAMDGPUPostLegalizerCombinerPass(*PR);
242   initializeAMDGPUPreLegalizerCombinerPass(*PR);
243   initializeAMDGPUPromoteAllocaPass(*PR);
244   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
245   initializeAMDGPUCodeGenPreparePass(*PR);
246   initializeAMDGPULateCodeGenPreparePass(*PR);
247   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
248   initializeAMDGPUPropagateAttributesLatePass(*PR);
249   initializeAMDGPURewriteOutArgumentsPass(*PR);
250   initializeAMDGPUUnifyMetadataPass(*PR);
251   initializeSIAnnotateControlFlowPass(*PR);
252   initializeSIInsertHardClausesPass(*PR);
253   initializeSIInsertWaitcntsPass(*PR);
254   initializeSIModeRegisterPass(*PR);
255   initializeSIWholeQuadModePass(*PR);
256   initializeSILowerControlFlowPass(*PR);
257   initializeSIRemoveShortExecBranchesPass(*PR);
258   initializeSIPreEmitPeepholePass(*PR);
259   initializeSIInsertSkipsPass(*PR);
260   initializeSIMemoryLegalizerPass(*PR);
261   initializeSIOptimizeExecMaskingPass(*PR);
262   initializeSIPreAllocateWWMRegsPass(*PR);
263   initializeSIFormMemoryClausesPass(*PR);
264   initializeSIPostRABundlerPass(*PR);
265   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
266   initializeAMDGPUAAWrapperPassPass(*PR);
267   initializeAMDGPUExternalAAWrapperPass(*PR);
268   initializeAMDGPUUseNativeCallsPass(*PR);
269   initializeAMDGPUSimplifyLibCallsPass(*PR);
270   initializeAMDGPUInlinerPass(*PR);
271   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
272   initializeGCNRegBankReassignPass(*PR);
273   initializeGCNNSAReassignPass(*PR);
274   initializeSIAddIMGInitPass(*PR);
275 }
276 
277 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
278   return std::make_unique<AMDGPUTargetObjectFile>();
279 }
280 
281 static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
282   return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
283 }
284 
285 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
286   return new SIScheduleDAGMI(C);
287 }
288 
289 static ScheduleDAGInstrs *
290 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
291   ScheduleDAGMILive *DAG =
292     new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
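  // The mutations below are scheduling tweaks: load clustering keeps
  // clusterable memory operations adjacent, macro fusion keeps certain
  // dependent instruction pairs back to back, and export clustering groups
  // export instructions together.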
293   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
294   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
295   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
296   return DAG;
297 }
298 
299 static ScheduleDAGInstrs *
300 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
301   auto DAG = new GCNIterativeScheduler(C,
302     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
303   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
304   return DAG;
305 }
306 
307 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
308   return new GCNIterativeScheduler(C,
309     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
310 }
311 
312 static ScheduleDAGInstrs *
313 createIterativeILPMachineScheduler(MachineSchedContext *C) {
314   auto DAG = new GCNIterativeScheduler(C,
315     GCNIterativeScheduler::SCHEDULE_ILP);
316   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
317   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
318   return DAG;
319 }
320 
321 static MachineSchedRegistry
322 R600SchedRegistry("r600", "Run R600's custom scheduler",
323                    createR600MachineScheduler);
324 
325 static MachineSchedRegistry
326 SISchedRegistry("si", "Run SI's custom scheduler",
327                 createSIMachineScheduler);
328 
329 static MachineSchedRegistry
330 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
331                              "Run GCN scheduler to maximize occupancy",
332                              createGCNMaxOccupancyMachineScheduler);
333 
334 static MachineSchedRegistry
335 IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
336   "Run GCN scheduler to maximize occupancy (experimental)",
337   createIterativeGCNMaxOccupancyMachineScheduler);
338 
339 static MachineSchedRegistry
340 GCNMinRegSchedRegistry("gcn-minreg",
341   "Run GCN iterative scheduler for minimal register usage (experimental)",
342   createMinRegScheduler);
343 
344 static MachineSchedRegistry
345 GCNILPSchedRegistry("gcn-ilp",
346   "Run GCN iterative scheduler for ILP scheduling (experimental)",
347   createIterativeILPMachineScheduler);
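
// Note: MachineSchedRegistry entries are selectable with the standard
// -misched= option, e.g. the illustrative invocation
// "llc ... -misched=gcn-ilp".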
348 
349 static StringRef computeDataLayout(const Triple &TT) {
350   if (TT.getArch() == Triple::r600) {
351     // 32-bit pointers.
352     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
353            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
354   }
355 
  // 32-bit private, local, and region pointers. 64-bit global, constant, and
  // flat pointers. Buffer fat pointers (address space 7) are non-integral.
358   return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
359          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
360          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
361          "-ni:7";
362 }
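
// A rough guide to the data layout string above: "e" means little-endian,
// "p<AS>:<size>:<abi>" gives pointer size and ABI alignment for an address
// space, "A5" makes address space 5 (private) the alloca address space, "G1"
// makes address space 1 (global) the default address space for globals, and
// "ni:7" marks address space 7 (buffer fat pointers) as non-integral.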
363 
364 LLVM_READNONE
365 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
366   if (!GPU.empty())
367     return GPU;
368 
369   // Need to default to a target with flat support for HSA.
370   if (TT.getArch() == Triple::amdgcn)
371     return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
372 
373   return "r600";
374 }
375 
376 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
377   // The AMDGPU toolchain only supports generating shared objects, so we
378   // must always use PIC.
379   return Reloc::PIC_;
380 }
381 
382 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
383                                          StringRef CPU, StringRef FS,
384                                          TargetOptions Options,
385                                          Optional<Reloc::Model> RM,
386                                          Optional<CodeModel::Model> CM,
387                                          CodeGenOpt::Level OptLevel)
388     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
389                         FS, Options, getEffectiveRelocModel(RM),
390                         getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
391       TLOF(createTLOF(getTargetTriple())) {
392   initAsmInfo();
393   if (TT.getArch() == Triple::amdgcn) {
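    // Select the DWARF register numbering (wave64 vs. wave32 flavour) based
    // on the subtarget's wavefront size.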
394     if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
395       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
396     else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
397       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
398   }
399 }
400 
401 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
402 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
403 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
404 
405 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
406 
407 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
408   Attribute GPUAttr = F.getFnAttribute("target-cpu");
409   return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
410 }
411 
412 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
413   Attribute FSAttr = F.getFnAttribute("target-features");
414 
415   return FSAttr.isValid() ? FSAttr.getValueAsString()
416                           : getTargetFeatureString();
417 }
418 
419 /// Predicate for Internalize pass.
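/// Returning true preserves the global: function declarations and kernel
/// entry points are always kept, while non-function globals are kept only if
/// they still have uses.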
420 static bool mustPreserveGV(const GlobalValue &GV) {
421   if (const Function *F = dyn_cast<Function>(&GV))
422     return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
423 
424   return !GV.use_empty();
425 }
426 
427 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
428   Builder.DivergentTarget = true;
429 
430   bool EnableOpt = getOptLevel() > CodeGenOpt::None;
431   bool Internalize = InternalizeSymbols;
432   bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
433   bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
434   bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
435 
436   if (EnableFunctionCalls) {
437     delete Builder.Inliner;
438     Builder.Inliner = createAMDGPUFunctionInliningPass();
439   }
440 
441   Builder.addExtension(
442     PassManagerBuilder::EP_ModuleOptimizerEarly,
443     [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
444                                                legacy::PassManagerBase &PM) {
445       if (AMDGPUAA) {
446         PM.add(createAMDGPUAAWrapperPass());
447         PM.add(createAMDGPUExternalAAWrapperPass());
448       }
449       PM.add(createAMDGPUUnifyMetadataPass());
450       PM.add(createAMDGPUPrintfRuntimeBinding());
451       if (Internalize)
452         PM.add(createInternalizePass(mustPreserveGV));
453       PM.add(createAMDGPUPropagateAttributesLatePass(this));
454       if (Internalize)
455         PM.add(createGlobalDCEPass());
456       if (EarlyInline)
457         PM.add(createAMDGPUAlwaysInlinePass(false));
458   });
459 
460   Builder.addExtension(
461     PassManagerBuilder::EP_EarlyAsPossible,
462     [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
463                                       legacy::PassManagerBase &PM) {
464       if (AMDGPUAA) {
465         PM.add(createAMDGPUAAWrapperPass());
466         PM.add(createAMDGPUExternalAAWrapperPass());
467       }
468       PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
469       PM.add(llvm::createAMDGPUUseNativeCallsPass());
470       if (LibCallSimplify)
471         PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
472   });
473 
474   Builder.addExtension(
475     PassManagerBuilder::EP_CGSCCOptimizerLate,
476     [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
477       // Add infer address spaces pass to the opt pipeline after inlining
478       // but before SROA to increase SROA opportunities.
479       PM.add(createInferAddressSpacesPass());
480 
481       // This should run after inlining to have any chance of doing anything,
482       // and before other cleanup optimizations.
483       PM.add(createAMDGPULowerKernelAttributesPass());
484 
485       // Promote alloca to vector before SROA and loop unroll. If we manage
486       // to eliminate allocas before unroll we may choose to unroll less.
487       if (EnableOpt)
488         PM.add(createAMDGPUPromoteAllocaToVector());
489   });
490 }
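
// The extensions above hook into the legacy pass manager via
// PassManagerBuilder; the corresponding new pass manager hooks are registered
// in registerPassBuilderCallbacks() below.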
491 
492 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
493   AAM.registerFunctionAnalysis<AMDGPUAA>();
494 }
495 
496 void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
497                                                        bool DebugPassManager) {
498   PB.registerPipelineParsingCallback(
499       [this](StringRef PassName, ModulePassManager &PM,
500              ArrayRef<PassBuilder::PipelineElement>) {
501         if (PassName == "amdgpu-propagate-attributes-late") {
502           PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
503           return true;
504         }
505         if (PassName == "amdgpu-unify-metadata") {
506           PM.addPass(AMDGPUUnifyMetadataPass());
507           return true;
508         }
509         if (PassName == "amdgpu-printf-runtime-binding") {
510           PM.addPass(AMDGPUPrintfRuntimeBindingPass());
511           return true;
512         }
513         if (PassName == "amdgpu-always-inline") {
514           PM.addPass(AMDGPUAlwaysInlinePass());
515           return true;
516         }
517         return false;
518       });
519   PB.registerPipelineParsingCallback(
520       [this](StringRef PassName, FunctionPassManager &PM,
521              ArrayRef<PassBuilder::PipelineElement>) {
522         if (PassName == "amdgpu-simplifylib") {
523           PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
524           return true;
525         }
526         if (PassName == "amdgpu-usenative") {
527           PM.addPass(AMDGPUUseNativeCallsPass());
528           return true;
529         }
530         if (PassName == "amdgpu-promote-alloca") {
531           PM.addPass(AMDGPUPromoteAllocaPass(*this));
532           return true;
533         }
534         if (PassName == "amdgpu-promote-alloca-to-vector") {
535           PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
536           return true;
537         }
538         if (PassName == "amdgpu-lower-kernel-attributes") {
539           PM.addPass(AMDGPULowerKernelAttributesPass());
540           return true;
541         }
542         if (PassName == "amdgpu-propagate-attributes-early") {
543           PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
544           return true;
545         }
546 
547         return false;
548       });
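
  // The names registered above make these passes invocable by name through
  // the new pass manager, e.g. the illustrative
  // "opt -passes=amdgpu-usenative".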
549 
550   PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
551     FAM.registerPass([&] { return AMDGPUAA(); });
552   });
553 
554   PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
555     if (AAName == "amdgpu-aa") {
556       AAM.registerFunctionAnalysis<AMDGPUAA>();
557       return true;
558     }
559     return false;
560   });
561 
562   PB.registerPipelineStartEPCallback([this, DebugPassManager](
563                                          ModulePassManager &PM,
564                                          PassBuilder::OptimizationLevel Level) {
565     FunctionPassManager FPM(DebugPassManager);
566     FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
567     FPM.addPass(AMDGPUUseNativeCallsPass());
568     if (EnableLibCallSimplify && Level != PassBuilder::OptimizationLevel::O0)
569       FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
570     PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
571   });
572 
573   PB.registerPipelineEarlySimplificationEPCallback(
574       [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
575         if (Level == PassBuilder::OptimizationLevel::O0)
576           return;
577 
578         PM.addPass(AMDGPUUnifyMetadataPass());
579         PM.addPass(AMDGPUPrintfRuntimeBindingPass());
580 
581         if (InternalizeSymbols) {
582           PM.addPass(InternalizePass(mustPreserveGV));
583         }
584         PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
585         if (InternalizeSymbols) {
586           PM.addPass(GlobalDCEPass());
587         }
588         if (EarlyInlineAll && !EnableFunctionCalls)
589           PM.addPass(AMDGPUAlwaysInlinePass());
590       });
591 
592   PB.registerCGSCCOptimizerLateEPCallback(
593       [this, DebugPassManager](CGSCCPassManager &PM,
594                                PassBuilder::OptimizationLevel Level) {
595           FunctionPassManager FPM(DebugPassManager);
596 
597           // Add infer address spaces pass to the opt pipeline after inlining
598           // but before SROA to increase SROA opportunities.
599           FPM.addPass(InferAddressSpacesPass());
600 
601           // This should run after inlining to have any chance of doing
602           // anything, and before other cleanup optimizations.
603           FPM.addPass(AMDGPULowerKernelAttributesPass());
604 
605           if (Level != PassBuilder::OptimizationLevel::O0) {
606             // Promote alloca to vector before SROA and loop unroll. If we
607             // manage to eliminate allocas before unroll we may choose to unroll
608             // less.
609             FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
610           }
611 
612           PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
613       });
614 }
615 
616 //===----------------------------------------------------------------------===//
617 // R600 Target Machine (R600 -> Cayman)
618 //===----------------------------------------------------------------------===//
619 
620 R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
621                                      StringRef CPU, StringRef FS,
622                                      TargetOptions Options,
623                                      Optional<Reloc::Model> RM,
624                                      Optional<CodeModel::Model> CM,
625                                      CodeGenOpt::Level OL, bool JIT)
626     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
627   setRequiresStructuredCFG(true);
628 
629   // Override the default since calls aren't supported for r600.
630   if (EnableFunctionCalls &&
631       EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
632     EnableFunctionCalls = false;
633 }
634 
635 const R600Subtarget *R600TargetMachine::getSubtargetImpl(
636   const Function &F) const {
637   StringRef GPU = getGPUName(F);
638   StringRef FS = getFeatureString(F);
639 
640   SmallString<128> SubtargetKey(GPU);
641   SubtargetKey.append(FS);
642 
643   auto &I = SubtargetMap[SubtargetKey];
644   if (!I) {
645     // This needs to be done before we create a new subtarget since any
646     // creation will depend on the TM and the code generation flags on the
647     // function that reside in TargetOptions.
648     resetTargetOptions(F);
649     I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
650   }
651 
652   return I.get();
653 }
654 
655 bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
656                                               unsigned DestAS) const {
657   return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
658          AMDGPU::isFlatGlobalAddrSpace(DestAS);
659 }
660 
661 unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
662   const auto *LD = dyn_cast<LoadInst>(V);
663   if (!LD)
664     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
665 
  // The loaded value must be a generic (flat) pointer.
667   assert(V->getType()->isPointerTy() &&
668          V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
669 
670   const auto *Ptr = LD->getPointerOperand();
671   if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
672     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // A generic pointer loaded from constant memory can be assumed to be a
  // global pointer, since constant memory is only populated from the host
  // side and, as implied by the offload programming model, only global
  // pointers can be referenced on the host side.
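  // Illustrative IR (hypothetical):
  //   %p = load i8*, i8* addrspace(4)* %arg
  // Here the loaded flat pointer %p may be treated as an addrspace(1) global
  // pointer by consumers of this hook such as InferAddressSpaces.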
677   return AMDGPUAS::GLOBAL_ADDRESS;
678 }
679 
680 TargetTransformInfo
681 R600TargetMachine::getTargetTransformInfo(const Function &F) {
682   return TargetTransformInfo(R600TTIImpl(this, F));
683 }
684 
685 //===----------------------------------------------------------------------===//
686 // GCN Target Machine (SI+)
687 //===----------------------------------------------------------------------===//
688 
689 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
690                                    StringRef CPU, StringRef FS,
691                                    TargetOptions Options,
692                                    Optional<Reloc::Model> RM,
693                                    Optional<CodeModel::Model> CM,
694                                    CodeGenOpt::Level OL, bool JIT)
695     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
696 
697 const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
698   StringRef GPU = getGPUName(F);
699   StringRef FS = getFeatureString(F);
700 
701   SmallString<128> SubtargetKey(GPU);
702   SubtargetKey.append(FS);
703 
704   auto &I = SubtargetMap[SubtargetKey];
705   if (!I) {
706     // This needs to be done before we create a new subtarget since any
707     // creation will depend on the TM and the code generation flags on the
708     // function that reside in TargetOptions.
709     resetTargetOptions(F);
710     I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
711   }
712 
713   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
714 
715   return I.get();
716 }
717 
718 TargetTransformInfo
719 GCNTargetMachine::getTargetTransformInfo(const Function &F) {
720   return TargetTransformInfo(GCNTTIImpl(this, F));
721 }
722 
723 //===----------------------------------------------------------------------===//
724 // AMDGPU Pass Setup
725 //===----------------------------------------------------------------------===//
726 
727 namespace {
728 
729 class AMDGPUPassConfig : public TargetPassConfig {
730 public:
731   AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
732     : TargetPassConfig(TM, PM) {
733     // Exceptions and StackMaps are not supported, so these passes will never do
734     // anything.
735     disablePass(&StackMapLivenessID);
736     disablePass(&FuncletLayoutID);
737   }
738 
739   AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
740     return getTM<AMDGPUTargetMachine>();
741   }
742 
743   ScheduleDAGInstrs *
744   createMachineScheduler(MachineSchedContext *C) const override {
745     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
746     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
747     return DAG;
748   }
749 
750   void addEarlyCSEOrGVNPass();
751   void addStraightLineScalarOptimizationPasses();
752   void addIRPasses() override;
753   void addCodeGenPrepare() override;
754   bool addPreISel() override;
755   bool addInstSelector() override;
756   bool addGCPasses() override;
757 
758   std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
759 };
760 
761 std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
762   return getStandardCSEConfigForOpt(TM->getOptLevel());
763 }
764 
765 class R600PassConfig final : public AMDGPUPassConfig {
766 public:
767   R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
768     : AMDGPUPassConfig(TM, PM) {}
769 
770   ScheduleDAGInstrs *createMachineScheduler(
771     MachineSchedContext *C) const override {
772     return createR600MachineScheduler(C);
773   }
774 
775   bool addPreISel() override;
776   bool addInstSelector() override;
777   void addPreRegAlloc() override;
778   void addPreSched2() override;
779   void addPreEmitPass() override;
780 };
781 
782 class GCNPassConfig final : public AMDGPUPassConfig {
783 public:
784   GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
785     : AMDGPUPassConfig(TM, PM) {
786     // It is necessary to know the register usage of the entire call graph.  We
787     // allow calls without EnableAMDGPUFunctionCalls if they are marked
788     // noinline, so this is always required.
789     setRequiresCodeGenSCCOrder(true);
790   }
791 
792   GCNTargetMachine &getGCNTargetMachine() const {
793     return getTM<GCNTargetMachine>();
794   }
795 
796   ScheduleDAGInstrs *
797   createMachineScheduler(MachineSchedContext *C) const override;
798 
799   bool addPreISel() override;
800   void addMachineSSAOptimization() override;
801   bool addILPOpts() override;
802   bool addInstSelector() override;
803   bool addIRTranslator() override;
804   void addPreLegalizeMachineIR() override;
805   bool addLegalizeMachineIR() override;
806   void addPreRegBankSelect() override;
807   bool addRegBankSelect() override;
808   bool addGlobalInstructionSelect() override;
809   void addFastRegAlloc() override;
810   void addOptimizedRegAlloc() override;
811   void addPreRegAlloc() override;
812   bool addPreRewrite() override;
813   void addPostRegAlloc() override;
814   void addPreSched2() override;
815   void addPreEmitPass() override;
816 };
817 
818 } // end anonymous namespace
819 
820 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
821   if (getOptLevel() == CodeGenOpt::Aggressive)
822     addPass(createGVNPass());
823   else
824     addPass(createEarlyCSEPass());
825 }
826 
827 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
828   addPass(createLICMPass());
829   addPass(createSeparateConstOffsetFromGEPPass());
830   addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
835   // EarlyCSE can reuse.
836   addEarlyCSEOrGVNPass();
837   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
838   addPass(createNaryReassociatePass());
839   // NaryReassociate on GEPs creates redundant common expressions, so run
840   // EarlyCSE after it.
841   addPass(createEarlyCSEPass());
842 }
843 
844 void AMDGPUPassConfig::addIRPasses() {
845   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
846 
847   // There is no reason to run these.
848   disablePass(&StackMapLivenessID);
849   disablePass(&FuncletLayoutID);
850   disablePass(&PatchableFunctionID);
851 
852   addPass(createAMDGPUPrintfRuntimeBinding());
853 
854   // This must occur before inlining, as the inliner will not look through
855   // bitcast calls.
856   addPass(createAMDGPUFixFunctionBitcastsPass());
857 
  // Run the propagate-attributes pass in the backend in case opt was not run.
859   addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
860 
861   addPass(createAtomicExpandPass());
862 
  addPass(createAMDGPULowerIntrinsicsPass());
865 
866   // Function calls are not supported, so make sure we inline everything.
867   addPass(createAMDGPUAlwaysInlinePass());
868   addPass(createAlwaysInlinerLegacyPass());
869   // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means that if we have a module with two
  // functions, then we will generate code for the first function
873   // without ever running any passes on the second.
874   addPass(createBarrierNoopPass());
875 
876   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
877   if (TM.getTargetTriple().getArch() == Triple::r600)
878     addPass(createR600OpenCLImageTypeLoweringPass());
879 
880   // Replace OpenCL enqueued block function pointers with global variables.
881   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
882 
883   if (TM.getOptLevel() > CodeGenOpt::None) {
884     addPass(createInferAddressSpacesPass());
885     addPass(createAMDGPUPromoteAlloca());
886 
887     if (EnableSROA)
888       addPass(createSROAPass());
889 
890     if (EnableScalarIRPasses)
891       addStraightLineScalarOptimizationPasses();
892 
893     if (EnableAMDGPUAliasAnalysis) {
894       addPass(createAMDGPUAAWrapperPass());
895       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
896                                              AAResults &AAR) {
897         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
898           AAR.addAAResult(WrapperPass->getResult());
899         }));
900     }
901   }
902 
903   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
904     // TODO: May want to move later or split into an early and late one.
905     addPass(createAMDGPUCodeGenPreparePass());
906   }
907 
908   TargetPassConfig::addIRPasses();
909 
910   // EarlyCSE is not always strong enough to clean up what LSR produces. For
911   // example, GVN can combine
912   //
913   //   %0 = add %a, %b
914   //   %1 = add %b, %a
915   //
916   // and
917   //
918   //   %0 = shl nsw %a, 2
919   //   %1 = shl %a, 2
920   //
921   // but EarlyCSE can do neither of them.
922   if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
923     addEarlyCSEOrGVNPass();
924 }
925 
926 void AMDGPUPassConfig::addCodeGenPrepare() {
927   if (TM->getTargetTriple().getArch() == Triple::amdgcn)
928     addPass(createAMDGPUAnnotateKernelFeaturesPass());
929 
930   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
931       EnableLowerKernelArguments)
932     addPass(createAMDGPULowerKernelArgumentsPass());
933 
934   addPass(&AMDGPUPerfHintAnalysisID);
935 
936   TargetPassConfig::addCodeGenPrepare();
937 
938   if (EnableLoadStoreVectorizer)
939     addPass(createLoadStoreVectorizerPass());
940 
  // The LowerSwitch pass may introduce unreachable blocks that can
  // cause unexpected behavior for subsequent passes. Placing it
  // here is better, as these blocks will get cleaned up by
  // UnreachableBlockElim, which is inserted next in the pass flow.
945   addPass(createLowerSwitchPass());
946 }
947 
948 bool AMDGPUPassConfig::addPreISel() {
949   addPass(createFlattenCFGPass());
950   return false;
951 }
952 
953 bool AMDGPUPassConfig::addInstSelector() {
954   // Defer the verifier until FinalizeISel.
955   addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
956   return false;
957 }
958 
959 bool AMDGPUPassConfig::addGCPasses() {
960   // Do nothing. GC is not supported.
961   return false;
962 }
963 
964 //===----------------------------------------------------------------------===//
965 // R600 Pass Setup
966 //===----------------------------------------------------------------------===//
967 
968 bool R600PassConfig::addPreISel() {
969   AMDGPUPassConfig::addPreISel();
970 
971   if (EnableR600StructurizeCFG)
972     addPass(createStructurizeCFGPass());
973   return false;
974 }
975 
976 bool R600PassConfig::addInstSelector() {
977   addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
978   return false;
979 }
980 
981 void R600PassConfig::addPreRegAlloc() {
982   addPass(createR600VectorRegMerger());
983 }
984 
985 void R600PassConfig::addPreSched2() {
986   addPass(createR600EmitClauseMarkers(), false);
987   if (EnableR600IfConvert)
988     addPass(&IfConverterID, false);
989   addPass(createR600ClauseMergePass(), false);
990 }
991 
992 void R600PassConfig::addPreEmitPass() {
993   addPass(createAMDGPUCFGStructurizerPass(), false);
994   addPass(createR600ExpandSpecialInstrsPass(), false);
995   addPass(&FinalizeMachineBundlesID, false);
996   addPass(createR600Packetizer(), false);
997   addPass(createR600ControlFlowFinalizer(), false);
998 }
999 
1000 TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
1001   return new R600PassConfig(*this, PM);
1002 }
1003 
1004 //===----------------------------------------------------------------------===//
1005 // GCN Pass Setup
1006 //===----------------------------------------------------------------------===//
1007 
1008 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1009   MachineSchedContext *C) const {
1010   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1011   if (ST.enableSIScheduler())
1012     return createSIMachineScheduler(C);
1013   return createGCNMaxOccupancyMachineScheduler(C);
1014 }
1015 
1016 bool GCNPassConfig::addPreISel() {
1017   AMDGPUPassConfig::addPreISel();
1018 
1019   addPass(createAMDGPULateCodeGenPreparePass());
1020   if (EnableAtomicOptimizations) {
1021     addPass(createAMDGPUAtomicOptimizerPass());
1022   }
1023 
1024   // FIXME: We need to run a pass to propagate the attributes when calls are
1025   // supported.
1026 
1027   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1028   // regions formed by them.
1029   addPass(&AMDGPUUnifyDivergentExitNodesID);
1030   if (!LateCFGStructurize) {
1031     if (EnableStructurizerWorkarounds) {
1032       addPass(createFixIrreduciblePass());
1033       addPass(createUnifyLoopExitsPass());
1034     }
1035     addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1036   }
1037   addPass(createSinkingPass());
1038   addPass(createAMDGPUAnnotateUniformValues());
1039   if (!LateCFGStructurize) {
1040     addPass(createSIAnnotateControlFlowPass());
1041   }
1042   addPass(createLCSSAPass());
1043 
1044   return false;
1045 }
1046 
1047 void GCNPassConfig::addMachineSSAOptimization() {
1048   TargetPassConfig::addMachineSSAOptimization();
1049 
1050   // We want to fold operands after PeepholeOptimizer has run (or as part of
1051   // it), because it will eliminate extra copies making it easier to fold the
1052   // real source operand. We want to eliminate dead instructions after, so that
1053   // we see fewer uses of the copies. We then need to clean up the dead
1054   // instructions leftover after the operands are folded as well.
1055   //
1056   // XXX - Can we get away without running DeadMachineInstructionElim again?
1057   addPass(&SIFoldOperandsID);
1058   if (EnableDPPCombine)
1059     addPass(&GCNDPPCombineID);
1060   addPass(&DeadMachineInstructionElimID);
1061   addPass(&SILoadStoreOptimizerID);
1062   if (EnableSDWAPeephole) {
1063     addPass(&SIPeepholeSDWAID);
1064     addPass(&EarlyMachineLICMID);
1065     addPass(&MachineCSEID);
1066     addPass(&SIFoldOperandsID);
1067     addPass(&DeadMachineInstructionElimID);
1068   }
1069   addPass(createSIShrinkInstructionsPass());
1070 }
1071 
1072 bool GCNPassConfig::addILPOpts() {
1073   if (EnableEarlyIfConversion)
1074     addPass(&EarlyIfConverterID);
1075 
1076   TargetPassConfig::addILPOpts();
1077   return false;
1078 }
1079 
1080 bool GCNPassConfig::addInstSelector() {
1081   AMDGPUPassConfig::addInstSelector();
1082   addPass(&SIFixSGPRCopiesID);
1083   addPass(createSILowerI1CopiesPass());
1084   addPass(createSIAddIMGInitPass());
1085   return false;
1086 }
1087 
1088 bool GCNPassConfig::addIRTranslator() {
1089   addPass(new IRTranslator(getOptLevel()));
1090   return false;
1091 }
1092 
1093 void GCNPassConfig::addPreLegalizeMachineIR() {
1094   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1095   addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1096   addPass(new Localizer());
1097 }
1098 
1099 bool GCNPassConfig::addLegalizeMachineIR() {
1100   addPass(new Legalizer());
1101   return false;
1102 }
1103 
1104 void GCNPassConfig::addPreRegBankSelect() {
1105   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1106   addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1107 }
1108 
1109 bool GCNPassConfig::addRegBankSelect() {
1110   addPass(new RegBankSelect());
1111   return false;
1112 }
1113 
1114 bool GCNPassConfig::addGlobalInstructionSelect() {
1115   addPass(new InstructionSelect());
1116   return false;
1117 }
1118 
1119 void GCNPassConfig::addPreRegAlloc() {
1120   if (LateCFGStructurize) {
1121     addPass(createAMDGPUMachineCFGStructurizerPass());
1122   }
1123 }
1124 
1125 void GCNPassConfig::addFastRegAlloc() {
1126   // FIXME: We have to disable the verifier here because of PHIElimination +
1127   // TwoAddressInstructions disabling it.
1128 
1129   // This must be run immediately after phi elimination and before
1130   // TwoAddressInstructions, otherwise the processing of the tied operand of
1131   // SI_ELSE will introduce a copy of the tied operand source after the else.
1132   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
1133 
1134   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1135   insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
1136 
1137   TargetPassConfig::addFastRegAlloc();
1138 }
1139 
1140 void GCNPassConfig::addOptimizedRegAlloc() {
1141   // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1142   // instructions that cause scheduling barriers.
1143   insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1144   insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
1145 
1146   if (OptExecMaskPreRA)
1147     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1148   insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1149 
1150   // This must be run immediately after phi elimination and before
1151   // TwoAddressInstructions, otherwise the processing of the tied operand of
1152   // SI_ELSE will introduce a copy of the tied operand source after the else.
1153   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
1154 
1155   if (EnableDCEInRA)
1156     insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1157 
1158   TargetPassConfig::addOptimizedRegAlloc();
1159 }
1160 
1161 bool GCNPassConfig::addPreRewrite() {
1162   if (EnableRegReassign) {
1163     addPass(&GCNNSAReassignID);
1164     addPass(&GCNRegBankReassignID);
1165   }
1166   return true;
1167 }
1168 
1169 void GCNPassConfig::addPostRegAlloc() {
1170   addPass(&SIFixVGPRCopiesID);
1171   if (getOptLevel() > CodeGenOpt::None)
1172     addPass(&SIOptimizeExecMaskingID);
1173   TargetPassConfig::addPostRegAlloc();
1174 
1175   // Equivalent of PEI for SGPRs.
1176   addPass(&SILowerSGPRSpillsID);
1177 }
1178 
1179 void GCNPassConfig::addPreSched2() {
1180   addPass(&SIPostRABundlerID);
1181 }
1182 
1183 void GCNPassConfig::addPreEmitPass() {
1184   addPass(createSIMemoryLegalizerPass());
1185   addPass(createSIInsertWaitcntsPass());
1186   addPass(createSIShrinkInstructionsPass());
1187   addPass(createSIModeRegisterPass());
1188 
1189   if (getOptLevel() > CodeGenOpt::None)
1190     addPass(&SIInsertHardClausesID);
1191 
1192   addPass(&SIRemoveShortExecBranchesID);
1193   addPass(&SIInsertSkipsPassID);
1194   addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-RA scheduler is not
  // guaranteed to handle all hazards correctly. This is because if there
1197   // are multiple scheduling regions in a basic block, the regions are scheduled
1198   // bottom up, so when we begin to schedule a region we don't know what
1199   // instructions were emitted directly before it.
1200   //
1201   // Here we add a stand-alone hazard recognizer pass which can handle all
1202   // cases.
1203   addPass(&PostRAHazardRecognizerID);
1204   addPass(&BranchRelaxationPassID);
1205 }
1206 
1207 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1208   return new GCNPassConfig(*this, PM);
1209 }
1210 
1211 yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1212   return new yaml::SIMachineFunctionInfo();
1213 }
1214 
1215 yaml::MachineFunctionInfo *
1216 GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1217   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1218   return new yaml::SIMachineFunctionInfo(*MFI,
1219                                          *MF.getSubtarget().getRegisterInfo());
1220 }
1221 
1222 bool GCNTargetMachine::parseMachineFunctionInfo(
1223     const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1224     SMDiagnostic &Error, SMRange &SourceRange) const {
1225   const yaml::SIMachineFunctionInfo &YamlMFI =
1226       reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1227   MachineFunction &MF = PFS.MF;
1228   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1229 
1230   MFI->initializeBaseYamlFields(YamlMFI);
1231 
1232   auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
1233     Register TempReg;
1234     if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
1235       SourceRange = RegName.SourceRange;
1236       return true;
1237     }
1238     RegVal = TempReg;
1239 
1240     return false;
1241   };
1242 
1243   auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
1245     const MemoryBuffer &Buffer =
1246         *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1247     Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1248                          RegName.Value.size(), SourceMgr::DK_Error,
1249                          "incorrect register class for field", RegName.Value,
1250                          None, None);
1251     SourceRange = RegName.SourceRange;
1252     return true;
1253   };
1254 
1255   if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1256       parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1257       parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1258     return true;
1259 
1260   if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1261       !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1262     return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1263   }
1264 
1265   if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1266       !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1267     return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1268   }
1269 
1270   if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1271       !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1272     return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1273   }
1274 
1275   auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
1276                                    const TargetRegisterClass &RC,
1277                                    ArgDescriptor &Arg, unsigned UserSGPRs,
1278                                    unsigned SystemSGPRs) {
1279     // Skip parsing if it's not present.
1280     if (!A)
1281       return false;
1282 
1283     if (A->IsRegister) {
1284       Register Reg;
1285       if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1286         SourceRange = A->RegisterName.SourceRange;
1287         return true;
1288       }
1289       if (!RC.contains(Reg))
1290         return diagnoseRegisterClass(A->RegisterName);
1291       Arg = ArgDescriptor::createRegister(Reg);
1292     } else
1293       Arg = ArgDescriptor::createStack(A->StackOffset);
1294     // Check and apply the optional mask.
1295     if (A->Mask)
1296       Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());
1297 
1298     MFI->NumUserSGPRs += UserSGPRs;
1299     MFI->NumSystemSGPRs += SystemSGPRs;
1300     return false;
1301   };
1302 
1303   if (YamlMFI.ArgInfo &&
1304       (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1305                              AMDGPU::SGPR_128RegClass,
1306                              MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1307        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1308                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1309                              2, 0) ||
1310        parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1311                              MFI->ArgInfo.QueuePtr, 2, 0) ||
1312        parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1313                              AMDGPU::SReg_64RegClass,
1314                              MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1315        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1316                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1317                              2, 0) ||
1318        parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1319                              AMDGPU::SReg_64RegClass,
1320                              MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1321        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1322                              AMDGPU::SGPR_32RegClass,
1323                              MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1324        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1325                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1326                              0, 1) ||
1327        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1328                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1329                              0, 1) ||
1330        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1331                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1332                              0, 1) ||
1333        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1334                              AMDGPU::SGPR_32RegClass,
1335                              MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1336        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1337                              AMDGPU::SGPR_32RegClass,
1338                              MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1339        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1340                              AMDGPU::SReg_64RegClass,
1341                              MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1342        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1343                              AMDGPU::SReg_64RegClass,
1344                              MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1345        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1346                              AMDGPU::VGPR_32RegClass,
1347                              MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1348        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1349                              AMDGPU::VGPR_32RegClass,
1350                              MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1351        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1352                              AMDGPU::VGPR_32RegClass,
1353                              MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1354     return true;
1355 
1356   MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1357   MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1358   MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
1359   MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
1360   MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
1361   MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;
1362 
1363   return false;
1364 }
1365