1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUTargetMachine.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUAliasAnalysis.h"
18 #include "AMDGPUExportClustering.h"
19 #include "AMDGPUMacroFusion.h"
20 #include "AMDGPUTargetObjectFile.h"
21 #include "AMDGPUTargetTransformInfo.h"
22 #include "GCNIterativeScheduler.h"
23 #include "GCNSchedStrategy.h"
24 #include "R600MachineScheduler.h"
25 #include "SIMachineFunctionInfo.h"
26 #include "SIMachineScheduler.h"
27 #include "TargetInfo/AMDGPUTargetInfo.h"
28 #include "llvm/Analysis/CGSCCPassManager.h"
29 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
30 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
31 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
32 #include "llvm/CodeGen/GlobalISel/Localizer.h"
33 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
34 #include "llvm/CodeGen/MIRParser/MIParser.h"
35 #include "llvm/CodeGen/Passes.h"
36 #include "llvm/CodeGen/RegAllocRegistry.h"
37 #include "llvm/CodeGen/TargetPassConfig.h"
38 #include "llvm/IR/LegacyPassManager.h"
39 #include "llvm/IR/PassManager.h"
40 #include "llvm/InitializePasses.h"
41 #include "llvm/Passes/PassBuilder.h"
42 #include "llvm/Support/TargetRegistry.h"
43 #include "llvm/Transforms/IPO.h"
44 #include "llvm/Transforms/IPO/AlwaysInliner.h"
45 #include "llvm/Transforms/IPO/GlobalDCE.h"
46 #include "llvm/Transforms/IPO/Internalize.h"
47 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
48 #include "llvm/Transforms/Scalar.h"
49 #include "llvm/Transforms/Scalar/GVN.h"
50 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
51 #include "llvm/Transforms/Utils.h"
52 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
53 #include "llvm/Transforms/Vectorize.h"
54 
55 using namespace llvm;
56 
57 namespace {
58 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
59 public:
60   SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
61     : RegisterRegAllocBase(N, D, C) {}
62 };
63 
64 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
65 public:
66   VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
67     : RegisterRegAllocBase(N, D, C) {}
68 };
69 
70 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
71                               const TargetRegisterClass &RC) {
72   return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
73 }
74 
75 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
76                               const TargetRegisterClass &RC) {
77   return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
78 }
79 
80 
81 /// -{sgpr|vgpr}-regalloc=... command line option.
82 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
83 
84 /// A dummy default pass factory indicates whether the register allocator is
85 /// overridden on the command line.
86 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
87 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
88 
89 static SGPRRegisterRegAlloc
90 defaultSGPRRegAlloc("default",
91                     "pick SGPR register allocator based on -O option",
92                     useDefaultRegisterAllocator);
93 
94 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
95                RegisterPassParser<SGPRRegisterRegAlloc>>
96 SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
97              cl::desc("Register allocator to use for SGPRs"));
98 
99 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
100                RegisterPassParser<VGPRRegisterRegAlloc>>
101 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
102              cl::desc("Register allocator to use for VGPRs"));
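
// For example, the two options can be combined on the llc command line, e.g.
//   llc -march=amdgcn -sgpr-regalloc=greedy -vgpr-regalloc=fast ...
// (illustrative; any allocator registered for the respective class may be
// named).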
103 
104 
105 static void initializeDefaultSGPRRegisterAllocatorOnce() {
106   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
107 
108   if (!Ctor) {
109     Ctor = SGPRRegAlloc;
110     SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
111   }
112 }
113 
114 static void initializeDefaultVGPRRegisterAllocatorOnce() {
115   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
116 
117   if (!Ctor) {
118     Ctor = VGPRRegAlloc;
119     VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
120   }
121 }
122 
123 static FunctionPass *createBasicSGPRRegisterAllocator() {
124   return createBasicRegisterAllocator(onlyAllocateSGPRs);
125 }
126 
127 static FunctionPass *createGreedySGPRRegisterAllocator() {
128   return createGreedyRegisterAllocator(onlyAllocateSGPRs);
129 }
130 
131 static FunctionPass *createFastSGPRRegisterAllocator() {
132   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
133 }
134 
135 static FunctionPass *createBasicVGPRRegisterAllocator() {
136   return createBasicRegisterAllocator(onlyAllocateVGPRs);
137 }
138 
139 static FunctionPass *createGreedyVGPRRegisterAllocator() {
140   return createGreedyRegisterAllocator(onlyAllocateVGPRs);
141 }
142 
143 static FunctionPass *createFastVGPRRegisterAllocator() {
144   return createFastRegisterAllocator(onlyAllocateVGPRs, true);
145 }
146 
147 static SGPRRegisterRegAlloc basicRegAllocSGPR(
148   "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
149 static SGPRRegisterRegAlloc greedyRegAllocSGPR(
150   "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
151 
152 static SGPRRegisterRegAlloc fastRegAllocSGPR(
153   "fast", "fast register allocator", createFastSGPRRegisterAllocator);
154 
155 
156 static VGPRRegisterRegAlloc basicRegAllocVGPR(
157   "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
158 static VGPRRegisterRegAlloc greedyRegAllocVGPR(
159   "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
160 
161 static VGPRRegisterRegAlloc fastRegAllocVGPR(
162   "fast", "fast register allocator", createFastVGPRRegisterAllocator);
163 }
164 
165 
166 static cl::opt<bool> EnableR600StructurizeCFG(
167   "r600-ir-structurize",
168   cl::desc("Use StructurizeCFG IR pass"),
169   cl::init(true));
170 
171 static cl::opt<bool> EnableSROA(
172   "amdgpu-sroa",
173   cl::desc("Run SROA after promote alloca pass"),
174   cl::ReallyHidden,
175   cl::init(true));
176 
177 static cl::opt<bool>
178 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
179                         cl::desc("Run early if-conversion"),
180                         cl::init(false));
181 
182 static cl::opt<bool>
183 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
184             cl::desc("Run pre-RA exec mask optimizations"),
185             cl::init(true));
186 
187 static cl::opt<bool> EnableR600IfConvert(
188   "r600-if-convert",
189   cl::desc("Use if conversion pass"),
190   cl::ReallyHidden,
191   cl::init(true));
192 
193 // Option to disable vectorizer for tests.
194 static cl::opt<bool> EnableLoadStoreVectorizer(
195   "amdgpu-load-store-vectorizer",
196   cl::desc("Enable load store vectorizer"),
197   cl::init(true),
198   cl::Hidden);
199 
200 // Option to control global loads scalarization
201 static cl::opt<bool> ScalarizeGlobal(
202   "amdgpu-scalarize-global-loads",
203   cl::desc("Enable global load scalarization"),
204   cl::init(true),
205   cl::Hidden);
206 
207 // Option to run internalize pass.
208 static cl::opt<bool> InternalizeSymbols(
209   "amdgpu-internalize-symbols",
210   cl::desc("Enable elimination of non-kernel functions and unused globals"),
211   cl::init(false),
212   cl::Hidden);
213 
214 // Option to inline all early.
215 static cl::opt<bool> EarlyInlineAll(
216   "amdgpu-early-inline-all",
217   cl::desc("Inline all functions early"),
218   cl::init(false),
219   cl::Hidden);
220 
221 static cl::opt<bool> EnableSDWAPeephole(
222   "amdgpu-sdwa-peephole",
223   cl::desc("Enable SDWA peepholer"),
224   cl::init(true));
225 
226 static cl::opt<bool> EnableDPPCombine(
227   "amdgpu-dpp-combine",
228   cl::desc("Enable DPP combiner"),
229   cl::init(true));
230 
231 // Enable address space based alias analysis
232 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
233   cl::desc("Enable AMDGPU Alias Analysis"),
234   cl::init(true));
235 
236 // Option to run late CFG structurizer
237 static cl::opt<bool, true> LateCFGStructurize(
238   "amdgpu-late-structurize",
239   cl::desc("Enable late CFG structurization"),
240   cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
241   cl::Hidden);
242 
243 static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
244   "amdgpu-function-calls",
245   cl::desc("Enable AMDGPU function call support"),
246   cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
247   cl::init(true),
248   cl::Hidden);
249 
250 static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
251   "amdgpu-fixed-function-abi",
252   cl::desc("Enable all implicit function arguments"),
253   cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI),
254   cl::init(false),
255   cl::Hidden);
256 
// Enable library call simplifications
258 static cl::opt<bool> EnableLibCallSimplify(
259   "amdgpu-simplify-libcall",
260   cl::desc("Enable amdgpu library simplifications"),
261   cl::init(true),
262   cl::Hidden);
263 
264 static cl::opt<bool> EnableLowerKernelArguments(
265   "amdgpu-ir-lower-kernel-arguments",
266   cl::desc("Lower kernel argument loads in IR pass"),
267   cl::init(true),
268   cl::Hidden);
269 
270 static cl::opt<bool> EnableRegReassign(
271   "amdgpu-reassign-regs",
272   cl::desc("Enable register reassign optimizations on gfx10+"),
273   cl::init(true),
274   cl::Hidden);
275 
276 static cl::opt<bool> OptVGPRLiveRange(
277     "amdgpu-opt-vgpr-liverange",
278     cl::desc("Enable VGPR liverange optimizations for if-else structure"),
279     cl::init(true), cl::Hidden);
280 
281 // Enable atomic optimization
282 static cl::opt<bool> EnableAtomicOptimizations(
283   "amdgpu-atomic-optimizations",
284   cl::desc("Enable atomic optimizations"),
285   cl::init(false),
286   cl::Hidden);
287 
288 // Enable Mode register optimization
289 static cl::opt<bool> EnableSIModeRegisterPass(
290   "amdgpu-mode-register",
291   cl::desc("Enable mode register pass"),
292   cl::init(true),
293   cl::Hidden);
294 
// This option is used in lit tests to prevent the inspected patterns from
// being dead-code eliminated.
296 static cl::opt<bool>
297 EnableDCEInRA("amdgpu-dce-in-ra",
298     cl::init(true), cl::Hidden,
299     cl::desc("Enable machine DCE inside regalloc"));
300 
301 static cl::opt<bool> EnableScalarIRPasses(
302   "amdgpu-scalar-ir-passes",
303   cl::desc("Enable scalar IR passes"),
304   cl::init(true),
305   cl::Hidden);
306 
307 static cl::opt<bool> EnableStructurizerWorkarounds(
308     "amdgpu-enable-structurizer-workarounds",
309     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
310     cl::Hidden);
311 
312 static cl::opt<bool> EnableLDSReplaceWithPointer(
313     "amdgpu-enable-lds-replace-with-pointer",
314     cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
315     cl::Hidden);
316 
317 static cl::opt<bool, true> EnableLowerModuleLDS(
318     "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
319     cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
320     cl::Hidden);
321 
322 static cl::opt<bool> EnablePreRAOptimizations(
323     "amdgpu-enable-pre-ra-optimizations",
324     cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
325     cl::Hidden);
326 
327 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
328   // Register the target
329   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
330   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
331 
332   PassRegistry *PR = PassRegistry::getPassRegistry();
333   initializeR600ClauseMergePassPass(*PR);
334   initializeR600ControlFlowFinalizerPass(*PR);
335   initializeR600PacketizerPass(*PR);
336   initializeR600ExpandSpecialInstrsPassPass(*PR);
337   initializeR600VectorRegMergerPass(*PR);
338   initializeGlobalISel(*PR);
339   initializeAMDGPUDAGToDAGISelPass(*PR);
340   initializeGCNDPPCombinePass(*PR);
341   initializeSILowerI1CopiesPass(*PR);
342   initializeSILowerSGPRSpillsPass(*PR);
343   initializeSIFixSGPRCopiesPass(*PR);
344   initializeSIFixVGPRCopiesPass(*PR);
345   initializeSIFoldOperandsPass(*PR);
346   initializeSIPeepholeSDWAPass(*PR);
347   initializeSIShrinkInstructionsPass(*PR);
348   initializeSIOptimizeExecMaskingPreRAPass(*PR);
349   initializeSIOptimizeVGPRLiveRangePass(*PR);
350   initializeSILoadStoreOptimizerPass(*PR);
351   initializeAMDGPUFixFunctionBitcastsPass(*PR);
352   initializeAMDGPUAlwaysInlinePass(*PR);
353   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
354   initializeAMDGPUAnnotateUniformValuesPass(*PR);
355   initializeAMDGPUArgumentUsageInfoPass(*PR);
356   initializeAMDGPUAtomicOptimizerPass(*PR);
357   initializeAMDGPULowerKernelArgumentsPass(*PR);
358   initializeAMDGPULowerKernelAttributesPass(*PR);
359   initializeAMDGPULowerIntrinsicsPass(*PR);
360   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
361   initializeAMDGPUPostLegalizerCombinerPass(*PR);
362   initializeAMDGPUPreLegalizerCombinerPass(*PR);
363   initializeAMDGPURegBankCombinerPass(*PR);
364   initializeAMDGPUPromoteAllocaPass(*PR);
365   initializeAMDGPUPromoteAllocaToVectorPass(*PR);
366   initializeAMDGPUCodeGenPreparePass(*PR);
367   initializeAMDGPULateCodeGenPreparePass(*PR);
368   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
369   initializeAMDGPUPropagateAttributesLatePass(*PR);
370   initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
371   initializeAMDGPULowerModuleLDSPass(*PR);
372   initializeAMDGPURewriteOutArgumentsPass(*PR);
373   initializeAMDGPUUnifyMetadataPass(*PR);
374   initializeSIAnnotateControlFlowPass(*PR);
375   initializeSIInsertHardClausesPass(*PR);
376   initializeSIInsertWaitcntsPass(*PR);
377   initializeSIModeRegisterPass(*PR);
378   initializeSIWholeQuadModePass(*PR);
379   initializeSILowerControlFlowPass(*PR);
380   initializeSIPreEmitPeepholePass(*PR);
381   initializeSILateBranchLoweringPass(*PR);
382   initializeSIMemoryLegalizerPass(*PR);
383   initializeSIOptimizeExecMaskingPass(*PR);
384   initializeSIPreAllocateWWMRegsPass(*PR);
385   initializeSIFormMemoryClausesPass(*PR);
386   initializeSIPostRABundlerPass(*PR);
387   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
388   initializeAMDGPUAAWrapperPassPass(*PR);
389   initializeAMDGPUExternalAAWrapperPass(*PR);
390   initializeAMDGPUUseNativeCallsPass(*PR);
391   initializeAMDGPUSimplifyLibCallsPass(*PR);
392   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
393   initializeGCNNSAReassignPass(*PR);
394   initializeGCNPreRAOptimizationsPass(*PR);
395 }
396 
397 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
398   return std::make_unique<AMDGPUTargetObjectFile>();
399 }
400 
401 static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
402   return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
403 }
404 
405 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
406   return new SIScheduleDAGMI(C);
407 }
408 
409 static ScheduleDAGInstrs *
410 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
411   ScheduleDAGMILive *DAG =
412     new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
413   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
414   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
415   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
416   return DAG;
417 }
418 
419 static ScheduleDAGInstrs *
420 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
421   auto DAG = new GCNIterativeScheduler(C,
422     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
423   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
424   return DAG;
425 }
426 
427 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
428   return new GCNIterativeScheduler(C,
429     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
430 }
431 
432 static ScheduleDAGInstrs *
433 createIterativeILPMachineScheduler(MachineSchedContext *C) {
434   auto DAG = new GCNIterativeScheduler(C,
435     GCNIterativeScheduler::SCHEDULE_ILP);
436   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
437   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
438   return DAG;
439 }
440 
441 static MachineSchedRegistry
442 R600SchedRegistry("r600", "Run R600's custom scheduler",
443                    createR600MachineScheduler);
444 
445 static MachineSchedRegistry
446 SISchedRegistry("si", "Run SI's custom scheduler",
447                 createSIMachineScheduler);
448 
449 static MachineSchedRegistry
450 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
451                              "Run GCN scheduler to maximize occupancy",
452                              createGCNMaxOccupancyMachineScheduler);
453 
454 static MachineSchedRegistry
455 IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
456   "Run GCN scheduler to maximize occupancy (experimental)",
457   createIterativeGCNMaxOccupancyMachineScheduler);
458 
459 static MachineSchedRegistry
460 GCNMinRegSchedRegistry("gcn-minreg",
461   "Run GCN iterative scheduler for minimal register usage (experimental)",
462   createMinRegScheduler);
463 
464 static MachineSchedRegistry
465 GCNILPSchedRegistry("gcn-ilp",
466   "Run GCN iterative scheduler for ILP scheduling (experimental)",
467   createIterativeILPMachineScheduler);
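
// Note: the schedulers registered above can normally be selected explicitly
// with llc's -misched=<name> option, e.g. -misched=gcn-ilp; otherwise the
// target picks one in createMachineScheduler().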
468 
469 static StringRef computeDataLayout(const Triple &TT) {
470   if (TT.getArch() == Triple::r600) {
471     // 32-bit pointers.
472     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
473            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
474   }
475 
476   // 32-bit private, local, and region pointers. 64-bit global, constant and
477   // flat, non-integral buffer fat pointers.
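  // (For reference, the AMDGPU address-space numbering assumed here: 0 = flat,
  // 1 = global, 2 = region, 3 = local, 4 = constant, 5 = private,
  // 6 = 32-bit constant, 7 = buffer fat pointer.)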
478   return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
479          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
480          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
481          "-ni:7";
482 }
483 
484 LLVM_READNONE
485 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
486   if (!GPU.empty())
487     return GPU;
488 
489   // Need to default to a target with flat support for HSA.
490   if (TT.getArch() == Triple::amdgcn)
491     return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
492 
493   return "r600";
494 }
495 
496 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
497   // The AMDGPU toolchain only supports generating shared objects, so we
498   // must always use PIC.
499   return Reloc::PIC_;
500 }
501 
502 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
503                                          StringRef CPU, StringRef FS,
504                                          TargetOptions Options,
505                                          Optional<Reloc::Model> RM,
506                                          Optional<CodeModel::Model> CM,
507                                          CodeGenOpt::Level OptLevel)
508     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
509                         FS, Options, getEffectiveRelocModel(RM),
510                         getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
511       TLOF(createTLOF(getTargetTriple())) {
512   initAsmInfo();
513   if (TT.getArch() == Triple::amdgcn) {
514     if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
515       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
516     else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
517       MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
518   }
519 }
520 
521 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
522 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
523 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
524 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
525 
526 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
527 
528 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
529   Attribute GPUAttr = F.getFnAttribute("target-cpu");
530   return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
531 }
532 
533 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
534   Attribute FSAttr = F.getFnAttribute("target-features");
535 
536   return FSAttr.isValid() ? FSAttr.getValueAsString()
537                           : getTargetFeatureString();
538 }
539 
540 /// Predicate for Internalize pass.
541 static bool mustPreserveGV(const GlobalValue &GV) {
542   if (const Function *F = dyn_cast<Function>(&GV))
543     return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
544 
545   GV.removeDeadConstantUsers();
546   return !GV.use_empty();
547 }
548 
549 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
550   Builder.DivergentTarget = true;
551 
552   bool EnableOpt = getOptLevel() > CodeGenOpt::None;
553   bool Internalize = InternalizeSymbols;
554   bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
555   bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
556   bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
557 
558   if (EnableFunctionCalls) {
559     delete Builder.Inliner;
560     Builder.Inliner = createFunctionInliningPass();
561   }
562 
563   Builder.addExtension(
564     PassManagerBuilder::EP_ModuleOptimizerEarly,
565     [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
566                                                legacy::PassManagerBase &PM) {
567       if (AMDGPUAA) {
568         PM.add(createAMDGPUAAWrapperPass());
569         PM.add(createAMDGPUExternalAAWrapperPass());
570       }
571       PM.add(createAMDGPUUnifyMetadataPass());
572       PM.add(createAMDGPUPrintfRuntimeBinding());
573       if (Internalize)
574         PM.add(createInternalizePass(mustPreserveGV));
575       PM.add(createAMDGPUPropagateAttributesLatePass(this));
576       if (Internalize)
577         PM.add(createGlobalDCEPass());
578       if (EarlyInline)
579         PM.add(createAMDGPUAlwaysInlinePass(false));
580   });
581 
582   Builder.addExtension(
583     PassManagerBuilder::EP_EarlyAsPossible,
584     [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
585                                       legacy::PassManagerBase &PM) {
586       if (AMDGPUAA) {
587         PM.add(createAMDGPUAAWrapperPass());
588         PM.add(createAMDGPUExternalAAWrapperPass());
589       }
590       PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
591       PM.add(llvm::createAMDGPUUseNativeCallsPass());
592       if (LibCallSimplify)
593         PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
594   });
595 
596   Builder.addExtension(
597     PassManagerBuilder::EP_CGSCCOptimizerLate,
598     [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
599       // Add infer address spaces pass to the opt pipeline after inlining
600       // but before SROA to increase SROA opportunities.
601       PM.add(createInferAddressSpacesPass());
602 
603       // This should run after inlining to have any chance of doing anything,
604       // and before other cleanup optimizations.
605       PM.add(createAMDGPULowerKernelAttributesPass());
606 
607       // Promote alloca to vector before SROA and loop unroll. If we manage
608       // to eliminate allocas before unroll we may choose to unroll less.
609       if (EnableOpt)
610         PM.add(createAMDGPUPromoteAllocaToVector());
611   });
612 }
613 
614 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
615   AAM.registerFunctionAnalysis<AMDGPUAA>();
616 }
617 
618 void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
619   PB.registerPipelineParsingCallback(
620       [this](StringRef PassName, ModulePassManager &PM,
621              ArrayRef<PassBuilder::PipelineElement>) {
622         if (PassName == "amdgpu-propagate-attributes-late") {
623           PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
624           return true;
625         }
626         if (PassName == "amdgpu-unify-metadata") {
627           PM.addPass(AMDGPUUnifyMetadataPass());
628           return true;
629         }
630         if (PassName == "amdgpu-printf-runtime-binding") {
631           PM.addPass(AMDGPUPrintfRuntimeBindingPass());
632           return true;
633         }
634         if (PassName == "amdgpu-always-inline") {
635           PM.addPass(AMDGPUAlwaysInlinePass());
636           return true;
637         }
638         if (PassName == "amdgpu-replace-lds-use-with-pointer") {
639           PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
640           return true;
641         }
642         if (PassName == "amdgpu-lower-module-lds") {
643           PM.addPass(AMDGPULowerModuleLDSPass());
644           return true;
645         }
646         return false;
647       });
648   PB.registerPipelineParsingCallback(
649       [this](StringRef PassName, FunctionPassManager &PM,
650              ArrayRef<PassBuilder::PipelineElement>) {
651         if (PassName == "amdgpu-simplifylib") {
652           PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
653           return true;
654         }
655         if (PassName == "amdgpu-usenative") {
656           PM.addPass(AMDGPUUseNativeCallsPass());
657           return true;
658         }
659         if (PassName == "amdgpu-promote-alloca") {
660           PM.addPass(AMDGPUPromoteAllocaPass(*this));
661           return true;
662         }
663         if (PassName == "amdgpu-promote-alloca-to-vector") {
664           PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
665           return true;
666         }
667         if (PassName == "amdgpu-lower-kernel-attributes") {
668           PM.addPass(AMDGPULowerKernelAttributesPass());
669           return true;
670         }
671         if (PassName == "amdgpu-propagate-attributes-early") {
672           PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
673           return true;
674         }
675         return false;
676       });
677 
678   PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
679     FAM.registerPass([&] { return AMDGPUAA(); });
680   });
681 
682   PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
683     if (AAName == "amdgpu-aa") {
684       AAM.registerFunctionAnalysis<AMDGPUAA>();
685       return true;
686     }
687     return false;
688   });
689 
690   PB.registerPipelineStartEPCallback(
691       [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
692         FunctionPassManager FPM;
693         FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
694         FPM.addPass(AMDGPUUseNativeCallsPass());
695         if (EnableLibCallSimplify &&
696             Level != PassBuilder::OptimizationLevel::O0)
697           FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
698         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
699       });
700 
701   PB.registerPipelineEarlySimplificationEPCallback(
702       [this](ModulePassManager &PM, PassBuilder::OptimizationLevel Level) {
703         if (Level == PassBuilder::OptimizationLevel::O0)
704           return;
705 
706         PM.addPass(AMDGPUUnifyMetadataPass());
707         PM.addPass(AMDGPUPrintfRuntimeBindingPass());
708 
709         if (InternalizeSymbols) {
710           PM.addPass(InternalizePass(mustPreserveGV));
711         }
712         PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
713         if (InternalizeSymbols) {
714           PM.addPass(GlobalDCEPass());
715         }
716         if (EarlyInlineAll && !EnableFunctionCalls)
717           PM.addPass(AMDGPUAlwaysInlinePass());
718       });
719 
720   PB.registerCGSCCOptimizerLateEPCallback(
721       [this](CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
722         if (Level == PassBuilder::OptimizationLevel::O0)
723           return;
724 
725         FunctionPassManager FPM;
726 
727         // Add infer address spaces pass to the opt pipeline after inlining
728         // but before SROA to increase SROA opportunities.
729         FPM.addPass(InferAddressSpacesPass());
730 
731         // This should run after inlining to have any chance of doing
732         // anything, and before other cleanup optimizations.
733         FPM.addPass(AMDGPULowerKernelAttributesPass());
734 
735         if (Level != PassBuilder::OptimizationLevel::O0) {
736           // Promote alloca to vector before SROA and loop unroll. If we
737           // manage to eliminate allocas before unroll we may choose to unroll
738           // less.
739           FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
740         }
741 
742         PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
743       });
744 }
745 
746 //===----------------------------------------------------------------------===//
747 // R600 Target Machine (R600 -> Cayman)
748 //===----------------------------------------------------------------------===//
749 
750 R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
751                                      StringRef CPU, StringRef FS,
752                                      TargetOptions Options,
753                                      Optional<Reloc::Model> RM,
754                                      Optional<CodeModel::Model> CM,
755                                      CodeGenOpt::Level OL, bool JIT)
756     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
757   setRequiresStructuredCFG(true);
758 
759   // Override the default since calls aren't supported for r600.
760   if (EnableFunctionCalls &&
761       EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
762     EnableFunctionCalls = false;
763 }
764 
765 const R600Subtarget *R600TargetMachine::getSubtargetImpl(
766   const Function &F) const {
767   StringRef GPU = getGPUName(F);
768   StringRef FS = getFeatureString(F);
769 
770   SmallString<128> SubtargetKey(GPU);
771   SubtargetKey.append(FS);
772 
773   auto &I = SubtargetMap[SubtargetKey];
774   if (!I) {
775     // This needs to be done before we create a new subtarget since any
776     // creation will depend on the TM and the code generation flags on the
777     // function that reside in TargetOptions.
778     resetTargetOptions(F);
779     I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
780   }
781 
782   return I.get();
783 }
784 
785 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
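  // Address 0 is a valid, usable address in the local, private and region
  // address spaces, so the null pointer there is represented as all-ones;
  // all other address spaces use 0.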
786   return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
787           AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
788           AddrSpace == AMDGPUAS::REGION_ADDRESS)
789              ? -1
790              : 0;
791 }
792 
793 bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
794                                               unsigned DestAS) const {
795   return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
796          AMDGPU::isFlatGlobalAddrSpace(DestAS);
797 }
798 
799 unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
800   const auto *LD = dyn_cast<LoadInst>(V);
801   if (!LD)
802     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
803 
  // It must be a load of a generic (flat) pointer.
805   assert(V->getType()->isPointerTy() &&
806          V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
807 
808   const auto *Ptr = LD->getPointerOperand();
809   if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
810     return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // A generic pointer loaded from constant memory can be assumed to be a
  // global pointer, since constant memory is only populated on the host side.
  // As implied by the offload programming model, only global pointers can be
  // referenced on the host side.
815   return AMDGPUAS::GLOBAL_ADDRESS;
816 }
817 
818 TargetTransformInfo
819 R600TargetMachine::getTargetTransformInfo(const Function &F) {
820   return TargetTransformInfo(R600TTIImpl(this, F));
821 }
822 
823 //===----------------------------------------------------------------------===//
824 // GCN Target Machine (SI+)
825 //===----------------------------------------------------------------------===//
826 
827 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
828                                    StringRef CPU, StringRef FS,
829                                    TargetOptions Options,
830                                    Optional<Reloc::Model> RM,
831                                    Optional<CodeModel::Model> CM,
832                                    CodeGenOpt::Level OL, bool JIT)
833     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
834 
835 const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
836   StringRef GPU = getGPUName(F);
837   StringRef FS = getFeatureString(F);
838 
839   SmallString<128> SubtargetKey(GPU);
840   SubtargetKey.append(FS);
841 
842   auto &I = SubtargetMap[SubtargetKey];
843   if (!I) {
844     // This needs to be done before we create a new subtarget since any
845     // creation will depend on the TM and the code generation flags on the
846     // function that reside in TargetOptions.
847     resetTargetOptions(F);
848     I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
849   }
850 
851   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
852 
853   return I.get();
854 }
855 
856 TargetTransformInfo
857 GCNTargetMachine::getTargetTransformInfo(const Function &F) {
858   return TargetTransformInfo(GCNTTIImpl(this, F));
859 }
860 
861 //===----------------------------------------------------------------------===//
862 // AMDGPU Pass Setup
863 //===----------------------------------------------------------------------===//
864 
865 namespace {
866 
867 class AMDGPUPassConfig : public TargetPassConfig {
868 public:
869   AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
870     : TargetPassConfig(TM, PM) {
871     // Exceptions and StackMaps are not supported, so these passes will never do
872     // anything.
873     disablePass(&StackMapLivenessID);
874     disablePass(&FuncletLayoutID);
875     // Garbage collection is not supported.
876     disablePass(&GCLoweringID);
877     disablePass(&ShadowStackGCLoweringID);
878   }
879 
880   AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
881     return getTM<AMDGPUTargetMachine>();
882   }
883 
884   ScheduleDAGInstrs *
885   createMachineScheduler(MachineSchedContext *C) const override {
886     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
887     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
888     return DAG;
889   }
890 
891   void addEarlyCSEOrGVNPass();
892   void addStraightLineScalarOptimizationPasses();
893   void addIRPasses() override;
894   void addCodeGenPrepare() override;
895   bool addPreISel() override;
896   bool addInstSelector() override;
897   bool addGCPasses() override;
898 
899   std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
900 
  /// Check if a pass is enabled given \p Opt option. The option always
  /// overrides the default if explicitly used. Otherwise its default value is
  /// used, provided the requested optimization level is at least \p Level.
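  /// For example, isPassEnabled(EnableLoadStoreVectorizer) is true at
  /// CodeGenOpt::Default and above and false below that, unless the
  /// corresponding flag was passed explicitly.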
905   bool isPassEnabled(const cl::opt<bool> &Opt,
906                      CodeGenOpt::Level Level = CodeGenOpt::Default) const {
907     if (Opt.getNumOccurrences())
908       return Opt;
909     if (TM->getOptLevel() < Level)
910       return false;
911     return Opt;
912   }
913 };
914 
915 std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
916   return getStandardCSEConfigForOpt(TM->getOptLevel());
917 }
918 
919 class R600PassConfig final : public AMDGPUPassConfig {
920 public:
921   R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
922     : AMDGPUPassConfig(TM, PM) {}
923 
924   ScheduleDAGInstrs *createMachineScheduler(
925     MachineSchedContext *C) const override {
926     return createR600MachineScheduler(C);
927   }
928 
929   bool addPreISel() override;
930   bool addInstSelector() override;
931   void addPreRegAlloc() override;
932   void addPreSched2() override;
933   void addPreEmitPass() override;
934 };
935 
936 class GCNPassConfig final : public AMDGPUPassConfig {
937 public:
938   GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
939     : AMDGPUPassConfig(TM, PM) {
940     // It is necessary to know the register usage of the entire call graph.  We
941     // allow calls without EnableAMDGPUFunctionCalls if they are marked
942     // noinline, so this is always required.
943     setRequiresCodeGenSCCOrder(true);
944   }
945 
946   GCNTargetMachine &getGCNTargetMachine() const {
947     return getTM<GCNTargetMachine>();
948   }
949 
950   ScheduleDAGInstrs *
951   createMachineScheduler(MachineSchedContext *C) const override;
952 
953   bool addPreISel() override;
954   void addMachineSSAOptimization() override;
955   bool addILPOpts() override;
956   bool addInstSelector() override;
957   bool addIRTranslator() override;
958   void addPreLegalizeMachineIR() override;
959   bool addLegalizeMachineIR() override;
960   void addPreRegBankSelect() override;
961   bool addRegBankSelect() override;
962   void addPreGlobalInstructionSelect() override;
963   bool addGlobalInstructionSelect() override;
964   void addFastRegAlloc() override;
965   void addOptimizedRegAlloc() override;
966 
967   FunctionPass *createSGPRAllocPass(bool Optimized);
968   FunctionPass *createVGPRAllocPass(bool Optimized);
969   FunctionPass *createRegAllocPass(bool Optimized) override;
970 
971   bool addRegAssignAndRewriteFast() override;
972   bool addRegAssignAndRewriteOptimized() override;
973 
974   void addPreRegAlloc() override;
975   bool addPreRewrite() override;
976   void addPostRegAlloc() override;
977   void addPreSched2() override;
978   void addPreEmitPass() override;
979 };
980 
981 } // end anonymous namespace
982 
983 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
984   if (getOptLevel() == CodeGenOpt::Aggressive)
985     addPass(createGVNPass());
986   else
987     addPass(createEarlyCSEPass());
988 }
989 
990 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
991   addPass(createLICMPass());
992   addPass(createSeparateConstOffsetFromGEPPass());
993   addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
996   addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions that GVN or
  // EarlyCSE can reuse.
999   addEarlyCSEOrGVNPass();
1000   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
1001   addPass(createNaryReassociatePass());
1002   // NaryReassociate on GEPs creates redundant common expressions, so run
1003   // EarlyCSE after it.
1004   addPass(createEarlyCSEPass());
1005 }
1006 
1007 void AMDGPUPassConfig::addIRPasses() {
1008   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
1009 
1010   // There is no reason to run these.
1011   disablePass(&StackMapLivenessID);
1012   disablePass(&FuncletLayoutID);
1013   disablePass(&PatchableFunctionID);
1014 
1015   addPass(createAMDGPUPrintfRuntimeBinding());
1016 
1017   // This must occur before inlining, as the inliner will not look through
1018   // bitcast calls.
1019   addPass(createAMDGPUFixFunctionBitcastsPass());
1020 
  // Run the attribute propagation pass in the backend in case opt was not run.
1022   addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
1023 
1024   addPass(createAMDGPULowerIntrinsicsPass());
1025 
1026   // Function calls are not supported, so make sure we inline everything.
1027   addPass(createAMDGPUAlwaysInlinePass());
1028   addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
1034   addPass(createBarrierNoopPass());
1035 
1036   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
1037   if (TM.getTargetTriple().getArch() == Triple::r600)
1038     addPass(createR600OpenCLImageTypeLoweringPass());
1039 
1040   // Replace OpenCL enqueued block function pointers with global variables.
1041   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
1042 
  // LDS lowering can increase the LDS used by a kernel, so it runs before
  // PromoteAlloca.
1044   if (EnableLowerModuleLDS) {
    // The pass "amdgpu-replace-lds-use-with-pointer" needs to be run before
    // the pass "amdgpu-lower-module-lds", and it should only be run when the
    // "amdgpu-lower-module-lds" pass is enabled.
1048     if (EnableLDSReplaceWithPointer)
1049       addPass(createAMDGPUReplaceLDSUseWithPointerPass());
1050 
1051     addPass(createAMDGPULowerModuleLDSPass());
1052   }
1053 
1054   if (TM.getOptLevel() > CodeGenOpt::None)
1055     addPass(createInferAddressSpacesPass());
1056 
1057   addPass(createAtomicExpandPass());
1058 
1059   if (TM.getOptLevel() > CodeGenOpt::None) {
1060     addPass(createAMDGPUPromoteAlloca());
1061 
1062     if (EnableSROA)
1063       addPass(createSROAPass());
1064     if (isPassEnabled(EnableScalarIRPasses))
1065       addStraightLineScalarOptimizationPasses();
1066 
1067     if (EnableAMDGPUAliasAnalysis) {
1068       addPass(createAMDGPUAAWrapperPass());
1069       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
1070                                              AAResults &AAR) {
1071         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
1072           AAR.addAAResult(WrapperPass->getResult());
1073         }));
1074     }
1075 
1076     if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1077       // TODO: May want to move later or split into an early and late one.
1078       addPass(createAMDGPUCodeGenPreparePass());
1079     }
1080   }
1081 
1082   TargetPassConfig::addIRPasses();
1083 
1084   // EarlyCSE is not always strong enough to clean up what LSR produces. For
1085   // example, GVN can combine
1086   //
1087   //   %0 = add %a, %b
1088   //   %1 = add %b, %a
1089   //
1090   // and
1091   //
1092   //   %0 = shl nsw %a, 2
1093   //   %1 = shl %a, 2
1094   //
1095   // but EarlyCSE can do neither of them.
1096   if (isPassEnabled(EnableScalarIRPasses))
1097     addEarlyCSEOrGVNPass();
1098 }
1099 
1100 void AMDGPUPassConfig::addCodeGenPrepare() {
1101   if (TM->getTargetTriple().getArch() == Triple::amdgcn)
1102     addPass(createAMDGPUAnnotateKernelFeaturesPass());
1103 
1104   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
1105       EnableLowerKernelArguments)
1106     addPass(createAMDGPULowerKernelArgumentsPass());
1107 
1108   if (TM->getOptLevel() > CodeGenOpt::Less)
1109     addPass(&AMDGPUPerfHintAnalysisID);
1110 
1111   TargetPassConfig::addCodeGenPrepare();
1112 
1113   if (isPassEnabled(EnableLoadStoreVectorizer))
1114     addPass(createLoadStoreVectorizerPass());
1115 
  // The LowerSwitch pass may introduce unreachable blocks that can
  // cause unexpected behavior for subsequent passes. Placing it
  // here seems better, as these blocks then get cleaned up by
  // UnreachableBlockElim, which is inserted next in the pass flow.
1120   addPass(createLowerSwitchPass());
1121 }
1122 
1123 bool AMDGPUPassConfig::addPreISel() {
1124   if (TM->getOptLevel() > CodeGenOpt::None)
1125     addPass(createFlattenCFGPass());
1126   return false;
1127 }
1128 
1129 bool AMDGPUPassConfig::addInstSelector() {
1130   // Defer the verifier until FinalizeISel.
1131   addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
1132   return false;
1133 }
1134 
1135 bool AMDGPUPassConfig::addGCPasses() {
1136   // Do nothing. GC is not supported.
1137   return false;
1138 }
1139 
1140 //===----------------------------------------------------------------------===//
1141 // R600 Pass Setup
1142 //===----------------------------------------------------------------------===//
1143 
1144 bool R600PassConfig::addPreISel() {
1145   AMDGPUPassConfig::addPreISel();
1146 
1147   if (EnableR600StructurizeCFG)
1148     addPass(createStructurizeCFGPass());
1149   return false;
1150 }
1151 
1152 bool R600PassConfig::addInstSelector() {
1153   addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
1154   return false;
1155 }
1156 
1157 void R600PassConfig::addPreRegAlloc() {
1158   addPass(createR600VectorRegMerger());
1159 }
1160 
1161 void R600PassConfig::addPreSched2() {
1162   addPass(createR600EmitClauseMarkers(), false);
1163   if (EnableR600IfConvert)
1164     addPass(&IfConverterID, false);
1165   addPass(createR600ClauseMergePass(), false);
1166 }
1167 
1168 void R600PassConfig::addPreEmitPass() {
1169   addPass(createAMDGPUCFGStructurizerPass(), false);
1170   addPass(createR600ExpandSpecialInstrsPass(), false);
1171   addPass(&FinalizeMachineBundlesID, false);
1172   addPass(createR600Packetizer(), false);
1173   addPass(createR600ControlFlowFinalizer(), false);
1174 }
1175 
1176 TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
1177   return new R600PassConfig(*this, PM);
1178 }
1179 
1180 //===----------------------------------------------------------------------===//
1181 // GCN Pass Setup
1182 //===----------------------------------------------------------------------===//
1183 
1184 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1185   MachineSchedContext *C) const {
1186   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1187   if (ST.enableSIScheduler())
1188     return createSIMachineScheduler(C);
1189   return createGCNMaxOccupancyMachineScheduler(C);
1190 }
1191 
1192 bool GCNPassConfig::addPreISel() {
1193   AMDGPUPassConfig::addPreISel();
1194 
1195   if (TM->getOptLevel() > CodeGenOpt::None)
1196     addPass(createAMDGPULateCodeGenPreparePass());
1197 
1198   if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
1199     addPass(createAMDGPUAtomicOptimizerPass());
1200   }
1201 
1202   if (TM->getOptLevel() > CodeGenOpt::None)
1203     addPass(createSinkingPass());
1204 
1205   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
1206   // regions formed by them.
1207   addPass(&AMDGPUUnifyDivergentExitNodesID);
1208   if (!LateCFGStructurize) {
1209     if (EnableStructurizerWorkarounds) {
1210       addPass(createFixIrreduciblePass());
1211       addPass(createUnifyLoopExitsPass());
1212     }
1213     addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
1214   }
1215   addPass(createAMDGPUAnnotateUniformValues());
1216   if (!LateCFGStructurize) {
1217     addPass(createSIAnnotateControlFlowPass());
1218   }
1219   addPass(createLCSSAPass());
1220 
1221   return false;
1222 }
1223 
1224 void GCNPassConfig::addMachineSSAOptimization() {
1225   TargetPassConfig::addMachineSSAOptimization();
1226 
1227   // We want to fold operands after PeepholeOptimizer has run (or as part of
1228   // it), because it will eliminate extra copies making it easier to fold the
1229   // real source operand. We want to eliminate dead instructions after, so that
1230   // we see fewer uses of the copies. We then need to clean up the dead
1231   // instructions leftover after the operands are folded as well.
1232   //
1233   // XXX - Can we get away without running DeadMachineInstructionElim again?
1234   addPass(&SIFoldOperandsID);
1235   if (EnableDPPCombine)
1236     addPass(&GCNDPPCombineID);
1237   addPass(&SILoadStoreOptimizerID);
1238   if (isPassEnabled(EnableSDWAPeephole)) {
1239     addPass(&SIPeepholeSDWAID);
1240     addPass(&EarlyMachineLICMID);
1241     addPass(&MachineCSEID);
1242     addPass(&SIFoldOperandsID);
1243   }
1244   addPass(&DeadMachineInstructionElimID);
1245   addPass(createSIShrinkInstructionsPass());
1246 }
1247 
1248 bool GCNPassConfig::addILPOpts() {
1249   if (EnableEarlyIfConversion)
1250     addPass(&EarlyIfConverterID);
1251 
1252   TargetPassConfig::addILPOpts();
1253   return false;
1254 }
1255 
1256 bool GCNPassConfig::addInstSelector() {
1257   AMDGPUPassConfig::addInstSelector();
1258   addPass(&SIFixSGPRCopiesID);
1259   addPass(createSILowerI1CopiesPass());
1260   return false;
1261 }
1262 
1263 bool GCNPassConfig::addIRTranslator() {
1264   addPass(new IRTranslator(getOptLevel()));
1265   return false;
1266 }
1267 
1268 void GCNPassConfig::addPreLegalizeMachineIR() {
1269   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1270   addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
1271   addPass(new Localizer());
1272 }
1273 
1274 bool GCNPassConfig::addLegalizeMachineIR() {
1275   addPass(new Legalizer());
1276   return false;
1277 }
1278 
1279 void GCNPassConfig::addPreRegBankSelect() {
1280   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1281   addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
1282 }
1283 
1284 bool GCNPassConfig::addRegBankSelect() {
1285   addPass(new RegBankSelect());
1286   return false;
1287 }
1288 
1289 void GCNPassConfig::addPreGlobalInstructionSelect() {
1290   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
1291   addPass(createAMDGPURegBankCombiner(IsOptNone));
1292 }
1293 
1294 bool GCNPassConfig::addGlobalInstructionSelect() {
1295   addPass(new InstructionSelect(getOptLevel()));
1296   return false;
1297 }
1298 
1299 void GCNPassConfig::addPreRegAlloc() {
1300   if (LateCFGStructurize) {
1301     addPass(createAMDGPUMachineCFGStructurizerPass());
1302   }
1303 }
1304 
1305 void GCNPassConfig::addFastRegAlloc() {
1306   // FIXME: We have to disable the verifier here because of PHIElimination +
1307   // TwoAddressInstructions disabling it.
1308 
1309   // This must be run immediately after phi elimination and before
1310   // TwoAddressInstructions, otherwise the processing of the tied operand of
1311   // SI_ELSE will introduce a copy of the tied operand source after the else.
1312   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
1313 
1314   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
1315   insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
1316 
1317   TargetPassConfig::addFastRegAlloc();
1318 }
1319 
1320 void GCNPassConfig::addOptimizedRegAlloc() {
1321   // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
1322   // instructions that cause scheduling barriers.
1323   insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
1324   insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
1325 
1326   if (OptExecMaskPreRA)
1327     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
1328 
1329   if (isPassEnabled(EnablePreRAOptimizations))
1330     insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
1331 
1332   // This is not an essential optimization and it has a noticeable impact on
1333   // compilation time, so we only enable it from O2.
1334   if (TM->getOptLevel() > CodeGenOpt::Less)
1335     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
1336 
  // FIXME: When an instruction inside a bundle has a killed operand, it seems
  // that only the BUNDLE instruction is recorded as killing the register in
  // LiveVariables. This would trigger a verifier failure; we should fix it
  // and enable the verifier.
1341   if (OptVGPRLiveRange)
1342     insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID, false);
1343   // This must be run immediately after phi elimination and before
1344   // TwoAddressInstructions, otherwise the processing of the tied operand of
1345   // SI_ELSE will introduce a copy of the tied operand source after the else.
1346   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
1347 
1348   if (EnableDCEInRA)
1349     insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
1350 
1351   TargetPassConfig::addOptimizedRegAlloc();
1352 }
1353 
1354 bool GCNPassConfig::addPreRewrite() {
1355   if (EnableRegReassign)
1356     addPass(&GCNNSAReassignID);
1357   return true;
1358 }
1359 
1360 FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
1361   // Initialize the global default.
1362   llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
1363                   initializeDefaultSGPRRegisterAllocatorOnce);
1364 
1365   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
1366   if (Ctor != useDefaultRegisterAllocator)
1367     return Ctor();
1368 
1369   if (Optimized)
1370     return createGreedyRegisterAllocator(onlyAllocateSGPRs);
1371 
1372   return createFastRegisterAllocator(onlyAllocateSGPRs, false);
1373 }
1374 
1375 FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
1376   // Initialize the global default.
1377   llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
1378                   initializeDefaultVGPRRegisterAllocatorOnce);
1379 
1380   RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
1381   if (Ctor != useDefaultRegisterAllocator)
1382     return Ctor();
1383 
1384   if (Optimized)
1385     return createGreedyVGPRRegisterAllocator();
1386 
1387   return createFastVGPRRegisterAllocator();
1388 }
1389 
1390 FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
1391   llvm_unreachable("should not be used");
1392 }
1393 
1394 static const char RegAllocOptNotSupportedMessage[] =
1395   "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";
1396 
1397 bool GCNPassConfig::addRegAssignAndRewriteFast() {
1398   if (!usingDefaultRegAlloc())
1399     report_fatal_error(RegAllocOptNotSupportedMessage);
1400 
1401   addPass(createSGPRAllocPass(false));
1402 
1403   // Equivalent of PEI for SGPRs.
1404   addPass(&SILowerSGPRSpillsID);
1405 
1406   addPass(createVGPRAllocPass(false));
1407   return true;
1408 }
1409 
1410 bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
1411   if (!usingDefaultRegAlloc())
1412     report_fatal_error(RegAllocOptNotSupportedMessage);
1413 
1414   addPass(createSGPRAllocPass(true));
1415 
  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
1420   addPass(createVirtRegRewriter(false));
1421 
1422   // Equivalent of PEI for SGPRs.
1423   addPass(&SILowerSGPRSpillsID);
1424 
1425   addPass(createVGPRAllocPass(true));
1426 
1427   addPreRewrite();
1428   addPass(&VirtRegRewriterID);
1429 
1430   return true;
1431 }
1432 
1433 void GCNPassConfig::addPostRegAlloc() {
1434   addPass(&SIFixVGPRCopiesID);
1435   if (getOptLevel() > CodeGenOpt::None)
1436     addPass(&SIOptimizeExecMaskingID);
1437   TargetPassConfig::addPostRegAlloc();
1438 }
1439 
1440 void GCNPassConfig::addPreSched2() {
1441   addPass(&SIPostRABundlerID);
1442 }
1443 
1444 void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSIShrinkInstructionsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because, if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}

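// Deserialize target-specific machine function info from MIR YAML, validating
// register operands and argument descriptors as they are parsed.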
bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fix up the subtarget-dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

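  // Parse a named register reference from the YAML string, recording its
  // source range on failure so diagnostics point at the offending field.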
  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

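  // Report an error when a register parsed successfully but belongs to the
  // wrong register class for the field.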
  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

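  // Check that each parsed register is in the expected class, allowing the
  // placeholder defaults (PRIVATE_RSRC_REG, FP_REG, SP_REG) to pass through.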
  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

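  // Parse an optional argument descriptor: either a named register that must
  // belong to RC, or a stack offset. Apply the optional mask and account for
  // the user/system SGPRs the argument consumes.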
  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

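  // Copy the floating point mode settings into the function info.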
  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
  MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
  MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
  MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;

  return false;
}