//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware-specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>

using namespace llvm;

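// Command-line options that toggle individual AMDGPU passes and optimizations,
// mostly so tests and debugging can enable or disable them independently of
// the optimization level.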
static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
  "amdgpu-function-calls",
  cl::desc("Enable AMDGPU function call support"),
  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
  cl::init(true),
  cl::Hidden);

// Enable libcall simplifications.
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// This option is used in lit tests to prevent dead-code elimination of the
// patterns being inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
    cl::init(true), cl::Hidden,
    cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFixupVectorISelPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUFixFunctionBitcastsPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIRemoveShortExecBranchesPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUInlinerPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeGCNRegBankReassignPass(*PR);
  initializeGCNNSAReassignPass(*PR);
}

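/// Create the TargetLoweringObjectFile used by both the R600 and GCN target
/// machines.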
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

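/// The default GCN scheduler: the max-occupancy strategy plus DAG mutations
/// that cluster loads and stores and apply AMDGPU macro fusion.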
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

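// Register the scheduler variants defined above so they can be selected by
// name on the command line (e.g. -misched=gcn-ilp).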
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);

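/// Build the data layout string for the target: R600 uses 32-bit pointers in
/// every address space, while amdgcn uses 64-bit global/constant/flat pointers
/// and 32-bit private, local, and region pointers.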
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
         "-ni:7";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

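/// Return the GPU name for \p F from its "target-cpu" attribute, falling back
/// to the target machine's CPU when the attribute is absent.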
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());

  return !GV.use_empty();
}

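// Register the AMDGPU-specific IR passes (alias analysis, metadata
// unification, printf lowering, attribute propagation, optional
// internalization and early inlining, libcall simplification, and
// infer-address-spaces) with the middle-end pipeline built by
// PassManagerBuilder.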
void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createAMDGPUFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                               legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      PM.add(createAMDGPUPrintfRuntimeBinding());
      PM.add(createAMDGPUPropagateAttributesLatePass(this));
      if (Internalize) {
        PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createGlobalDCEPass());
      }
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  const auto &Opt = Options;
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
                                            legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());
  });
}

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);

  // Override the default since calls aren't supported for r600.
  if (EnableFunctionCalls &&
      EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
    EnableFunctionCalls = false;
}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(R600TTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;

  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};

std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph.  We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;
  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

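// Scalar optimizations that mostly benefit straight-line code; the comments
// below explain the intended ordering of the passes.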
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());

  // This must occur before inlining, as the inliner will not look through
  // bitcast calls.
  addPass(createAMDGPUFixFunctionBitcastsPass());

  // Run the propagate-attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAtomicExpandPass());

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    if (EnableScalarIRPasses)
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }
  }

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.
    addPass(createAMDGPUCodeGenPreparePass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
    addPass(createAMDGPUAnnotateKernelFeaturesPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  addPass(&AMDGPUPerfHintAnalysisID);

  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createLowerSwitchPass());
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  // Defer the verifier until FinalizeISel.
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

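// Use the SI machine scheduler when the subtarget asks for it; otherwise fall
// back to the default GCN max-occupancy scheduler.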
ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableAtomicOptimizations) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  addPass(createSIFixupVectorISelPass());
  addPass(createSIAddIMGInitPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run just after RegisterCoalescing.
  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  if (OptExecMaskPreRA) {
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
    insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
  } else {
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
  }

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run just after RegisterCoalescing.
  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign) {
    addPass(&GCNNSAReassignID);
    addPass(&GCNRegBankReassignID);
  }
  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSIModeRegisterPass());

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  //
  // FIXME: This stand-alone pass will emit individual S_NOP 0 instructions as
  // needed. It would be better for it to emit S_NOP <N> when possible.
  addPass(&PostRAHazardRecognizerID);

  addPass(&SIRemoveShortExecBranchesID);
  addPass(&SIInsertSkipsPassID);
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

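// MIR (YAML) serialization hooks for SIMachineFunctionInfo.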
yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(*MFI,
                                         *MF.getSubtarget().getRegisterInfo());
}

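// Rebuild SIMachineFunctionInfo from its YAML form: parse the named scratch
// and stack registers and the argument descriptors, checking that each
// register belongs to the expected register class.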
bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MFI->initializeBaseYamlFields(YamlMFI);

  auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
    if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }

    return false;
  };

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      unsigned Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32Denormals = YamlMFI.Mode.FP32Denormals;
  MFI->Mode.FP64FP16Denormals = YamlMFI.Mode.FP64FP16Denormals;

  return false;
}