1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUTargetMachine.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUAliasAnalysis.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUInstructionSelector.h"
20 #include "AMDGPULegalizerInfo.h"
21 #include "AMDGPUMacroFusion.h"
22 #include "AMDGPUTargetObjectFile.h"
23 #include "AMDGPUTargetTransformInfo.h"
24 #include "GCNIterativeScheduler.h"
25 #include "GCNSchedStrategy.h"
26 #include "R600MachineScheduler.h"
27 #include "SIMachineFunctionInfo.h"
28 #include "SIMachineScheduler.h"
29 #include "TargetInfo/AMDGPUTargetInfo.h"
30 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
31 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
32 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
33 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
34 #include "llvm/CodeGen/MIRParser/MIParser.h"
35 #include "llvm/CodeGen/Passes.h"
36 #include "llvm/CodeGen/TargetPassConfig.h"
37 #include "llvm/IR/Attributes.h"
38 #include "llvm/IR/Function.h"
39 #include "llvm/IR/LegacyPassManager.h"
40 #include "llvm/InitializePasses.h"
41 #include "llvm/Pass.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/Compiler.h"
44 #include "llvm/Support/TargetRegistry.h"
45 #include "llvm/Target/TargetLoweringObjectFile.h"
46 #include "llvm/Transforms/IPO.h"
47 #include "llvm/Transforms/IPO/AlwaysInliner.h"
48 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
49 #include "llvm/Transforms/Scalar.h"
50 #include "llvm/Transforms/Scalar/GVN.h"
51 #include "llvm/Transforms/Utils.h"
52 #include "llvm/Transforms/Vectorize.h"
53 #include <memory>
54 
55 using namespace llvm;
56 
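// Command-line knobs for tuning and testing the AMDGPU codegen pipeline. Most
// default to the production configuration; many are hidden and exist mainly
// so lit tests can toggle individual passes.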
57 static cl::opt<bool> EnableR600StructurizeCFG(
58   "r600-ir-structurize",
59   cl::desc("Use StructurizeCFG IR pass"),
60   cl::init(true));
61 
62 static cl::opt<bool> EnableSROA(
63   "amdgpu-sroa",
64   cl::desc("Run SROA after promote alloca pass"),
65   cl::ReallyHidden,
66   cl::init(true));
67 
68 static cl::opt<bool>
69 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
70                         cl::desc("Run early if-conversion"),
71                         cl::init(false));
72 
73 static cl::opt<bool>
74 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));
77 
78 static cl::opt<bool> EnableR600IfConvert(
79   "r600-if-convert",
80   cl::desc("Use if conversion pass"),
81   cl::ReallyHidden,
82   cl::init(true));
83 
84 // Option to disable vectorizer for tests.
85 static cl::opt<bool> EnableLoadStoreVectorizer(
86   "amdgpu-load-store-vectorizer",
87   cl::desc("Enable load store vectorizer"),
88   cl::init(true),
89   cl::Hidden);
90 
// Option to control global load scalarization.
92 static cl::opt<bool> ScalarizeGlobal(
93   "amdgpu-scalarize-global-loads",
94   cl::desc("Enable global load scalarization"),
95   cl::init(true),
96   cl::Hidden);
97 
98 // Option to run internalize pass.
99 static cl::opt<bool> InternalizeSymbols(
100   "amdgpu-internalize-symbols",
101   cl::desc("Enable elimination of non-kernel functions and unused globals"),
102   cl::init(false),
103   cl::Hidden);
104 
// Option to inline all functions early.
106 static cl::opt<bool> EarlyInlineAll(
107   "amdgpu-early-inline-all",
108   cl::desc("Inline all functions early"),
109   cl::init(false),
110   cl::Hidden);
111 
112 static cl::opt<bool> EnableSDWAPeephole(
113   "amdgpu-sdwa-peephole",
114   cl::desc("Enable SDWA peepholer"),
115   cl::init(true));
116 
117 static cl::opt<bool> EnableDPPCombine(
118   "amdgpu-dpp-combine",
119   cl::desc("Enable DPP combiner"),
120   cl::init(true));
121 
122 // Enable address space based alias analysis
123 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
124   cl::desc("Enable AMDGPU Alias Analysis"),
125   cl::init(true));
126 
127 // Option to run late CFG structurizer
128 static cl::opt<bool, true> LateCFGStructurize(
129   "amdgpu-late-structurize",
130   cl::desc("Enable late CFG structurization"),
131   cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
132   cl::Hidden);
133 
134 static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
135   "amdgpu-function-calls",
136   cl::desc("Enable AMDGPU function call support"),
137   cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
138   cl::init(true),
139   cl::Hidden);
140 
// Enable library call simplifications.
142 static cl::opt<bool> EnableLibCallSimplify(
143   "amdgpu-simplify-libcall",
144   cl::desc("Enable amdgpu library simplifications"),
145   cl::init(true),
146   cl::Hidden);
147 
148 static cl::opt<bool> EnableLowerKernelArguments(
149   "amdgpu-ir-lower-kernel-arguments",
150   cl::desc("Lower kernel argument loads in IR pass"),
151   cl::init(true),
152   cl::Hidden);
153 
154 static cl::opt<bool> EnableRegReassign(
155   "amdgpu-reassign-regs",
156   cl::desc("Enable register reassign optimizations on gfx10+"),
157   cl::init(true),
158   cl::Hidden);
159 
160 // Enable atomic optimization
161 static cl::opt<bool> EnableAtomicOptimizations(
162   "amdgpu-atomic-optimizations",
163   cl::desc("Enable atomic optimizations"),
164   cl::init(false),
165   cl::Hidden);
166 
167 // Enable Mode register optimization
168 static cl::opt<bool> EnableSIModeRegisterPass(
169   "amdgpu-mode-register",
170   cl::desc("Enable mode register pass"),
171   cl::init(true),
172   cl::Hidden);
173 
// This option is used in lit tests to prevent the machine DCE in the register
// allocator from deleting the patterns being inspected.
175 static cl::opt<bool>
176 EnableDCEInRA("amdgpu-dce-in-ra",
177     cl::init(true), cl::Hidden,
178     cl::desc("Enable machine DCE inside regalloc"));
179 
180 static cl::opt<bool> EnableScalarIRPasses(
181   "amdgpu-scalar-ir-passes",
182   cl::desc("Enable scalar IR passes"),
183   cl::init(true),
184   cl::Hidden);
185 
186 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
187   // Register the target
188   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
189   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
190 
191   PassRegistry *PR = PassRegistry::getPassRegistry();
192   initializeR600ClauseMergePassPass(*PR);
193   initializeR600ControlFlowFinalizerPass(*PR);
194   initializeR600PacketizerPass(*PR);
195   initializeR600ExpandSpecialInstrsPassPass(*PR);
196   initializeR600VectorRegMergerPass(*PR);
197   initializeGlobalISel(*PR);
198   initializeAMDGPUDAGToDAGISelPass(*PR);
199   initializeGCNDPPCombinePass(*PR);
200   initializeSILowerI1CopiesPass(*PR);
201   initializeSILowerSGPRSpillsPass(*PR);
202   initializeSIFixSGPRCopiesPass(*PR);
203   initializeSIFixVGPRCopiesPass(*PR);
204   initializeSIFixupVectorISelPass(*PR);
205   initializeSIFoldOperandsPass(*PR);
206   initializeSIPeepholeSDWAPass(*PR);
207   initializeSIShrinkInstructionsPass(*PR);
208   initializeSIOptimizeExecMaskingPreRAPass(*PR);
209   initializeSILoadStoreOptimizerPass(*PR);
210   initializeAMDGPUFixFunctionBitcastsPass(*PR);
211   initializeAMDGPUAlwaysInlinePass(*PR);
212   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
213   initializeAMDGPUAnnotateUniformValuesPass(*PR);
214   initializeAMDGPUArgumentUsageInfoPass(*PR);
215   initializeAMDGPUAtomicOptimizerPass(*PR);
216   initializeAMDGPULowerKernelArgumentsPass(*PR);
217   initializeAMDGPULowerKernelAttributesPass(*PR);
218   initializeAMDGPULowerIntrinsicsPass(*PR);
219   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
220   initializeAMDGPUPreLegalizerCombinerPass(*PR);
221   initializeAMDGPUPromoteAllocaPass(*PR);
222   initializeAMDGPUCodeGenPreparePass(*PR);
223   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
224   initializeAMDGPUPropagateAttributesLatePass(*PR);
225   initializeAMDGPURewriteOutArgumentsPass(*PR);
226   initializeAMDGPUUnifyMetadataPass(*PR);
227   initializeSIAnnotateControlFlowPass(*PR);
228   initializeSIInsertWaitcntsPass(*PR);
229   initializeSIModeRegisterPass(*PR);
230   initializeSIWholeQuadModePass(*PR);
231   initializeSILowerControlFlowPass(*PR);
232   initializeSIRemoveShortExecBranchesPass(*PR);
233   initializeSIInsertSkipsPass(*PR);
234   initializeSIMemoryLegalizerPass(*PR);
235   initializeSIOptimizeExecMaskingPass(*PR);
236   initializeSIPreAllocateWWMRegsPass(*PR);
237   initializeSIFormMemoryClausesPass(*PR);
238   initializeSIPostRABundlerPass(*PR);
239   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
240   initializeAMDGPUAAWrapperPassPass(*PR);
241   initializeAMDGPUExternalAAWrapperPass(*PR);
242   initializeAMDGPUUseNativeCallsPass(*PR);
243   initializeAMDGPUSimplifyLibCallsPass(*PR);
244   initializeAMDGPUInlinerPass(*PR);
245   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
246   initializeGCNRegBankReassignPass(*PR);
247   initializeGCNNSAReassignPass(*PR);
248   initializeSIAddIMGInitPass(*PR);
249 }
250 
251 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
252   return std::make_unique<AMDGPUTargetObjectFile>();
253 }
254 
255 static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
256   return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
257 }
258 
259 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
260   return new SIScheduleDAGMI(C);
261 }
262 
263 static ScheduleDAGInstrs *
264 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
267   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
268   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
269   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
270   return DAG;
271 }
272 
273 static ScheduleDAGInstrs *
274 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
275   auto DAG = new GCNIterativeScheduler(C,
276     GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
277   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
278   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
279   return DAG;
280 }
281 
282 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
283   return new GCNIterativeScheduler(C,
284     GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
285 }
286 
287 static ScheduleDAGInstrs *
288 createIterativeILPMachineScheduler(MachineSchedContext *C) {
289   auto DAG = new GCNIterativeScheduler(C,
290     GCNIterativeScheduler::SCHEDULE_ILP);
291   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
292   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
293   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
294   return DAG;
295 }
296 
297 static MachineSchedRegistry
298 R600SchedRegistry("r600", "Run R600's custom scheduler",
299                    createR600MachineScheduler);
300 
301 static MachineSchedRegistry
302 SISchedRegistry("si", "Run SI's custom scheduler",
303                 createSIMachineScheduler);
304 
305 static MachineSchedRegistry
306 GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
307                              "Run GCN scheduler to maximize occupancy",
308                              createGCNMaxOccupancyMachineScheduler);
309 
310 static MachineSchedRegistry
311 IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
312   "Run GCN scheduler to maximize occupancy (experimental)",
313   createIterativeGCNMaxOccupancyMachineScheduler);
314 
315 static MachineSchedRegistry
316 GCNMinRegSchedRegistry("gcn-minreg",
317   "Run GCN iterative scheduler for minimal register usage (experimental)",
318   createMinRegScheduler);
319 
320 static MachineSchedRegistry
321 GCNILPSchedRegistry("gcn-ilp",
322   "Run GCN iterative scheduler for ILP scheduling (experimental)",
323   createIterativeILPMachineScheduler);
324 
325 static StringRef computeDataLayout(const Triple &TT) {
326   if (TT.getArch() == Triple::r600) {
327     // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
330   }
331 
332   // 32-bit private, local, and region pointers. 64-bit global, constant and
333   // flat, non-integral buffer fat pointers.
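  // DataLayout notes: "e" means little-endian, "pN:SZ:ABI" gives the size and
  // ABI alignment of pointers in address space N, "n32:64" lists the native
  // integer widths, "S32" is the natural stack alignment in bits, "A5" makes
  // allocas default to address space 5 (private), and "ni:7" marks address
  // space 7 (buffer fat pointers) as non-integral.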
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
         "-ni:7";
338 }
339 
340 LLVM_READNONE
341 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
342   if (!GPU.empty())
343     return GPU;
344 
345   // Need to default to a target with flat support for HSA.
346   if (TT.getArch() == Triple::amdgcn)
347     return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
348 
349   return "r600";
350 }
351 
352 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
353   // The AMDGPU toolchain only supports generating shared objects, so we
354   // must always use PIC.
355   return Reloc::PIC_;
356 }
357 
358 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
359                                          StringRef CPU, StringRef FS,
360                                          TargetOptions Options,
361                                          Optional<Reloc::Model> RM,
362                                          Optional<CodeModel::Model> CM,
363                                          CodeGenOpt::Level OptLevel)
364     : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
365                         FS, Options, getEffectiveRelocModel(RM),
366                         getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
367       TLOF(createTLOF(getTargetTriple())) {
368   initAsmInfo();
369 }
370 
371 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
372 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
373 
374 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
375 
376 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
377   Attribute GPUAttr = F.getFnAttribute("target-cpu");
378   return GPUAttr.hasAttribute(Attribute::None) ?
379     getTargetCPU() : GPUAttr.getValueAsString();
380 }
381 
382 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
383   Attribute FSAttr = F.getFnAttribute("target-features");
384 
385   return FSAttr.hasAttribute(Attribute::None) ?
386     getTargetFeatureString() :
387     FSAttr.getValueAsString();
388 }
389 
/// Predicate for the Internalize pass: returns true for globals that must not
/// be internalized, i.e. declarations, entry-point (kernel) functions, and any
/// global that still has uses.
391 static bool mustPreserveGV(const GlobalValue &GV) {
392   if (const Function *F = dyn_cast<Function>(&GV))
393     return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());
394 
395   return !GV.use_empty();
396 }
397 
398 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
399   Builder.DivergentTarget = true;
400 
401   bool EnableOpt = getOptLevel() > CodeGenOpt::None;
402   bool Internalize = InternalizeSymbols;
403   bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
404   bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
405   bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
406 
407   if (EnableFunctionCalls) {
408     delete Builder.Inliner;
409     Builder.Inliner = createAMDGPUFunctionInliningPass();
410   }
411 
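  // At the start of the module optimization pipeline: register the AMDGPU
  // alias analysis, unify metadata, bind printf to the runtime, propagate
  // attributes, and optionally internalize symbols and inline early.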
412   Builder.addExtension(
413     PassManagerBuilder::EP_ModuleOptimizerEarly,
414     [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
415                                                legacy::PassManagerBase &PM) {
416       if (AMDGPUAA) {
417         PM.add(createAMDGPUAAWrapperPass());
418         PM.add(createAMDGPUExternalAAWrapperPass());
419       }
420       PM.add(createAMDGPUUnifyMetadataPass());
421       PM.add(createAMDGPUPrintfRuntimeBinding());
422       PM.add(createAMDGPUPropagateAttributesLatePass(this));
423       if (Internalize) {
424         PM.add(createInternalizePass(mustPreserveGV));
425         PM.add(createGlobalDCEPass());
426       }
427       if (EarlyInline)
428         PM.add(createAMDGPUAlwaysInlinePass(false));
429   });
430 
431   const auto &Opt = Options;
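  // As early as possible in each function's pipeline: register the AMDGPU
  // alias analysis, propagate attributes, and run the use-native-calls and
  // (when enabled) library-call simplification passes.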
432   Builder.addExtension(
433     PassManagerBuilder::EP_EarlyAsPossible,
434     [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
435                                             legacy::PassManagerBase &PM) {
436       if (AMDGPUAA) {
437         PM.add(createAMDGPUAAWrapperPass());
438         PM.add(createAMDGPUExternalAAWrapperPass());
439       }
440       PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
441       PM.add(llvm::createAMDGPUUseNativeCallsPass());
442       if (LibCallSimplify)
443         PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
444   });
445 
446   Builder.addExtension(
447     PassManagerBuilder::EP_CGSCCOptimizerLate,
448     [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
449       // Add infer address spaces pass to the opt pipeline after inlining
450       // but before SROA to increase SROA opportunities.
451       PM.add(createInferAddressSpacesPass());
452 
453       // This should run after inlining to have any chance of doing anything,
454       // and before other cleanup optimizations.
455       PM.add(createAMDGPULowerKernelAttributesPass());
456   });
457 }
458 
459 //===----------------------------------------------------------------------===//
460 // R600 Target Machine (R600 -> Cayman)
461 //===----------------------------------------------------------------------===//
462 
463 R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
464                                      StringRef CPU, StringRef FS,
465                                      TargetOptions Options,
466                                      Optional<Reloc::Model> RM,
467                                      Optional<CodeModel::Model> CM,
468                                      CodeGenOpt::Level OL, bool JIT)
469     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
470   setRequiresStructuredCFG(true);
471 
472   // Override the default since calls aren't supported for r600.
473   if (EnableFunctionCalls &&
474       EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
475     EnableFunctionCalls = false;
476 }
477 
478 const R600Subtarget *R600TargetMachine::getSubtargetImpl(
479   const Function &F) const {
480   StringRef GPU = getGPUName(F);
481   StringRef FS = getFeatureString(F);
482 
483   SmallString<128> SubtargetKey(GPU);
484   SubtargetKey.append(FS);
485 
486   auto &I = SubtargetMap[SubtargetKey];
487   if (!I) {
488     // This needs to be done before we create a new subtarget since any
489     // creation will depend on the TM and the code generation flags on the
490     // function that reside in TargetOptions.
491     resetTargetOptions(F);
492     I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
493   }
494 
495   return I.get();
496 }
497 
498 TargetTransformInfo
499 R600TargetMachine::getTargetTransformInfo(const Function &F) {
500   return TargetTransformInfo(R600TTIImpl(this, F));
501 }
502 
503 //===----------------------------------------------------------------------===//
504 // GCN Target Machine (SI+)
505 //===----------------------------------------------------------------------===//
506 
507 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
508                                    StringRef CPU, StringRef FS,
509                                    TargetOptions Options,
510                                    Optional<Reloc::Model> RM,
511                                    Optional<CodeModel::Model> CM,
512                                    CodeGenOpt::Level OL, bool JIT)
513     : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
514 
const GCNSubtarget *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
516   StringRef GPU = getGPUName(F);
517   StringRef FS = getFeatureString(F);
518 
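  // Subtargets are cached per GPU name plus feature string, so functions with
  // different target-cpu/target-features attributes get distinct subtargets.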
519   SmallString<128> SubtargetKey(GPU);
520   SubtargetKey.append(FS);
521 
522   auto &I = SubtargetMap[SubtargetKey];
523   if (!I) {
524     // This needs to be done before we create a new subtarget since any
525     // creation will depend on the TM and the code generation flags on the
526     // function that reside in TargetOptions.
527     resetTargetOptions(F);
528     I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
529   }
530 
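  // Honor -amdgpu-scalarize-global-loads on every query, not only when the
  // subtarget is first created.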
531   I->setScalarizeGlobalBehavior(ScalarizeGlobal);
532 
533   return I.get();
534 }
535 
536 TargetTransformInfo
537 GCNTargetMachine::getTargetTransformInfo(const Function &F) {
538   return TargetTransformInfo(GCNTTIImpl(this, F));
539 }
540 
541 //===----------------------------------------------------------------------===//
542 // AMDGPU Pass Setup
543 //===----------------------------------------------------------------------===//
544 
545 namespace {
546 
547 class AMDGPUPassConfig : public TargetPassConfig {
548 public:
549   AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
550     : TargetPassConfig(TM, PM) {
551     // Exceptions and StackMaps are not supported, so these passes will never do
552     // anything.
553     disablePass(&StackMapLivenessID);
554     disablePass(&FuncletLayoutID);
555   }
556 
557   AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
558     return getTM<AMDGPUTargetMachine>();
559   }
560 
561   ScheduleDAGInstrs *
562   createMachineScheduler(MachineSchedContext *C) const override {
563     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
564     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
565     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
566     return DAG;
567   }
568 
569   void addEarlyCSEOrGVNPass();
570   void addStraightLineScalarOptimizationPasses();
571   void addIRPasses() override;
572   void addCodeGenPrepare() override;
573   bool addPreISel() override;
574   bool addInstSelector() override;
575   bool addGCPasses() override;
576 
577   std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
578 };
579 
580 std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
581   return getStandardCSEConfigForOpt(TM->getOptLevel());
582 }
583 
584 class R600PassConfig final : public AMDGPUPassConfig {
585 public:
586   R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
587     : AMDGPUPassConfig(TM, PM) {}
588 
589   ScheduleDAGInstrs *createMachineScheduler(
590     MachineSchedContext *C) const override {
591     return createR600MachineScheduler(C);
592   }
593 
594   bool addPreISel() override;
595   bool addInstSelector() override;
596   void addPreRegAlloc() override;
597   void addPreSched2() override;
598   void addPreEmitPass() override;
599 };
600 
601 class GCNPassConfig final : public AMDGPUPassConfig {
602 public:
603   GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
604     : AMDGPUPassConfig(TM, PM) {
605     // It is necessary to know the register usage of the entire call graph.  We
606     // allow calls without EnableAMDGPUFunctionCalls if they are marked
607     // noinline, so this is always required.
608     setRequiresCodeGenSCCOrder(true);
609   }
610 
611   GCNTargetMachine &getGCNTargetMachine() const {
612     return getTM<GCNTargetMachine>();
613   }
614 
615   ScheduleDAGInstrs *
616   createMachineScheduler(MachineSchedContext *C) const override;
617 
618   bool addPreISel() override;
619   void addMachineSSAOptimization() override;
620   bool addILPOpts() override;
621   bool addInstSelector() override;
622   bool addIRTranslator() override;
623   void addPreLegalizeMachineIR() override;
624   bool addLegalizeMachineIR() override;
625   bool addRegBankSelect() override;
626   bool addGlobalInstructionSelect() override;
627   void addFastRegAlloc() override;
628   void addOptimizedRegAlloc() override;
629   void addPreRegAlloc() override;
630   bool addPreRewrite() override;
631   void addPostRegAlloc() override;
632   void addPreSched2() override;
633   void addPreEmitPass() override;
634 };
635 
636 } // end anonymous namespace
637 
638 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
639   if (getOptLevel() == CodeGenOpt::Aggressive)
640     addPass(createGVNPass());
641   else
642     addPass(createEarlyCSEPass());
643 }
644 
645 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
646   addPass(createLICMPass());
647   addPass(createSeparateConstOffsetFromGEPPass());
648   addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
650   // the example in reassociate-geps-and-slsr.ll.
651   addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
653   // EarlyCSE can reuse.
654   addEarlyCSEOrGVNPass();
655   // Run NaryReassociate after EarlyCSE/GVN to be more effective.
656   addPass(createNaryReassociatePass());
657   // NaryReassociate on GEPs creates redundant common expressions, so run
658   // EarlyCSE after it.
659   addPass(createEarlyCSEPass());
660 }
661 
662 void AMDGPUPassConfig::addIRPasses() {
663   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
664 
665   // There is no reason to run these.
666   disablePass(&StackMapLivenessID);
667   disablePass(&FuncletLayoutID);
668   disablePass(&PatchableFunctionID);
669 
670   addPass(createAMDGPUPrintfRuntimeBinding());
671 
672   // This must occur before inlining, as the inliner will not look through
673   // bitcast calls.
674   addPass(createAMDGPUFixFunctionBitcastsPass());
675 
676   // A call to propagate attributes pass in the backend in case opt was not run.
677   addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
678 
  addPass(createAtomicExpandPass());

  addPass(createAMDGPULowerIntrinsicsPass());
683 
684   // Function calls are not supported, so make sure we inline everything.
685   addPass(createAMDGPUAlwaysInlinePass());
686   addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
692   addPass(createBarrierNoopPass());
693 
694   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
695   if (TM.getTargetTriple().getArch() == Triple::r600)
696     addPass(createR600OpenCLImageTypeLoweringPass());
697 
698   // Replace OpenCL enqueued block function pointers with global variables.
699   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
700 
701   if (TM.getOptLevel() > CodeGenOpt::None) {
702     addPass(createInferAddressSpacesPass());
703     addPass(createAMDGPUPromoteAlloca());
704 
705     if (EnableSROA)
706       addPass(createSROAPass());
707 
708     if (EnableScalarIRPasses)
709       addStraightLineScalarOptimizationPasses();
710 
711     if (EnableAMDGPUAliasAnalysis) {
712       addPass(createAMDGPUAAWrapperPass());
713       addPass(createExternalAAWrapperPass([](Pass &P, Function &,
714                                              AAResults &AAR) {
715         if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
716           AAR.addAAResult(WrapperPass->getResult());
717         }));
718     }
719   }
720 
721   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
722     // TODO: May want to move later or split into an early and late one.
723     addPass(createAMDGPUCodeGenPreparePass());
724   }
725 
726   TargetPassConfig::addIRPasses();
727 
728   // EarlyCSE is not always strong enough to clean up what LSR produces. For
729   // example, GVN can combine
730   //
731   //   %0 = add %a, %b
732   //   %1 = add %b, %a
733   //
734   // and
735   //
736   //   %0 = shl nsw %a, 2
737   //   %1 = shl %a, 2
738   //
739   // but EarlyCSE can do neither of them.
740   if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
741     addEarlyCSEOrGVNPass();
742 }
743 
744 void AMDGPUPassConfig::addCodeGenPrepare() {
745   if (TM->getTargetTriple().getArch() == Triple::amdgcn)
746     addPass(createAMDGPUAnnotateKernelFeaturesPass());
747 
748   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
749       EnableLowerKernelArguments)
750     addPass(createAMDGPULowerKernelArgumentsPass());
751 
752   addPass(&AMDGPUPerfHintAnalysisID);
753 
754   TargetPassConfig::addCodeGenPrepare();
755 
756   if (EnableLoadStoreVectorizer)
757     addPass(createLoadStoreVectorizerPass());
758 }
759 
760 bool AMDGPUPassConfig::addPreISel() {
761   addPass(createLowerSwitchPass());
762   addPass(createFlattenCFGPass());
763   return false;
764 }
765 
766 bool AMDGPUPassConfig::addInstSelector() {
767   // Defer the verifier until FinalizeISel.
768   addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
769   return false;
770 }
771 
772 bool AMDGPUPassConfig::addGCPasses() {
773   // Do nothing. GC is not supported.
774   return false;
775 }
776 
777 //===----------------------------------------------------------------------===//
778 // R600 Pass Setup
779 //===----------------------------------------------------------------------===//
780 
781 bool R600PassConfig::addPreISel() {
782   AMDGPUPassConfig::addPreISel();
783 
784   if (EnableR600StructurizeCFG)
785     addPass(createStructurizeCFGPass());
786   return false;
787 }
788 
789 bool R600PassConfig::addInstSelector() {
790   addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
791   return false;
792 }
793 
794 void R600PassConfig::addPreRegAlloc() {
795   addPass(createR600VectorRegMerger());
796 }
797 
798 void R600PassConfig::addPreSched2() {
799   addPass(createR600EmitClauseMarkers(), false);
800   if (EnableR600IfConvert)
801     addPass(&IfConverterID, false);
802   addPass(createR600ClauseMergePass(), false);
803 }
804 
805 void R600PassConfig::addPreEmitPass() {
806   addPass(createAMDGPUCFGStructurizerPass(), false);
807   addPass(createR600ExpandSpecialInstrsPass(), false);
808   addPass(&FinalizeMachineBundlesID, false);
809   addPass(createR600Packetizer(), false);
810   addPass(createR600ControlFlowFinalizer(), false);
811 }
812 
813 TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
814   return new R600PassConfig(*this, PM);
815 }
816 
817 //===----------------------------------------------------------------------===//
818 // GCN Pass Setup
819 //===----------------------------------------------------------------------===//
820 
821 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
822   MachineSchedContext *C) const {
823   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
824   if (ST.enableSIScheduler())
825     return createSIMachineScheduler(C);
826   return createGCNMaxOccupancyMachineScheduler(C);
827 }
828 
829 bool GCNPassConfig::addPreISel() {
830   AMDGPUPassConfig::addPreISel();
831 
832   if (EnableAtomicOptimizations) {
833     addPass(createAMDGPUAtomicOptimizerPass());
834   }
835 
836   // FIXME: We need to run a pass to propagate the attributes when calls are
837   // supported.
838 
839   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
840   // regions formed by them.
841   addPass(&AMDGPUUnifyDivergentExitNodesID);
842   if (!LateCFGStructurize) {
843     addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
844   }
845   addPass(createSinkingPass());
846   addPass(createAMDGPUAnnotateUniformValues());
847   if (!LateCFGStructurize) {
848     addPass(createSIAnnotateControlFlowPass());
849   }
850   addPass(createLCSSAPass());
851 
852   return false;
853 }
854 
855 void GCNPassConfig::addMachineSSAOptimization() {
856   TargetPassConfig::addMachineSSAOptimization();
857 
858   // We want to fold operands after PeepholeOptimizer has run (or as part of
859   // it), because it will eliminate extra copies making it easier to fold the
860   // real source operand. We want to eliminate dead instructions after, so that
861   // we see fewer uses of the copies. We then need to clean up the dead
862   // instructions leftover after the operands are folded as well.
863   //
864   // XXX - Can we get away without running DeadMachineInstructionElim again?
865   addPass(&SIFoldOperandsID);
866   if (EnableDPPCombine)
867     addPass(&GCNDPPCombineID);
868   addPass(&DeadMachineInstructionElimID);
869   addPass(&SILoadStoreOptimizerID);
870   if (EnableSDWAPeephole) {
871     addPass(&SIPeepholeSDWAID);
872     addPass(&EarlyMachineLICMID);
873     addPass(&MachineCSEID);
874     addPass(&SIFoldOperandsID);
875     addPass(&DeadMachineInstructionElimID);
876   }
877   addPass(createSIShrinkInstructionsPass());
878 }
879 
880 bool GCNPassConfig::addILPOpts() {
881   if (EnableEarlyIfConversion)
882     addPass(&EarlyIfConverterID);
883 
884   TargetPassConfig::addILPOpts();
885   return false;
886 }
887 
888 bool GCNPassConfig::addInstSelector() {
889   AMDGPUPassConfig::addInstSelector();
890   addPass(&SIFixSGPRCopiesID);
891   addPass(createSILowerI1CopiesPass());
892   addPass(createSIFixupVectorISelPass());
893   addPass(createSIAddIMGInitPass());
894   return false;
895 }
896 
897 bool GCNPassConfig::addIRTranslator() {
898   addPass(new IRTranslator());
899   return false;
900 }
901 
902 void GCNPassConfig::addPreLegalizeMachineIR() {
903   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
904   addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
905 }
906 
907 bool GCNPassConfig::addLegalizeMachineIR() {
908   addPass(new Legalizer());
909   return false;
910 }
911 
912 bool GCNPassConfig::addRegBankSelect() {
913   addPass(new RegBankSelect());
914   return false;
915 }
916 
917 bool GCNPassConfig::addGlobalInstructionSelect() {
918   addPass(new InstructionSelect());
919   return false;
920 }
921 
922 void GCNPassConfig::addPreRegAlloc() {
923   if (LateCFGStructurize) {
924     addPass(createAMDGPUMachineCFGStructurizerPass());
925   }
926   addPass(createSIWholeQuadModePass());
927 }
928 
929 void GCNPassConfig::addFastRegAlloc() {
930   // FIXME: We have to disable the verifier here because of PHIElimination +
931   // TwoAddressInstructions disabling it.
932 
933   // This must be run immediately after phi elimination and before
934   // TwoAddressInstructions, otherwise the processing of the tied operand of
935   // SI_ELSE will introduce a copy of the tied operand source after the else.
936   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
937 
938   // This must be run just after RegisterCoalescing.
939   insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
940 
941   TargetPassConfig::addFastRegAlloc();
942 }
943 
944 void GCNPassConfig::addOptimizedRegAlloc() {
945   if (OptExecMaskPreRA) {
946     insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
947     insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
948   } else {
949     insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
950   }
951 
952   // This must be run immediately after phi elimination and before
953   // TwoAddressInstructions, otherwise the processing of the tied operand of
954   // SI_ELSE will introduce a copy of the tied operand source after the else.
955   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
956 
957   // This must be run just after RegisterCoalescing.
958   insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);
959 
960   if (EnableDCEInRA)
961     insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
962 
963   TargetPassConfig::addOptimizedRegAlloc();
964 }
965 
966 bool GCNPassConfig::addPreRewrite() {
967   if (EnableRegReassign) {
968     addPass(&GCNNSAReassignID);
969     addPass(&GCNRegBankReassignID);
970   }
971   return true;
972 }
973 
974 void GCNPassConfig::addPostRegAlloc() {
975   addPass(&SIFixVGPRCopiesID);
976   if (getOptLevel() > CodeGenOpt::None)
977     addPass(&SIOptimizeExecMaskingID);
978   TargetPassConfig::addPostRegAlloc();
979 
980   // Equivalent of PEI for SGPRs.
981   addPass(&SILowerSGPRSpillsID);
982 }
983 
984 void GCNPassConfig::addPreSched2() {
985   addPass(&SIPostRABundlerID);
986 }
987 
988 void GCNPassConfig::addPreEmitPass() {
989   addPass(createSIMemoryLegalizerPass());
990   addPass(createSIInsertWaitcntsPass());
991   addPass(createSIShrinkInstructionsPass());
992   addPass(createSIModeRegisterPass());
993 
  // The hazard recognizer that runs as part of the post-RA scheduler cannot
  // guarantee that it handles all hazards correctly. This is because if there
996   // are multiple scheduling regions in a basic block, the regions are scheduled
997   // bottom up, so when we begin to schedule a region we don't know what
998   // instructions were emitted directly before it.
999   //
1000   // Here we add a stand-alone hazard recognizer pass which can handle all
1001   // cases.
1002   //
  // FIXME: This stand-alone pass will emit individual S_NOP 0 instructions as
  // needed. It would be better for it to emit S_NOP <N> when possible.
1005   addPass(&PostRAHazardRecognizerID);
1006 
1007   addPass(&SIRemoveShortExecBranchesID);
1008   addPass(&SIInsertSkipsPassID);
1009   addPass(&BranchRelaxationPassID);
1010 }
1011 
1012 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
1013   return new GCNPassConfig(*this, PM);
1014 }
1015 
1016 yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
1017   return new yaml::SIMachineFunctionInfo();
1018 }
1019 
1020 yaml::MachineFunctionInfo *
1021 GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
1022   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1023   return new yaml::SIMachineFunctionInfo(*MFI,
1024                                          *MF.getSubtarget().getRegisterInfo());
1025 }
1026 
1027 bool GCNTargetMachine::parseMachineFunctionInfo(
1028     const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
1029     SMDiagnostic &Error, SMRange &SourceRange) const {
1030   const yaml::SIMachineFunctionInfo &YamlMFI =
1031       reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
1032   MachineFunction &MF = PFS.MF;
1033   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1034 
1035   MFI->initializeBaseYamlFields(YamlMFI);
1036 
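  // Resolve a named register reference from the YAML; on failure, record the
  // offending source range for the diagnostic and return true.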
1037   auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
1038     if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
1039       SourceRange = RegName.SourceRange;
1040       return true;
1041     }
1042 
1043     return false;
1044   };
1045 
1046   auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
1048     const MemoryBuffer &Buffer =
1049         *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
1050     Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
1051                          RegName.Value.size(), SourceMgr::DK_Error,
1052                          "incorrect register class for field", RegName.Value,
1053                          None, None);
1054     SourceRange = RegName.SourceRange;
1055     return true;
1056   };
1057 
1058   if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
1059       parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
1060       parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
1061       parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
1062     return true;
1063 
1064   if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
1065       !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
1066     return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
1067   }
1068 
1069   if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
1070       !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
1071     return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
1072   }
1073 
1074   if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
1075       !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
1076     return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
1077   }
1078 
1079   if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
1080       !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
1081     return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
1082   }
1083 
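  // Translate an optional YAML argument descriptor into an ArgDescriptor,
  // checking that a register argument is in the expected register class and
  // accounting for the user/system SGPRs it occupies. Returns true on error.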
1084   auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
1085                                    const TargetRegisterClass &RC,
1086                                    ArgDescriptor &Arg, unsigned UserSGPRs,
1087                                    unsigned SystemSGPRs) {
1088     // Skip parsing if it's not present.
1089     if (!A)
1090       return false;
1091 
1092     if (A->IsRegister) {
1093       unsigned Reg;
1094       if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
1095         SourceRange = A->RegisterName.SourceRange;
1096         return true;
1097       }
1098       if (!RC.contains(Reg))
1099         return diagnoseRegisterClass(A->RegisterName);
1100       Arg = ArgDescriptor::createRegister(Reg);
1101     } else
1102       Arg = ArgDescriptor::createStack(A->StackOffset);
1103     // Check and apply the optional mask.
1104     if (A->Mask)
1105       Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());
1106 
1107     MFI->NumUserSGPRs += UserSGPRs;
1108     MFI->NumSystemSGPRs += SystemSGPRs;
1109     return false;
1110   };
1111 
1112   if (YamlMFI.ArgInfo &&
1113       (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
1114                              AMDGPU::SGPR_128RegClass,
1115                              MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
1116        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
1117                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
1118                              2, 0) ||
1119        parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
1120                              MFI->ArgInfo.QueuePtr, 2, 0) ||
1121        parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
1122                              AMDGPU::SReg_64RegClass,
1123                              MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
1124        parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
1125                              AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
1126                              2, 0) ||
1127        parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
1128                              AMDGPU::SReg_64RegClass,
1129                              MFI->ArgInfo.FlatScratchInit, 2, 0) ||
1130        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
1131                              AMDGPU::SGPR_32RegClass,
1132                              MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
1133        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
1134                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
1135                              0, 1) ||
1136        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
1137                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
1138                              0, 1) ||
1139        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
1140                              AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
1141                              0, 1) ||
1142        parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
1143                              AMDGPU::SGPR_32RegClass,
1144                              MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
1145        parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
1146                              AMDGPU::SGPR_32RegClass,
1147                              MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
1148        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
1149                              AMDGPU::SReg_64RegClass,
1150                              MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
1151        parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
1152                              AMDGPU::SReg_64RegClass,
1153                              MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
1154        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
1155                              AMDGPU::VGPR_32RegClass,
1156                              MFI->ArgInfo.WorkItemIDX, 0, 0) ||
1157        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
1158                              AMDGPU::VGPR_32RegClass,
1159                              MFI->ArgInfo.WorkItemIDY, 0, 0) ||
1160        parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
1161                              AMDGPU::VGPR_32RegClass,
1162                              MFI->ArgInfo.WorkItemIDZ, 0, 0)))
1163     return true;
1164 
1165   MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
1166   MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
1167   MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
1168   MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
1169   MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
1170   MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;
1171 
1172   return false;
1173 }
1174