//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
  "amdgpu-function-calls",
  cl::desc("Enable AMDGPU function call support"),
  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
  cl::init(true),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
              cl::init(true), cl::Hidden,
              cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFixupVectorISelPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUFixFunctionBitcastsPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIRemoveShortExecBranchesPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeSIPostRABundlerPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUInlinerPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeGCNRegBankReassignPass(*PR);
  initializeGCNNSAReassignPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
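  // Note: SIScheduleDAGMI is the SI-specific scheduler registered below as
  // "si"; GCNPassConfig::createMachineScheduler also selects it when the
  // subtarget reports enableSIScheduler().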
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
         "-ni:7";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ?
"generic-hsa" : "generic"; 347 348 return "r600"; 349 } 350 351 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { 352 // The AMDGPU toolchain only supports generating shared objects, so we 353 // must always use PIC. 354 return Reloc::PIC_; 355 } 356 357 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, 358 StringRef CPU, StringRef FS, 359 TargetOptions Options, 360 Optional<Reloc::Model> RM, 361 Optional<CodeModel::Model> CM, 362 CodeGenOpt::Level OptLevel) 363 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), 364 FS, Options, getEffectiveRelocModel(RM), 365 getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), 366 TLOF(createTLOF(getTargetTriple())) { 367 initAsmInfo(); 368 } 369 370 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; 371 bool AMDGPUTargetMachine::EnableFunctionCalls = false; 372 373 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; 374 375 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { 376 Attribute GPUAttr = F.getFnAttribute("target-cpu"); 377 return GPUAttr.hasAttribute(Attribute::None) ? 378 getTargetCPU() : GPUAttr.getValueAsString(); 379 } 380 381 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { 382 Attribute FSAttr = F.getFnAttribute("target-features"); 383 384 return FSAttr.hasAttribute(Attribute::None) ? 385 getTargetFeatureString() : 386 FSAttr.getValueAsString(); 387 } 388 389 /// Predicate for Internalize pass. 390 static bool mustPreserveGV(const GlobalValue &GV) { 391 if (const Function *F = dyn_cast<Function>(&GV)) 392 return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv()); 393 394 return !GV.use_empty(); 395 } 396 397 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { 398 Builder.DivergentTarget = true; 399 400 bool EnableOpt = getOptLevel() > CodeGenOpt::None; 401 bool Internalize = InternalizeSymbols; 402 bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls; 403 bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; 404 bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; 405 406 if (EnableFunctionCalls) { 407 delete Builder.Inliner; 408 Builder.Inliner = createAMDGPUFunctionInliningPass(); 409 } 410 411 Builder.addExtension( 412 PassManagerBuilder::EP_ModuleOptimizerEarly, 413 [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &, 414 legacy::PassManagerBase &PM) { 415 if (AMDGPUAA) { 416 PM.add(createAMDGPUAAWrapperPass()); 417 PM.add(createAMDGPUExternalAAWrapperPass()); 418 } 419 PM.add(createAMDGPUUnifyMetadataPass()); 420 PM.add(createAMDGPUPrintfRuntimeBinding()); 421 PM.add(createAMDGPUPropagateAttributesLatePass(this)); 422 if (Internalize) { 423 PM.add(createInternalizePass(mustPreserveGV)); 424 PM.add(createGlobalDCEPass()); 425 } 426 if (EarlyInline) 427 PM.add(createAMDGPUAlwaysInlinePass(false)); 428 }); 429 430 const auto &Opt = Options; 431 Builder.addExtension( 432 PassManagerBuilder::EP_EarlyAsPossible, 433 [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &, 434 legacy::PassManagerBase &PM) { 435 if (AMDGPUAA) { 436 PM.add(createAMDGPUAAWrapperPass()); 437 PM.add(createAMDGPUExternalAAWrapperPass()); 438 } 439 PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this)); 440 PM.add(llvm::createAMDGPUUseNativeCallsPass()); 441 if (LibCallSimplify) 442 PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this)); 443 }); 444 445 Builder.addExtension( 446 PassManagerBuilder::EP_CGSCCOptimizerLate, 447 
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());
  });
}

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);

  // Override the default since calls aren't supported for r600.
  if (EnableFunctionCalls &&
      EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
    EnableFunctionCalls = false;
}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(R600TTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;

  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};

std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;
  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());

  // This must occur before inlining, as the inliner will not look through
  // bitcast calls.
  addPass(createAMDGPUFixFunctionBitcastsPass());

  // Run the propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAtomicExpandPass());

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    if (EnableScalarIRPasses)
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }
  }

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.
    addPass(createAMDGPUCodeGenPreparePass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
    addPass(createAMDGPUAnnotateKernelFeaturesPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  addPass(&AMDGPUPerfHintAnalysisID);

  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createLowerSwitchPass());
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  // Defer the verifier until FinalizeISel.
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableAtomicOptimizations) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  addPass(createSIFixupVectorISelPass());
  addPass(createSIAddIMGInitPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run just after RegisterCoalescing.
  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  if (OptExecMaskPreRA) {
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
    insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
  } else {
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
  }

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run just after RegisterCoalescing.
  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign) {
    addPass(&GCNNSAReassignID);
    addPass(&GCNRegBankReassignID);
  }
  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
}

void GCNPassConfig::addPreSched2() {
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSIModeRegisterPass());

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  //
  // FIXME: This stand-alone pass will emit individual S_NOP 0, as needed. It
  // would be better for it to emit S_NOP <N> when possible.
  addPass(&PostRAHazardRecognizerID);

  addPass(&SIRemoveShortExecBranchesID);
  addPass(&SIInsertSkipsPassID);
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(*MFI,
                                         *MF.getSubtarget().getRegisterInfo());
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MFI->initializeBaseYamlFields(YamlMFI);

  auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
    if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }

    return false;
  };

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      unsigned Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
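    // Note: a mask is typically used for arguments packed into part of a
    // register (e.g. work-item IDs sharing one VGPR); the exact packing is
    // defined by ArgDescriptor.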
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32Denormals = YamlMFI.Mode.FP32Denormals;
  MFI->Mode.FP64FP16Denormals = YamlMFI.Mode.FP64FP16Denormals;

  return false;
}