//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for VGPRs"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
  "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
  "fast", "fast register allocator", createFastVGPRRegisterAllocator);
}

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
              cl::init(true), cl::Hidden,
              cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLDSReplaceWithPointer(
    "amdgpu-enable-lds-replace-with-pointer",
    cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSIOptimizeVGPRLiveRangePass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUCtorDtorLoweringPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAttributorPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
  initializeAMDGPULowerModuleLDSPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeAMDGPUReleaseVGPRsPass(*PR);
  initializeAMDGPUInsertDelayAluPass(*PR);
  initializeSIInsertHardClausesPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIPreEmitPeepholePass(*PR);
  initializeSILateBranchLoweringPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeSIPostRABundlerPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisPass(*PR);
  initializeGCNNSAReassignPass(*PR);
  initializeGCNPreRAOptimizationsPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation());
  DAG->addMutation(createSchedBarrierDAGMutation());
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
         "-ni:7";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().startswith("__asan_") ||
           F->getName().startswith("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
  bool PromoteKernelArguments =
      EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                               legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      PM.add(createAMDGPUPrintfRuntimeBinding());
      if (Internalize)
        PM.add(createInternalizePass(mustPreserveGV));
      PM.add(createAMDGPUPropagateAttributesLatePass(this));
      if (Internalize)
        PM.add(createGlobalDCEPass());
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
                                      legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
                                        legacy::PassManagerBase &PM) {
      // Add promote kernel arguments pass to the opt pipeline right before
      // infer address spaces which is needed to do actual address space
      // rewriting.
      if (PromoteKernelArguments)
        PM.add(createAMDGPUPromoteKernelArgumentsPass());

      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());

      // Promote alloca to vector before SROA and loop unroll. If we manage
      // to eliminate allocas before unroll we may choose to unroll less.
      if (EnableOpt)
        PM.add(createAMDGPUPromoteAllocaToVector());
  });
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, ModulePassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-propagate-attributes-late") {
          PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
          return true;
        }
        if (PassName == "amdgpu-unify-metadata") {
          PM.addPass(AMDGPUUnifyMetadataPass());
          return true;
        }
        if (PassName == "amdgpu-printf-runtime-binding") {
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
          return true;
        }
        if (PassName == "amdgpu-always-inline") {
          PM.addPass(AMDGPUAlwaysInlinePass());
          return true;
        }
        if (PassName == "amdgpu-replace-lds-use-with-pointer") {
          PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
          return true;
        }
        if (PassName == "amdgpu-lower-module-lds") {
          PM.addPass(AMDGPULowerModuleLDSPass());
          return true;
        }
        return false;
      });
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, FunctionPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-simplifylib") {
          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
          return true;
        }
        if (PassName == "amdgpu-usenative") {
          PM.addPass(AMDGPUUseNativeCallsPass());
          return true;
        }
        if (PassName == "amdgpu-promote-alloca") {
          PM.addPass(AMDGPUPromoteAllocaPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-alloca-to-vector") {
          PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
          return true;
        }
        if (PassName == "amdgpu-lower-kernel-attributes") {
          PM.addPass(AMDGPULowerKernelAttributesPass());
          return true;
        }
        if (PassName == "amdgpu-propagate-attributes-early") {
          PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-kernel-arguments") {
          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
          return true;
        }
        return false;
      });

  PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
    FAM.registerPass([&] { return AMDGPUAA(); });
  });

  PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
    if (AAName == "amdgpu-aa") {
      AAM.registerFunctionAnalysis<AMDGPUAA>();
      return true;
    }
    return false;
  });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
        }
        PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
        if (InternalizeSymbols) {
          PM.addPass(GlobalDCEPass());
        }
        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::make_pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logic 'and' is commutative and
  // the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::make_pair(nullptr, -1);
}

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
  case PseudoSourceValue::TargetCustom:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    DAG->addMutation(createIGroupLPDAGMutation());
    DAG->addMutation(createSchedBarrierDAGMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  addPass(createAMDGPUCtorDtorLoweringPass());

  // A call to propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Can increase LDS used by kernel so runs before PromoteAlloca
  if (EnableLowerModuleLDS) {
    // The "amdgpu-replace-lds-use-with-pointer" pass needs to be run before
    // the "amdgpu-lower-module-lds" pass, and it is only required when the
    // "amdgpu-lower-module-lds" pass is enabled.
    if (EnableLDSReplaceWithPointer)
      addPass(createAMDGPUReplaceLDSUseWithPointerPass());

    addPass(createAMDGPULowerModuleLDSPass());
  }

  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createInferAddressSpacesPass());

  addPass(createAtomicExpandPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    addPass(createAMDGPUAttributorPass());

    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch pass may introduce unreachable blocks that can
  // cause unexpected behavior for subsequent passes. Placing it
  // here seems better, as these blocks get cleaned up by
  // UnreachableBlockElim inserted next in the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
  insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOpt::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
  // the register in LiveVariables, this would trigger a failure in verifier,
  // we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
  "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(false));
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUReleaseVGPRsID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
  MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
  MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
  MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;

  return false;
}
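// Illustrative usage sketch (an assumption-laden example, not part of the
// implementation above): the register allocators and machine schedulers this
// file registers are selected from the llc command line. The -sgpr-regalloc,
// -vgpr-regalloc, and scheduler names come from the registrations above;
// -mtriple, -mcpu, and -misched are generic llc options.
//
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a \
//       -sgpr-regalloc=greedy -vgpr-regalloc=fast input.ll
//
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a \
//       -misched=gcn-max-occupancy input.ll
//
// The "default" allocator entries defer to useDefaultRegisterAllocator(), so
// createSGPRAllocPass()/createVGPRAllocPass() choose greedy or fast from the
// optimization level unless an allocator is named explicitly.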