//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}
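
// Illustrative note (an assumed example, not taken from this file): the two
// predicates above are what split register allocation into separate SGPR and
// VGPR runs of the generic allocators. Which allocator runs for each class is
// chosen with the hidden flags defined below, along the lines of
//   llc -mtriple=amdgcn -sgpr-regalloc=greedy -vgpr-regalloc=fast ...
// where "basic", "greedy" and "fast" are the names registered further down.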
/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
    defaultSGPRRegAlloc("default",
                        "pick SGPR register allocator based on -O option",
                        useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for VGPRs"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
    "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
    "fast", "fast register allocator", createFastVGPRRegisterAllocator);
}

static cl::opt<bool> EnableSROA(
    "amdgpu-sroa",
    cl::desc("Run SROA after promote alloca pass"),
    cl::ReallyHidden,
    cl::init(true));

static cl::opt<bool>
    EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                            cl::desc("Run early if-conversion"),
                            cl::init(false));

static cl::opt<bool>
    OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                     cl::desc("Run pre-RA exec mask optimizations"),
                     cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
    "amdgpu-load-store-vectorizer",
    cl::desc("Enable load store vectorizer"),
    cl::init(true),
    cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"),
    cl::init(true),
    cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
    "amdgpu-internalize-symbols",
    cl::desc("Enable elimination of non-kernel functions and unused globals"),
    cl::init(false),
    cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all",
    cl::desc("Inline all functions early"),
    cl::init(false),
    cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole",
    cl::desc("Enable SDWA peepholer"),
    cl::init(true));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine",
    cl::desc("Enable DPP combiner"),
    cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis(
    "enable-amdgpu-aa", cl::Hidden,
    cl::desc("Enable AMDGPU Alias Analysis"),
    cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
    "amdgpu-late-structurize",
    cl::desc("Enable late CFG structurization"),
    cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
    cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
    "amdgpu-atomic-optimizations",
    cl::desc("Enable atomic optimizations"),
    cl::init(false),
    cl::Hidden);

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register",
    cl::desc("Enable mode register pass"),
    cl::init(true),
    cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
    EnableDCEInRA("amdgpu-dce-in-ra",
                  cl::init(true), cl::Hidden,
                  cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes",
    cl::desc("Enable scalar IR passes"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLDSReplaceWithPointer(
    "amdgpu-enable-lds-replace-with-pointer",
    cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSIOptimizeVGPRLiveRangePass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUCtorDtorLoweringPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAttributorPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
  initializeAMDGPULowerModuleLDSPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertHardClausesPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIPreEmitPeepholePass(*PR);
  initializeSILateBranchLoweringPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeSIPostRABundlerPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisPass(*PR);
  initializeGCNNSAReassignPass(*PR);
  initializeGCNPreRAOptimizationsPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation());
  DAG->addMutation(createSchedBarrierDAGMutation());
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
    SISchedRegistry("si", "Run SI's custom scheduler",
                    createSIMachineScheduler);
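
// Side note, assumed rather than quoted from the docs: each MachineSchedRegistry
// entry here registers its factory under a name that can be requested on the
// llc command line, roughly
//   llc -mtriple=amdgcn -misched=gcn-ilp ...
// "gcn-max-occupancy" matches what createMachineScheduler() below picks by
// default when no flag is given.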
static MachineSchedRegistry
    GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                                 "Run GCN scheduler to maximize occupancy",
                                 createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
         "-ni:7";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}
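
// For orientation only (a hand-written, assumed example, not copied from a
// test): the two getters above read per-function IR attributes of the form
//   attributes #0 = { "target-cpu"="gfx90a" "target-features"="+wavefrontsize64" }
// and fall back to the TargetMachine-wide CPU/feature strings when a function
// carries no such attributes.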
/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().startswith("__asan_") ||
           F->getName().startswith("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
  bool PromoteKernelArguments =
      EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createFunctionInliningPass();
  }

  Builder.addExtension(
      PassManagerBuilder::EP_ModuleOptimizerEarly,
      [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                                 legacy::PassManagerBase &PM) {
        if (AMDGPUAA) {
          PM.add(createAMDGPUAAWrapperPass());
          PM.add(createAMDGPUExternalAAWrapperPass());
        }
        PM.add(createAMDGPUUnifyMetadataPass());
        PM.add(createAMDGPUPrintfRuntimeBinding());
        if (Internalize)
          PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createAMDGPUPropagateAttributesLatePass(this));
        if (Internalize)
          PM.add(createGlobalDCEPass());
        if (EarlyInline)
          PM.add(createAMDGPUAlwaysInlinePass(false));
      });

  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
                                        legacy::PassManagerBase &PM) {
        if (AMDGPUAA) {
          PM.add(createAMDGPUAAWrapperPass());
          PM.add(createAMDGPUExternalAAWrapperPass());
        }
        PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
        PM.add(llvm::createAMDGPUUseNativeCallsPass());
        if (LibCallSimplify)
          PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
      });

  Builder.addExtension(
      PassManagerBuilder::EP_CGSCCOptimizerLate,
      [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
                                          legacy::PassManagerBase &PM) {
        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (PromoteKernelArguments)
          PM.add(createAMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        PM.add(createInferAddressSpacesPass());

        // This should run after inlining to have any chance of doing anything,
        // and before other cleanup optimizations.
        PM.add(createAMDGPULowerKernelAttributesPass());

        // Promote alloca to vector before SROA and loop unroll. If we manage
        // to eliminate allocas before unroll we may choose to unroll less.
        if (EnableOpt)
          PM.add(createAMDGPUPromoteAllocaToVector());
      });
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, ModulePassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-propagate-attributes-late") {
          PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
          return true;
        }
        if (PassName == "amdgpu-unify-metadata") {
          PM.addPass(AMDGPUUnifyMetadataPass());
          return true;
        }
        if (PassName == "amdgpu-printf-runtime-binding") {
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
          return true;
        }
        if (PassName == "amdgpu-always-inline") {
          PM.addPass(AMDGPUAlwaysInlinePass());
          return true;
        }
        if (PassName == "amdgpu-replace-lds-use-with-pointer") {
          PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
          return true;
        }
        if (PassName == "amdgpu-lower-module-lds") {
          PM.addPass(AMDGPULowerModuleLDSPass());
          return true;
        }
        return false;
      });
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, FunctionPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-simplifylib") {
          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
          return true;
        }
        if (PassName == "amdgpu-usenative") {
          PM.addPass(AMDGPUUseNativeCallsPass());
          return true;
        }
        if (PassName == "amdgpu-promote-alloca") {
          PM.addPass(AMDGPUPromoteAllocaPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-alloca-to-vector") {
          PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
          return true;
        }
        if (PassName == "amdgpu-lower-kernel-attributes") {
          PM.addPass(AMDGPULowerKernelAttributesPass());
          return true;
        }
        if (PassName == "amdgpu-propagate-attributes-early") {
          PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-kernel-arguments") {
          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
          return true;
        }
        return false;
      });

  PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
    FAM.registerPass([&] { return AMDGPUAA(); });
  });

  PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
    if (AAName == "amdgpu-aa") {
      AAM.registerFunctionAnalysis<AMDGPUAA>();
      return true;
    }
    return false;
  });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
        }
        PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
        if (InternalizeSymbols) {
          PM.addPass(GlobalDCEPass());
        }
        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a load of a generic pointer.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::make_pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
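  // As a sketch (hand-written for this comment, not copied from a test), the
  // matched IR looks roughly like:
  //   %s  = call i1 @llvm.amdgcn.is.shared(i8* %p)
  //   %ns = xor i1 %s, true
  //   %pr = call i1 @llvm.amdgcn.is.private(i8* %p)
  //   %np = xor i1 %pr, true
  //   %c  = and i1 %ns, %np
  // and on the path predicated by %c, %p may be treated as a global pointer.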
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::make_pair(nullptr, -1);
}

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
  case PseudoSourceValue::TargetCustom:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    DAG->addMutation(createIGroupLPDAGMutation());
    DAG->addMutation(createSchedBarrierDAGMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}
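
// A rough, hand-written illustration of the interplay above (the IR names are
// made up for this comment): after SeparateConstOffsetFromGEP,
//   gep %p, (%i + 1)   and   gep %p, (%i + 2)
// both expose a shared "gep %p, %i" component, SLSR/NaryReassociate rewrite
// the chained forms in terms of one another, and the trailing EarlyCSE/GVN
// runs are what actually remove the duplicates this creates.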
void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  addPass(createAMDGPUCtorDtorLoweringPass());

  // Run the propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Can increase LDS used by kernel so runs before PromoteAlloca
  if (EnableLowerModuleLDS) {
    // The pass "amdgpu-replace-lds-use-with-pointer" needs to run before the
    // pass "amdgpu-lower-module-lds", and it is only meaningful when the
    // "amdgpu-lower-module-lds" pass is enabled.
    if (EnableLDSReplaceWithPointer)
      addPass(createAMDGPUReplaceLDSUseWithPointerPass());

    addPass(createAMDGPULowerModuleLDSPass());
  }

  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createInferAddressSpacesPass());

  addPass(createAtomicExpandPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    addPass(createAMDGPUAttributorPass());

    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch pass may introduce unreachable blocks that can
  // cause unexpected behavior for subsequent passes. Placing it
  // here seems better, as these blocks would get cleaned up by
  // UnreachableBlockElim inserted next in the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
    MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
  insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOpt::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: when an instruction has a Killed operand and the instruction is
  // inside a bundle, it seems that only the BUNDLE instruction appears as the
  // Kills of the register in LiveVariables. This triggers a failure in the
  // verifier; we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(false));
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }
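
  // For reference, a hand-written sketch (register choices arbitrary, fields
  // abridged) of the MIR block that the argument parsing below consumes:
  //   machineFunctionInfo:
  //     argumentInfo:
  //       privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
  //       kernargSegmentPtr:    { reg: '$sgpr4_sgpr5' }
  //       workGroupIDX:         { reg: '$sgpr6' }
  //       workItemIDX:          { reg: '$vgpr0' }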

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
  MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
  MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
  MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;

  return false;
}