1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// The AMDGPU target machine contains all of the hardware specific 11 /// information needed to emit code for SI+ GPUs. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPUTargetMachine.h" 16 #include "AMDGPU.h" 17 #include "AMDGPUAliasAnalysis.h" 18 #include "AMDGPUExportClustering.h" 19 #include "AMDGPUMFMAClustering.h" 20 #include "AMDGPUMacroFusion.h" 21 #include "AMDGPUTargetObjectFile.h" 22 #include "AMDGPUTargetTransformInfo.h" 23 #include "GCNIterativeScheduler.h" 24 #include "GCNSchedStrategy.h" 25 #include "R600.h" 26 #include "R600TargetMachine.h" 27 #include "SIMachineFunctionInfo.h" 28 #include "SIMachineScheduler.h" 29 #include "TargetInfo/AMDGPUTargetInfo.h" 30 #include "llvm/Analysis/CGSCCPassManager.h" 31 #include "llvm/CodeGen/GlobalISel/CSEInfo.h" 32 #include "llvm/CodeGen/GlobalISel/IRTranslator.h" 33 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" 34 #include "llvm/CodeGen/GlobalISel/Legalizer.h" 35 #include "llvm/CodeGen/GlobalISel/Localizer.h" 36 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" 37 #include "llvm/CodeGen/MIRParser/MIParser.h" 38 #include "llvm/CodeGen/Passes.h" 39 #include "llvm/CodeGen/RegAllocRegistry.h" 40 #include "llvm/CodeGen/TargetPassConfig.h" 41 #include "llvm/IR/IntrinsicsAMDGPU.h" 42 #include "llvm/IR/LegacyPassManager.h" 43 #include "llvm/IR/PassManager.h" 44 #include "llvm/IR/PatternMatch.h" 45 #include "llvm/InitializePasses.h" 46 #include "llvm/MC/TargetRegistry.h" 47 #include "llvm/Passes/PassBuilder.h" 48 #include "llvm/Transforms/IPO.h" 49 #include "llvm/Transforms/IPO/AlwaysInliner.h" 50 #include "llvm/Transforms/IPO/GlobalDCE.h" 51 #include "llvm/Transforms/IPO/Internalize.h" 52 #include "llvm/Transforms/IPO/PassManagerBuilder.h" 53 #include "llvm/Transforms/Scalar.h" 54 #include "llvm/Transforms/Scalar/GVN.h" 55 #include "llvm/Transforms/Scalar/InferAddressSpaces.h" 56 #include "llvm/Transforms/Utils.h" 57 #include "llvm/Transforms/Utils/SimplifyLibCalls.h" 58 #include "llvm/Transforms/Vectorize.h" 59 60 using namespace llvm; 61 using namespace llvm::PatternMatch; 62 63 namespace { 64 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { 65 public: 66 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 67 : RegisterRegAllocBase(N, D, C) {} 68 }; 69 70 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> { 71 public: 72 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 73 : RegisterRegAllocBase(N, D, C) {} 74 }; 75 76 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, 77 const TargetRegisterClass &RC) { 78 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); 79 } 80 81 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, 82 const TargetRegisterClass &RC) { 83 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); 84 } 85 86 87 /// -{sgpr|vgpr}-regalloc=... command line option. 88 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } 89 90 /// A dummy default pass factory indicates whether the register allocator is 91 /// overridden on the command line. 92 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag; 93 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag; 94 95 static SGPRRegisterRegAlloc 96 defaultSGPRRegAlloc("default", 97 "pick SGPR register allocator based on -O option", 98 useDefaultRegisterAllocator); 99 100 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false, 101 RegisterPassParser<SGPRRegisterRegAlloc>> 102 SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), 103 cl::desc("Register allocator to use for SGPRs")); 104 105 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false, 106 RegisterPassParser<VGPRRegisterRegAlloc>> 107 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), 108 cl::desc("Register allocator to use for VGPRs")); 109 110 111 static void initializeDefaultSGPRRegisterAllocatorOnce() { 112 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); 113 114 if (!Ctor) { 115 Ctor = SGPRRegAlloc; 116 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc); 117 } 118 } 119 120 static void initializeDefaultVGPRRegisterAllocatorOnce() { 121 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); 122 123 if (!Ctor) { 124 Ctor = VGPRRegAlloc; 125 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc); 126 } 127 } 128 129 static FunctionPass *createBasicSGPRRegisterAllocator() { 130 return createBasicRegisterAllocator(onlyAllocateSGPRs); 131 } 132 133 static FunctionPass *createGreedySGPRRegisterAllocator() { 134 return createGreedyRegisterAllocator(onlyAllocateSGPRs); 135 } 136 137 static FunctionPass *createFastSGPRRegisterAllocator() { 138 return createFastRegisterAllocator(onlyAllocateSGPRs, false); 139 } 140 141 static FunctionPass *createBasicVGPRRegisterAllocator() { 142 return createBasicRegisterAllocator(onlyAllocateVGPRs); 143 } 144 145 static FunctionPass *createGreedyVGPRRegisterAllocator() { 146 return createGreedyRegisterAllocator(onlyAllocateVGPRs); 147 } 148 149 static FunctionPass *createFastVGPRRegisterAllocator() { 150 return createFastRegisterAllocator(onlyAllocateVGPRs, true); 151 } 152 153 static SGPRRegisterRegAlloc basicRegAllocSGPR( 154 "basic", "basic register allocator", createBasicSGPRRegisterAllocator); 155 static SGPRRegisterRegAlloc greedyRegAllocSGPR( 156 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator); 157 158 static SGPRRegisterRegAlloc fastRegAllocSGPR( 159 "fast", "fast register allocator", createFastSGPRRegisterAllocator); 160 161 162 static VGPRRegisterRegAlloc basicRegAllocVGPR( 163 "basic", "basic register allocator", createBasicVGPRRegisterAllocator); 164 static VGPRRegisterRegAlloc greedyRegAllocVGPR( 165 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator); 166 167 static VGPRRegisterRegAlloc fastRegAllocVGPR( 168 "fast", "fast register allocator", createFastVGPRRegisterAllocator); 169 } 170 171 static cl::opt<bool> EnableSROA( 172 "amdgpu-sroa", 173 cl::desc("Run SROA after promote alloca pass"), 174 cl::ReallyHidden, 175 cl::init(true)); 176 177 static cl::opt<bool> 178 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, 179 cl::desc("Run early if-conversion"), 180 cl::init(false)); 181 182 static cl::opt<bool> 183 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, 184 cl::desc("Run pre-RA exec mask optimizations"), 185 cl::init(true)); 186 187 // Option to disable vectorizer for tests. 188 static cl::opt<bool> EnableLoadStoreVectorizer( 189 "amdgpu-load-store-vectorizer", 190 cl::desc("Enable load store vectorizer"), 191 cl::init(true), 192 cl::Hidden); 193 194 // Option to control global loads scalarization 195 static cl::opt<bool> ScalarizeGlobal( 196 "amdgpu-scalarize-global-loads", 197 cl::desc("Enable global load scalarization"), 198 cl::init(true), 199 cl::Hidden); 200 201 // Option to run internalize pass. 202 static cl::opt<bool> InternalizeSymbols( 203 "amdgpu-internalize-symbols", 204 cl::desc("Enable elimination of non-kernel functions and unused globals"), 205 cl::init(false), 206 cl::Hidden); 207 208 // Option to inline all early. 209 static cl::opt<bool> EarlyInlineAll( 210 "amdgpu-early-inline-all", 211 cl::desc("Inline all functions early"), 212 cl::init(false), 213 cl::Hidden); 214 215 static cl::opt<bool> EnableSDWAPeephole( 216 "amdgpu-sdwa-peephole", 217 cl::desc("Enable SDWA peepholer"), 218 cl::init(true)); 219 220 static cl::opt<bool> EnableDPPCombine( 221 "amdgpu-dpp-combine", 222 cl::desc("Enable DPP combiner"), 223 cl::init(true)); 224 225 // Enable address space based alias analysis 226 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, 227 cl::desc("Enable AMDGPU Alias Analysis"), 228 cl::init(true)); 229 230 // Option to run late CFG structurizer 231 static cl::opt<bool, true> LateCFGStructurize( 232 "amdgpu-late-structurize", 233 cl::desc("Enable late CFG structurization"), 234 cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), 235 cl::Hidden); 236 237 // Enable lib calls simplifications 238 static cl::opt<bool> EnableLibCallSimplify( 239 "amdgpu-simplify-libcall", 240 cl::desc("Enable amdgpu library simplifications"), 241 cl::init(true), 242 cl::Hidden); 243 244 static cl::opt<bool> EnableLowerKernelArguments( 245 "amdgpu-ir-lower-kernel-arguments", 246 cl::desc("Lower kernel argument loads in IR pass"), 247 cl::init(true), 248 cl::Hidden); 249 250 static cl::opt<bool> EnableRegReassign( 251 "amdgpu-reassign-regs", 252 cl::desc("Enable register reassign optimizations on gfx10+"), 253 cl::init(true), 254 cl::Hidden); 255 256 static cl::opt<bool> OptVGPRLiveRange( 257 "amdgpu-opt-vgpr-liverange", 258 cl::desc("Enable VGPR liverange optimizations for if-else structure"), 259 cl::init(true), cl::Hidden); 260 261 // Enable atomic optimization 262 static cl::opt<bool> EnableAtomicOptimizations( 263 "amdgpu-atomic-optimizations", 264 cl::desc("Enable atomic optimizations"), 265 cl::init(false), 266 cl::Hidden); 267 268 // Enable Mode register optimization 269 static cl::opt<bool> EnableSIModeRegisterPass( 270 "amdgpu-mode-register", 271 cl::desc("Enable mode register pass"), 272 cl::init(true), 273 cl::Hidden); 274 275 // Option is used in lit tests to prevent deadcoding of patterns inspected. 276 static cl::opt<bool> 277 EnableDCEInRA("amdgpu-dce-in-ra", 278 cl::init(true), cl::Hidden, 279 cl::desc("Enable machine DCE inside regalloc")); 280 281 static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority", 282 cl::desc("Adjust wave priority"), 283 cl::init(false), cl::Hidden); 284 285 static cl::opt<bool> EnableScalarIRPasses( 286 "amdgpu-scalar-ir-passes", 287 cl::desc("Enable scalar IR passes"), 288 cl::init(true), 289 cl::Hidden); 290 291 static cl::opt<bool> EnableStructurizerWorkarounds( 292 "amdgpu-enable-structurizer-workarounds", 293 cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), 294 cl::Hidden); 295 296 static cl::opt<bool> EnableLDSReplaceWithPointer( 297 "amdgpu-enable-lds-replace-with-pointer", 298 cl::desc("Enable LDS replace with pointer pass"), cl::init(false), 299 cl::Hidden); 300 301 static cl::opt<bool, true> EnableLowerModuleLDS( 302 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), 303 cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), 304 cl::Hidden); 305 306 static cl::opt<bool> EnablePreRAOptimizations( 307 "amdgpu-enable-pre-ra-optimizations", 308 cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), 309 cl::Hidden); 310 311 static cl::opt<bool> EnablePromoteKernelArguments( 312 "amdgpu-enable-promote-kernel-arguments", 313 cl::desc("Enable promotion of flat kernel pointer arguments to global"), 314 cl::Hidden, cl::init(true)); 315 316 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { 317 // Register the target 318 RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); 319 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget()); 320 321 PassRegistry *PR = PassRegistry::getPassRegistry(); 322 initializeR600ClauseMergePassPass(*PR); 323 initializeR600ControlFlowFinalizerPass(*PR); 324 initializeR600PacketizerPass(*PR); 325 initializeR600ExpandSpecialInstrsPassPass(*PR); 326 initializeR600VectorRegMergerPass(*PR); 327 initializeGlobalISel(*PR); 328 initializeAMDGPUDAGToDAGISelPass(*PR); 329 initializeGCNDPPCombinePass(*PR); 330 initializeSILowerI1CopiesPass(*PR); 331 initializeSILowerSGPRSpillsPass(*PR); 332 initializeSIFixSGPRCopiesPass(*PR); 333 initializeSIFixVGPRCopiesPass(*PR); 334 initializeSIFoldOperandsPass(*PR); 335 initializeSIPeepholeSDWAPass(*PR); 336 initializeSIShrinkInstructionsPass(*PR); 337 initializeSIOptimizeExecMaskingPreRAPass(*PR); 338 initializeSIOptimizeVGPRLiveRangePass(*PR); 339 initializeSILoadStoreOptimizerPass(*PR); 340 initializeAMDGPUCtorDtorLoweringPass(*PR); 341 initializeAMDGPUAlwaysInlinePass(*PR); 342 initializeAMDGPUAttributorPass(*PR); 343 initializeAMDGPUAnnotateKernelFeaturesPass(*PR); 344 initializeAMDGPUAnnotateUniformValuesPass(*PR); 345 initializeAMDGPUArgumentUsageInfoPass(*PR); 346 initializeAMDGPUAtomicOptimizerPass(*PR); 347 initializeAMDGPULowerKernelArgumentsPass(*PR); 348 initializeAMDGPUPromoteKernelArgumentsPass(*PR); 349 initializeAMDGPULowerKernelAttributesPass(*PR); 350 initializeAMDGPULowerIntrinsicsPass(*PR); 351 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); 352 initializeAMDGPUPostLegalizerCombinerPass(*PR); 353 initializeAMDGPUPreLegalizerCombinerPass(*PR); 354 initializeAMDGPURegBankCombinerPass(*PR); 355 initializeAMDGPUPromoteAllocaPass(*PR); 356 initializeAMDGPUPromoteAllocaToVectorPass(*PR); 357 initializeAMDGPUCodeGenPreparePass(*PR); 358 initializeAMDGPULateCodeGenPreparePass(*PR); 359 initializeAMDGPUPropagateAttributesEarlyPass(*PR); 360 initializeAMDGPUPropagateAttributesLatePass(*PR); 361 initializeAMDGPUReplaceLDSUseWithPointerPass(*PR); 362 initializeAMDGPULowerModuleLDSPass(*PR); 363 initializeAMDGPURewriteOutArgumentsPass(*PR); 364 initializeAMDGPUUnifyMetadataPass(*PR); 365 initializeSIAnnotateControlFlowPass(*PR); 366 initializeSIInsertHardClausesPass(*PR); 367 initializeSIInsertWaitcntsPass(*PR); 368 initializeSIModeRegisterPass(*PR); 369 initializeSIWholeQuadModePass(*PR); 370 initializeSILowerControlFlowPass(*PR); 371 initializeSIPreEmitPeepholePass(*PR); 372 initializeSILateBranchLoweringPass(*PR); 373 initializeSIMemoryLegalizerPass(*PR); 374 initializeSIOptimizeExecMaskingPass(*PR); 375 initializeSIPreAllocateWWMRegsPass(*PR); 376 initializeSIFormMemoryClausesPass(*PR); 377 initializeSIPostRABundlerPass(*PR); 378 initializeAMDGPUUnifyDivergentExitNodesPass(*PR); 379 initializeAMDGPUAAWrapperPassPass(*PR); 380 initializeAMDGPUExternalAAWrapperPass(*PR); 381 initializeAMDGPUUseNativeCallsPass(*PR); 382 initializeAMDGPUSimplifyLibCallsPass(*PR); 383 initializeAMDGPUPrintfRuntimeBindingPass(*PR); 384 initializeAMDGPUResourceUsageAnalysisPass(*PR); 385 initializeGCNNSAReassignPass(*PR); 386 initializeGCNPreRAOptimizationsPass(*PR); 387 } 388 389 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { 390 return std::make_unique<AMDGPUTargetObjectFile>(); 391 } 392 393 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { 394 return new SIScheduleDAGMI(C); 395 } 396 397 static ScheduleDAGInstrs * 398 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { 399 ScheduleDAGMILive *DAG = 400 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C)); 401 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 402 DAG->addMutation(createMFMAClusterDAGMutation()); 403 DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); 404 DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); 405 return DAG; 406 } 407 408 static ScheduleDAGInstrs * 409 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { 410 auto DAG = new GCNIterativeScheduler(C, 411 GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); 412 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 413 return DAG; 414 } 415 416 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { 417 return new GCNIterativeScheduler(C, 418 GCNIterativeScheduler::SCHEDULE_MINREGFORCED); 419 } 420 421 static ScheduleDAGInstrs * 422 createIterativeILPMachineScheduler(MachineSchedContext *C) { 423 auto DAG = new GCNIterativeScheduler(C, 424 GCNIterativeScheduler::SCHEDULE_ILP); 425 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 426 DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); 427 return DAG; 428 } 429 430 static MachineSchedRegistry 431 SISchedRegistry("si", "Run SI's custom scheduler", 432 createSIMachineScheduler); 433 434 static MachineSchedRegistry 435 GCNMaxOccupancySchedRegistry("gcn-max-occupancy", 436 "Run GCN scheduler to maximize occupancy", 437 createGCNMaxOccupancyMachineScheduler); 438 439 static MachineSchedRegistry 440 IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental", 441 "Run GCN scheduler to maximize occupancy (experimental)", 442 createIterativeGCNMaxOccupancyMachineScheduler); 443 444 static MachineSchedRegistry 445 GCNMinRegSchedRegistry("gcn-minreg", 446 "Run GCN iterative scheduler for minimal register usage (experimental)", 447 createMinRegScheduler); 448 449 static MachineSchedRegistry 450 GCNILPSchedRegistry("gcn-ilp", 451 "Run GCN iterative scheduler for ILP scheduling (experimental)", 452 createIterativeILPMachineScheduler); 453 454 static StringRef computeDataLayout(const Triple &TT) { 455 if (TT.getArch() == Triple::r600) { 456 // 32-bit pointers. 457 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 458 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"; 459 } 460 461 // 32-bit private, local, and region pointers. 64-bit global, constant and 462 // flat, non-integral buffer fat pointers. 463 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" 464 "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 465 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1" 466 "-ni:7"; 467 } 468 469 LLVM_READNONE 470 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { 471 if (!GPU.empty()) 472 return GPU; 473 474 // Need to default to a target with flat support for HSA. 475 if (TT.getArch() == Triple::amdgcn) 476 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic"; 477 478 return "r600"; 479 } 480 481 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { 482 // The AMDGPU toolchain only supports generating shared objects, so we 483 // must always use PIC. 484 return Reloc::PIC_; 485 } 486 487 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, 488 StringRef CPU, StringRef FS, 489 TargetOptions Options, 490 Optional<Reloc::Model> RM, 491 Optional<CodeModel::Model> CM, 492 CodeGenOpt::Level OptLevel) 493 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), 494 FS, Options, getEffectiveRelocModel(RM), 495 getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), 496 TLOF(createTLOF(getTargetTriple())) { 497 initAsmInfo(); 498 if (TT.getArch() == Triple::amdgcn) { 499 if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64")) 500 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64)); 501 else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32")) 502 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32)); 503 } 504 } 505 506 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; 507 bool AMDGPUTargetMachine::EnableFunctionCalls = false; 508 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; 509 510 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; 511 512 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { 513 Attribute GPUAttr = F.getFnAttribute("target-cpu"); 514 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU(); 515 } 516 517 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { 518 Attribute FSAttr = F.getFnAttribute("target-features"); 519 520 return FSAttr.isValid() ? FSAttr.getValueAsString() 521 : getTargetFeatureString(); 522 } 523 524 /// Predicate for Internalize pass. 525 static bool mustPreserveGV(const GlobalValue &GV) { 526 if (const Function *F = dyn_cast<Function>(&GV)) 527 return F->isDeclaration() || F->getName().startswith("__asan_") || 528 F->getName().startswith("__sanitizer_") || 529 AMDGPU::isEntryFunctionCC(F->getCallingConv()); 530 531 GV.removeDeadConstantUsers(); 532 return !GV.use_empty(); 533 } 534 535 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { 536 Builder.DivergentTarget = true; 537 538 bool EnableOpt = getOptLevel() > CodeGenOpt::None; 539 bool Internalize = InternalizeSymbols; 540 bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls; 541 bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; 542 bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; 543 bool PromoteKernelArguments = 544 EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less; 545 546 if (EnableFunctionCalls) { 547 delete Builder.Inliner; 548 Builder.Inliner = createFunctionInliningPass(); 549 } 550 551 Builder.addExtension( 552 PassManagerBuilder::EP_ModuleOptimizerEarly, 553 [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &, 554 legacy::PassManagerBase &PM) { 555 if (AMDGPUAA) { 556 PM.add(createAMDGPUAAWrapperPass()); 557 PM.add(createAMDGPUExternalAAWrapperPass()); 558 } 559 PM.add(createAMDGPUUnifyMetadataPass()); 560 PM.add(createAMDGPUPrintfRuntimeBinding()); 561 if (Internalize) 562 PM.add(createInternalizePass(mustPreserveGV)); 563 PM.add(createAMDGPUPropagateAttributesLatePass(this)); 564 if (Internalize) 565 PM.add(createGlobalDCEPass()); 566 if (EarlyInline) 567 PM.add(createAMDGPUAlwaysInlinePass(false)); 568 }); 569 570 Builder.addExtension( 571 PassManagerBuilder::EP_EarlyAsPossible, 572 [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &, 573 legacy::PassManagerBase &PM) { 574 if (AMDGPUAA) { 575 PM.add(createAMDGPUAAWrapperPass()); 576 PM.add(createAMDGPUExternalAAWrapperPass()); 577 } 578 PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this)); 579 PM.add(llvm::createAMDGPUUseNativeCallsPass()); 580 if (LibCallSimplify) 581 PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this)); 582 }); 583 584 Builder.addExtension( 585 PassManagerBuilder::EP_CGSCCOptimizerLate, 586 [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &, 587 legacy::PassManagerBase &PM) { 588 // Add promote kernel arguments pass to the opt pipeline right before 589 // infer address spaces which is needed to do actual address space 590 // rewriting. 591 if (PromoteKernelArguments) 592 PM.add(createAMDGPUPromoteKernelArgumentsPass()); 593 594 // Add infer address spaces pass to the opt pipeline after inlining 595 // but before SROA to increase SROA opportunities. 596 PM.add(createInferAddressSpacesPass()); 597 598 // This should run after inlining to have any chance of doing anything, 599 // and before other cleanup optimizations. 600 PM.add(createAMDGPULowerKernelAttributesPass()); 601 602 // Promote alloca to vector before SROA and loop unroll. If we manage 603 // to eliminate allocas before unroll we may choose to unroll less. 604 if (EnableOpt) 605 PM.add(createAMDGPUPromoteAllocaToVector()); 606 }); 607 } 608 609 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { 610 AAM.registerFunctionAnalysis<AMDGPUAA>(); 611 } 612 613 void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { 614 PB.registerPipelineParsingCallback( 615 [this](StringRef PassName, ModulePassManager &PM, 616 ArrayRef<PassBuilder::PipelineElement>) { 617 if (PassName == "amdgpu-propagate-attributes-late") { 618 PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); 619 return true; 620 } 621 if (PassName == "amdgpu-unify-metadata") { 622 PM.addPass(AMDGPUUnifyMetadataPass()); 623 return true; 624 } 625 if (PassName == "amdgpu-printf-runtime-binding") { 626 PM.addPass(AMDGPUPrintfRuntimeBindingPass()); 627 return true; 628 } 629 if (PassName == "amdgpu-always-inline") { 630 PM.addPass(AMDGPUAlwaysInlinePass()); 631 return true; 632 } 633 if (PassName == "amdgpu-replace-lds-use-with-pointer") { 634 PM.addPass(AMDGPUReplaceLDSUseWithPointerPass()); 635 return true; 636 } 637 if (PassName == "amdgpu-lower-module-lds") { 638 PM.addPass(AMDGPULowerModuleLDSPass()); 639 return true; 640 } 641 return false; 642 }); 643 PB.registerPipelineParsingCallback( 644 [this](StringRef PassName, FunctionPassManager &PM, 645 ArrayRef<PassBuilder::PipelineElement>) { 646 if (PassName == "amdgpu-simplifylib") { 647 PM.addPass(AMDGPUSimplifyLibCallsPass(*this)); 648 return true; 649 } 650 if (PassName == "amdgpu-usenative") { 651 PM.addPass(AMDGPUUseNativeCallsPass()); 652 return true; 653 } 654 if (PassName == "amdgpu-promote-alloca") { 655 PM.addPass(AMDGPUPromoteAllocaPass(*this)); 656 return true; 657 } 658 if (PassName == "amdgpu-promote-alloca-to-vector") { 659 PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); 660 return true; 661 } 662 if (PassName == "amdgpu-lower-kernel-attributes") { 663 PM.addPass(AMDGPULowerKernelAttributesPass()); 664 return true; 665 } 666 if (PassName == "amdgpu-propagate-attributes-early") { 667 PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); 668 return true; 669 } 670 if (PassName == "amdgpu-promote-kernel-arguments") { 671 PM.addPass(AMDGPUPromoteKernelArgumentsPass()); 672 return true; 673 } 674 return false; 675 }); 676 677 PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) { 678 FAM.registerPass([&] { return AMDGPUAA(); }); 679 }); 680 681 PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) { 682 if (AAName == "amdgpu-aa") { 683 AAM.registerFunctionAnalysis<AMDGPUAA>(); 684 return true; 685 } 686 return false; 687 }); 688 689 PB.registerPipelineStartEPCallback( 690 [this](ModulePassManager &PM, OptimizationLevel Level) { 691 FunctionPassManager FPM; 692 FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); 693 FPM.addPass(AMDGPUUseNativeCallsPass()); 694 if (EnableLibCallSimplify && Level != OptimizationLevel::O0) 695 FPM.addPass(AMDGPUSimplifyLibCallsPass(*this)); 696 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 697 }); 698 699 PB.registerPipelineEarlySimplificationEPCallback( 700 [this](ModulePassManager &PM, OptimizationLevel Level) { 701 if (Level == OptimizationLevel::O0) 702 return; 703 704 PM.addPass(AMDGPUUnifyMetadataPass()); 705 PM.addPass(AMDGPUPrintfRuntimeBindingPass()); 706 707 if (InternalizeSymbols) { 708 PM.addPass(InternalizePass(mustPreserveGV)); 709 } 710 PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); 711 if (InternalizeSymbols) { 712 PM.addPass(GlobalDCEPass()); 713 } 714 if (EarlyInlineAll && !EnableFunctionCalls) 715 PM.addPass(AMDGPUAlwaysInlinePass()); 716 }); 717 718 PB.registerCGSCCOptimizerLateEPCallback( 719 [this](CGSCCPassManager &PM, OptimizationLevel Level) { 720 if (Level == OptimizationLevel::O0) 721 return; 722 723 FunctionPassManager FPM; 724 725 // Add promote kernel arguments pass to the opt pipeline right before 726 // infer address spaces which is needed to do actual address space 727 // rewriting. 728 if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() && 729 EnablePromoteKernelArguments) 730 FPM.addPass(AMDGPUPromoteKernelArgumentsPass()); 731 732 // Add infer address spaces pass to the opt pipeline after inlining 733 // but before SROA to increase SROA opportunities. 734 FPM.addPass(InferAddressSpacesPass()); 735 736 // This should run after inlining to have any chance of doing 737 // anything, and before other cleanup optimizations. 738 FPM.addPass(AMDGPULowerKernelAttributesPass()); 739 740 if (Level != OptimizationLevel::O0) { 741 // Promote alloca to vector before SROA and loop unroll. If we 742 // manage to eliminate allocas before unroll we may choose to unroll 743 // less. 744 FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); 745 } 746 747 PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); 748 }); 749 } 750 751 int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) { 752 return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || 753 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || 754 AddrSpace == AMDGPUAS::REGION_ADDRESS) 755 ? -1 756 : 0; 757 } 758 759 bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS, 760 unsigned DestAS) const { 761 return AMDGPU::isFlatGlobalAddrSpace(SrcAS) && 762 AMDGPU::isFlatGlobalAddrSpace(DestAS); 763 } 764 765 unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const { 766 const auto *LD = dyn_cast<LoadInst>(V); 767 if (!LD) 768 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; 769 770 // It must be a generic pointer loaded. 771 assert(V->getType()->isPointerTy() && 772 V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS); 773 774 const auto *Ptr = LD->getPointerOperand(); 775 if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) 776 return AMDGPUAS::UNKNOWN_ADDRESS_SPACE; 777 // For a generic pointer loaded from the constant memory, it could be assumed 778 // as a global pointer since the constant memory is only populated on the 779 // host side. As implied by the offload programming model, only global 780 // pointers could be referenced on the host side. 781 return AMDGPUAS::GLOBAL_ADDRESS; 782 } 783 784 std::pair<const Value *, unsigned> 785 AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const { 786 if (auto *II = dyn_cast<IntrinsicInst>(V)) { 787 switch (II->getIntrinsicID()) { 788 case Intrinsic::amdgcn_is_shared: 789 return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS); 790 case Intrinsic::amdgcn_is_private: 791 return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS); 792 default: 793 break; 794 } 795 return std::make_pair(nullptr, -1); 796 } 797 // Check the global pointer predication based on 798 // (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and 799 // the order of 'is_shared' and 'is_private' is not significant. 800 Value *Ptr; 801 if (match( 802 const_cast<Value *>(V), 803 m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))), 804 m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>( 805 m_Deferred(Ptr)))))) 806 return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS); 807 808 return std::make_pair(nullptr, -1); 809 } 810 811 //===----------------------------------------------------------------------===// 812 // GCN Target Machine (SI+) 813 //===----------------------------------------------------------------------===// 814 815 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, 816 StringRef CPU, StringRef FS, 817 TargetOptions Options, 818 Optional<Reloc::Model> RM, 819 Optional<CodeModel::Model> CM, 820 CodeGenOpt::Level OL, bool JIT) 821 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} 822 823 const TargetSubtargetInfo * 824 GCNTargetMachine::getSubtargetImpl(const Function &F) const { 825 StringRef GPU = getGPUName(F); 826 StringRef FS = getFeatureString(F); 827 828 SmallString<128> SubtargetKey(GPU); 829 SubtargetKey.append(FS); 830 831 auto &I = SubtargetMap[SubtargetKey]; 832 if (!I) { 833 // This needs to be done before we create a new subtarget since any 834 // creation will depend on the TM and the code generation flags on the 835 // function that reside in TargetOptions. 836 resetTargetOptions(F); 837 I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); 838 } 839 840 I->setScalarizeGlobalBehavior(ScalarizeGlobal); 841 842 return I.get(); 843 } 844 845 TargetTransformInfo 846 GCNTargetMachine::getTargetTransformInfo(const Function &F) const { 847 return TargetTransformInfo(GCNTTIImpl(this, F)); 848 } 849 850 //===----------------------------------------------------------------------===// 851 // AMDGPU Pass Setup 852 //===----------------------------------------------------------------------===// 853 854 std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const { 855 return getStandardCSEConfigForOpt(TM->getOptLevel()); 856 } 857 858 namespace { 859 860 class GCNPassConfig final : public AMDGPUPassConfig { 861 public: 862 GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) 863 : AMDGPUPassConfig(TM, PM) { 864 // It is necessary to know the register usage of the entire call graph. We 865 // allow calls without EnableAMDGPUFunctionCalls if they are marked 866 // noinline, so this is always required. 867 setRequiresCodeGenSCCOrder(true); 868 substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); 869 } 870 871 GCNTargetMachine &getGCNTargetMachine() const { 872 return getTM<GCNTargetMachine>(); 873 } 874 875 ScheduleDAGInstrs * 876 createMachineScheduler(MachineSchedContext *C) const override; 877 878 ScheduleDAGInstrs * 879 createPostMachineScheduler(MachineSchedContext *C) const override { 880 ScheduleDAGMI *DAG = createGenericSchedPostRA(C); 881 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); 882 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 883 DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); 884 DAG->addMutation(createMFMAClusterDAGMutation()); 885 return DAG; 886 } 887 888 bool addPreISel() override; 889 void addMachineSSAOptimization() override; 890 bool addILPOpts() override; 891 bool addInstSelector() override; 892 bool addIRTranslator() override; 893 void addPreLegalizeMachineIR() override; 894 bool addLegalizeMachineIR() override; 895 void addPreRegBankSelect() override; 896 bool addRegBankSelect() override; 897 void addPreGlobalInstructionSelect() override; 898 bool addGlobalInstructionSelect() override; 899 void addFastRegAlloc() override; 900 void addOptimizedRegAlloc() override; 901 902 FunctionPass *createSGPRAllocPass(bool Optimized); 903 FunctionPass *createVGPRAllocPass(bool Optimized); 904 FunctionPass *createRegAllocPass(bool Optimized) override; 905 906 bool addRegAssignAndRewriteFast() override; 907 bool addRegAssignAndRewriteOptimized() override; 908 909 void addPreRegAlloc() override; 910 bool addPreRewrite() override; 911 void addPostRegAlloc() override; 912 void addPreSched2() override; 913 void addPreEmitPass() override; 914 }; 915 916 } // end anonymous namespace 917 918 AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) 919 : TargetPassConfig(TM, PM) { 920 // Exceptions and StackMaps are not supported, so these passes will never do 921 // anything. 922 disablePass(&StackMapLivenessID); 923 disablePass(&FuncletLayoutID); 924 // Garbage collection is not supported. 925 disablePass(&GCLoweringID); 926 disablePass(&ShadowStackGCLoweringID); 927 } 928 929 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { 930 if (getOptLevel() == CodeGenOpt::Aggressive) 931 addPass(createGVNPass()); 932 else 933 addPass(createEarlyCSEPass()); 934 } 935 936 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { 937 addPass(createLICMPass()); 938 addPass(createSeparateConstOffsetFromGEPPass()); 939 addPass(createSpeculativeExecutionPass()); 940 // ReassociateGEPs exposes more opportunities for SLSR. See 941 // the example in reassociate-geps-and-slsr.ll. 942 addPass(createStraightLineStrengthReducePass()); 943 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or 944 // EarlyCSE can reuse. 945 addEarlyCSEOrGVNPass(); 946 // Run NaryReassociate after EarlyCSE/GVN to be more effective. 947 addPass(createNaryReassociatePass()); 948 // NaryReassociate on GEPs creates redundant common expressions, so run 949 // EarlyCSE after it. 950 addPass(createEarlyCSEPass()); 951 } 952 953 void AMDGPUPassConfig::addIRPasses() { 954 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); 955 956 // There is no reason to run these. 957 disablePass(&StackMapLivenessID); 958 disablePass(&FuncletLayoutID); 959 disablePass(&PatchableFunctionID); 960 961 addPass(createAMDGPUPrintfRuntimeBinding()); 962 addPass(createAMDGPUCtorDtorLoweringPass()); 963 964 // A call to propagate attributes pass in the backend in case opt was not run. 965 addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); 966 967 addPass(createAMDGPULowerIntrinsicsPass()); 968 969 // Function calls are not supported, so make sure we inline everything. 970 addPass(createAMDGPUAlwaysInlinePass()); 971 addPass(createAlwaysInlinerLegacyPass()); 972 // We need to add the barrier noop pass, otherwise adding the function 973 // inlining pass will cause all of the PassConfigs passes to be run 974 // one function at a time, which means if we have a module with two 975 // functions, then we will generate code for the first function 976 // without ever running any passes on the second. 977 addPass(createBarrierNoopPass()); 978 979 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. 980 if (TM.getTargetTriple().getArch() == Triple::r600) 981 addPass(createR600OpenCLImageTypeLoweringPass()); 982 983 // Replace OpenCL enqueued block function pointers with global variables. 984 addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); 985 986 // Can increase LDS used by kernel so runs before PromoteAlloca 987 if (EnableLowerModuleLDS) { 988 // The pass "amdgpu-replace-lds-use-with-pointer" need to be run before the 989 // pass "amdgpu-lower-module-lds", and also it required to be run only if 990 // "amdgpu-lower-module-lds" pass is enabled. 991 if (EnableLDSReplaceWithPointer) 992 addPass(createAMDGPUReplaceLDSUseWithPointerPass()); 993 994 addPass(createAMDGPULowerModuleLDSPass()); 995 } 996 997 if (TM.getOptLevel() > CodeGenOpt::None) 998 addPass(createInferAddressSpacesPass()); 999 1000 addPass(createAtomicExpandPass()); 1001 1002 if (TM.getOptLevel() > CodeGenOpt::None) { 1003 addPass(createAMDGPUPromoteAlloca()); 1004 1005 if (EnableSROA) 1006 addPass(createSROAPass()); 1007 if (isPassEnabled(EnableScalarIRPasses)) 1008 addStraightLineScalarOptimizationPasses(); 1009 1010 if (EnableAMDGPUAliasAnalysis) { 1011 addPass(createAMDGPUAAWrapperPass()); 1012 addPass(createExternalAAWrapperPass([](Pass &P, Function &, 1013 AAResults &AAR) { 1014 if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) 1015 AAR.addAAResult(WrapperPass->getResult()); 1016 })); 1017 } 1018 1019 if (TM.getTargetTriple().getArch() == Triple::amdgcn) { 1020 // TODO: May want to move later or split into an early and late one. 1021 addPass(createAMDGPUCodeGenPreparePass()); 1022 } 1023 } 1024 1025 TargetPassConfig::addIRPasses(); 1026 1027 // EarlyCSE is not always strong enough to clean up what LSR produces. For 1028 // example, GVN can combine 1029 // 1030 // %0 = add %a, %b 1031 // %1 = add %b, %a 1032 // 1033 // and 1034 // 1035 // %0 = shl nsw %a, 2 1036 // %1 = shl %a, 2 1037 // 1038 // but EarlyCSE can do neither of them. 1039 if (isPassEnabled(EnableScalarIRPasses)) 1040 addEarlyCSEOrGVNPass(); 1041 } 1042 1043 void AMDGPUPassConfig::addCodeGenPrepare() { 1044 if (TM->getTargetTriple().getArch() == Triple::amdgcn) { 1045 addPass(createAMDGPUAttributorPass()); 1046 1047 // FIXME: This pass adds 2 hacky attributes that can be replaced with an 1048 // analysis, and should be removed. 1049 addPass(createAMDGPUAnnotateKernelFeaturesPass()); 1050 } 1051 1052 if (TM->getTargetTriple().getArch() == Triple::amdgcn && 1053 EnableLowerKernelArguments) 1054 addPass(createAMDGPULowerKernelArgumentsPass()); 1055 1056 TargetPassConfig::addCodeGenPrepare(); 1057 1058 if (isPassEnabled(EnableLoadStoreVectorizer)) 1059 addPass(createLoadStoreVectorizerPass()); 1060 1061 // LowerSwitch pass may introduce unreachable blocks that can 1062 // cause unexpected behavior for subsequent passes. Placing it 1063 // here seems better that these blocks would get cleaned up by 1064 // UnreachableBlockElim inserted next in the pass flow. 1065 addPass(createLowerSwitchPass()); 1066 } 1067 1068 bool AMDGPUPassConfig::addPreISel() { 1069 if (TM->getOptLevel() > CodeGenOpt::None) 1070 addPass(createFlattenCFGPass()); 1071 return false; 1072 } 1073 1074 bool AMDGPUPassConfig::addInstSelector() { 1075 addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel())); 1076 return false; 1077 } 1078 1079 bool AMDGPUPassConfig::addGCPasses() { 1080 // Do nothing. GC is not supported. 1081 return false; 1082 } 1083 1084 llvm::ScheduleDAGInstrs * 1085 AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const { 1086 ScheduleDAGMILive *DAG = createGenericSchedLive(C); 1087 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 1088 return DAG; 1089 } 1090 1091 //===----------------------------------------------------------------------===// 1092 // GCN Pass Setup 1093 //===----------------------------------------------------------------------===// 1094 1095 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( 1096 MachineSchedContext *C) const { 1097 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); 1098 if (ST.enableSIScheduler()) 1099 return createSIMachineScheduler(C); 1100 return createGCNMaxOccupancyMachineScheduler(C); 1101 } 1102 1103 bool GCNPassConfig::addPreISel() { 1104 AMDGPUPassConfig::addPreISel(); 1105 1106 if (TM->getOptLevel() > CodeGenOpt::None) 1107 addPass(createAMDGPULateCodeGenPreparePass()); 1108 1109 if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) { 1110 addPass(createAMDGPUAtomicOptimizerPass()); 1111 } 1112 1113 if (TM->getOptLevel() > CodeGenOpt::None) 1114 addPass(createSinkingPass()); 1115 1116 // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit 1117 // regions formed by them. 1118 addPass(&AMDGPUUnifyDivergentExitNodesID); 1119 if (!LateCFGStructurize) { 1120 if (EnableStructurizerWorkarounds) { 1121 addPass(createFixIrreduciblePass()); 1122 addPass(createUnifyLoopExitsPass()); 1123 } 1124 addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions 1125 } 1126 addPass(createAMDGPUAnnotateUniformValues()); 1127 if (!LateCFGStructurize) { 1128 addPass(createSIAnnotateControlFlowPass()); 1129 } 1130 addPass(createLCSSAPass()); 1131 1132 if (TM->getOptLevel() > CodeGenOpt::Less) 1133 addPass(&AMDGPUPerfHintAnalysisID); 1134 1135 return false; 1136 } 1137 1138 void GCNPassConfig::addMachineSSAOptimization() { 1139 TargetPassConfig::addMachineSSAOptimization(); 1140 1141 // We want to fold operands after PeepholeOptimizer has run (or as part of 1142 // it), because it will eliminate extra copies making it easier to fold the 1143 // real source operand. We want to eliminate dead instructions after, so that 1144 // we see fewer uses of the copies. We then need to clean up the dead 1145 // instructions leftover after the operands are folded as well. 1146 // 1147 // XXX - Can we get away without running DeadMachineInstructionElim again? 1148 addPass(&SIFoldOperandsID); 1149 if (EnableDPPCombine) 1150 addPass(&GCNDPPCombineID); 1151 addPass(&SILoadStoreOptimizerID); 1152 if (isPassEnabled(EnableSDWAPeephole)) { 1153 addPass(&SIPeepholeSDWAID); 1154 addPass(&EarlyMachineLICMID); 1155 addPass(&MachineCSEID); 1156 addPass(&SIFoldOperandsID); 1157 } 1158 addPass(&DeadMachineInstructionElimID); 1159 addPass(createSIShrinkInstructionsPass()); 1160 } 1161 1162 bool GCNPassConfig::addILPOpts() { 1163 if (EnableEarlyIfConversion) 1164 addPass(&EarlyIfConverterID); 1165 1166 TargetPassConfig::addILPOpts(); 1167 return false; 1168 } 1169 1170 bool GCNPassConfig::addInstSelector() { 1171 AMDGPUPassConfig::addInstSelector(); 1172 addPass(&SIFixSGPRCopiesID); 1173 addPass(createSILowerI1CopiesPass()); 1174 return false; 1175 } 1176 1177 bool GCNPassConfig::addIRTranslator() { 1178 addPass(new IRTranslator(getOptLevel())); 1179 return false; 1180 } 1181 1182 void GCNPassConfig::addPreLegalizeMachineIR() { 1183 bool IsOptNone = getOptLevel() == CodeGenOpt::None; 1184 addPass(createAMDGPUPreLegalizeCombiner(IsOptNone)); 1185 addPass(new Localizer()); 1186 } 1187 1188 bool GCNPassConfig::addLegalizeMachineIR() { 1189 addPass(new Legalizer()); 1190 return false; 1191 } 1192 1193 void GCNPassConfig::addPreRegBankSelect() { 1194 bool IsOptNone = getOptLevel() == CodeGenOpt::None; 1195 addPass(createAMDGPUPostLegalizeCombiner(IsOptNone)); 1196 } 1197 1198 bool GCNPassConfig::addRegBankSelect() { 1199 addPass(new RegBankSelect()); 1200 return false; 1201 } 1202 1203 void GCNPassConfig::addPreGlobalInstructionSelect() { 1204 bool IsOptNone = getOptLevel() == CodeGenOpt::None; 1205 addPass(createAMDGPURegBankCombiner(IsOptNone)); 1206 } 1207 1208 bool GCNPassConfig::addGlobalInstructionSelect() { 1209 addPass(new InstructionSelect(getOptLevel())); 1210 return false; 1211 } 1212 1213 void GCNPassConfig::addPreRegAlloc() { 1214 if (LateCFGStructurize) { 1215 addPass(createAMDGPUMachineCFGStructurizerPass()); 1216 } 1217 } 1218 1219 void GCNPassConfig::addFastRegAlloc() { 1220 // FIXME: We have to disable the verifier here because of PHIElimination + 1221 // TwoAddressInstructions disabling it. 1222 1223 // This must be run immediately after phi elimination and before 1224 // TwoAddressInstructions, otherwise the processing of the tied operand of 1225 // SI_ELSE will introduce a copy of the tied operand source after the else. 1226 insertPass(&PHIEliminationID, &SILowerControlFlowID); 1227 1228 insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID); 1229 insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID); 1230 1231 TargetPassConfig::addFastRegAlloc(); 1232 } 1233 1234 void GCNPassConfig::addOptimizedRegAlloc() { 1235 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation 1236 // instructions that cause scheduling barriers. 1237 insertPass(&MachineSchedulerID, &SIWholeQuadModeID); 1238 insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID); 1239 1240 if (OptExecMaskPreRA) 1241 insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); 1242 1243 if (isPassEnabled(EnablePreRAOptimizations)) 1244 insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID); 1245 1246 // This is not an essential optimization and it has a noticeable impact on 1247 // compilation time, so we only enable it from O2. 1248 if (TM->getOptLevel() > CodeGenOpt::Less) 1249 insertPass(&MachineSchedulerID, &SIFormMemoryClausesID); 1250 1251 // FIXME: when an instruction has a Killed operand, and the instruction is 1252 // inside a bundle, seems only the BUNDLE instruction appears as the Kills of 1253 // the register in LiveVariables, this would trigger a failure in verifier, 1254 // we should fix it and enable the verifier. 1255 if (OptVGPRLiveRange) 1256 insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID); 1257 // This must be run immediately after phi elimination and before 1258 // TwoAddressInstructions, otherwise the processing of the tied operand of 1259 // SI_ELSE will introduce a copy of the tied operand source after the else. 1260 insertPass(&PHIEliminationID, &SILowerControlFlowID); 1261 1262 if (EnableDCEInRA) 1263 insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); 1264 1265 TargetPassConfig::addOptimizedRegAlloc(); 1266 } 1267 1268 bool GCNPassConfig::addPreRewrite() { 1269 if (EnableRegReassign) 1270 addPass(&GCNNSAReassignID); 1271 return true; 1272 } 1273 1274 FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) { 1275 // Initialize the global default. 1276 llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag, 1277 initializeDefaultSGPRRegisterAllocatorOnce); 1278 1279 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); 1280 if (Ctor != useDefaultRegisterAllocator) 1281 return Ctor(); 1282 1283 if (Optimized) 1284 return createGreedyRegisterAllocator(onlyAllocateSGPRs); 1285 1286 return createFastRegisterAllocator(onlyAllocateSGPRs, false); 1287 } 1288 1289 FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) { 1290 // Initialize the global default. 1291 llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag, 1292 initializeDefaultVGPRRegisterAllocatorOnce); 1293 1294 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); 1295 if (Ctor != useDefaultRegisterAllocator) 1296 return Ctor(); 1297 1298 if (Optimized) 1299 return createGreedyVGPRRegisterAllocator(); 1300 1301 return createFastVGPRRegisterAllocator(); 1302 } 1303 1304 FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) { 1305 llvm_unreachable("should not be used"); 1306 } 1307 1308 static const char RegAllocOptNotSupportedMessage[] = 1309 "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc"; 1310 1311 bool GCNPassConfig::addRegAssignAndRewriteFast() { 1312 if (!usingDefaultRegAlloc()) 1313 report_fatal_error(RegAllocOptNotSupportedMessage); 1314 1315 addPass(createSGPRAllocPass(false)); 1316 1317 // Equivalent of PEI for SGPRs. 1318 addPass(&SILowerSGPRSpillsID); 1319 1320 addPass(createVGPRAllocPass(false)); 1321 return true; 1322 } 1323 1324 bool GCNPassConfig::addRegAssignAndRewriteOptimized() { 1325 if (!usingDefaultRegAlloc()) 1326 report_fatal_error(RegAllocOptNotSupportedMessage); 1327 1328 addPass(createSGPRAllocPass(true)); 1329 1330 // Commit allocated register changes. This is mostly necessary because too 1331 // many things rely on the use lists of the physical registers, such as the 1332 // verifier. This is only necessary with allocators which use LiveIntervals, 1333 // since FastRegAlloc does the replacements itself. 1334 addPass(createVirtRegRewriter(false)); 1335 1336 // Equivalent of PEI for SGPRs. 1337 addPass(&SILowerSGPRSpillsID); 1338 1339 addPass(createVGPRAllocPass(true)); 1340 1341 addPreRewrite(); 1342 addPass(&VirtRegRewriterID); 1343 1344 return true; 1345 } 1346 1347 void GCNPassConfig::addPostRegAlloc() { 1348 addPass(&SIFixVGPRCopiesID); 1349 if (getOptLevel() > CodeGenOpt::None) 1350 addPass(&SIOptimizeExecMaskingID); 1351 TargetPassConfig::addPostRegAlloc(); 1352 } 1353 1354 void GCNPassConfig::addPreSched2() { 1355 if (TM->getOptLevel() > CodeGenOpt::None) 1356 addPass(createSIShrinkInstructionsPass()); 1357 addPass(&SIPostRABundlerID); 1358 } 1359 1360 void GCNPassConfig::addPreEmitPass() { 1361 addPass(createSIMemoryLegalizerPass()); 1362 addPass(createSIInsertWaitcntsPass()); 1363 1364 addPass(createSIModeRegisterPass()); 1365 1366 if (getOptLevel() > CodeGenOpt::None) 1367 addPass(&SIInsertHardClausesID); 1368 1369 addPass(&SILateBranchLoweringPassID); 1370 if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less)) 1371 addPass(createAMDGPUSetWavePriorityPass()); 1372 if (getOptLevel() > CodeGenOpt::None) 1373 addPass(&SIPreEmitPeepholeID); 1374 // The hazard recognizer that runs as part of the post-ra scheduler does not 1375 // guarantee to be able handle all hazards correctly. This is because if there 1376 // are multiple scheduling regions in a basic block, the regions are scheduled 1377 // bottom up, so when we begin to schedule a region we don't know what 1378 // instructions were emitted directly before it. 1379 // 1380 // Here we add a stand-alone hazard recognizer pass which can handle all 1381 // cases. 1382 addPass(&PostRAHazardRecognizerID); 1383 addPass(&BranchRelaxationPassID); 1384 } 1385 1386 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { 1387 return new GCNPassConfig(*this, PM); 1388 } 1389 1390 yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const { 1391 return new yaml::SIMachineFunctionInfo(); 1392 } 1393 1394 yaml::MachineFunctionInfo * 1395 GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const { 1396 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1397 return new yaml::SIMachineFunctionInfo( 1398 *MFI, *MF.getSubtarget().getRegisterInfo(), MF); 1399 } 1400 1401 bool GCNTargetMachine::parseMachineFunctionInfo( 1402 const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS, 1403 SMDiagnostic &Error, SMRange &SourceRange) const { 1404 const yaml::SIMachineFunctionInfo &YamlMFI = 1405 static_cast<const yaml::SIMachineFunctionInfo &>(MFI_); 1406 MachineFunction &MF = PFS.MF; 1407 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1408 1409 if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange)) 1410 return true; 1411 1412 if (MFI->Occupancy == 0) { 1413 // Fixup the subtarget dependent default value. 1414 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1415 MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize()); 1416 } 1417 1418 auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) { 1419 Register TempReg; 1420 if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) { 1421 SourceRange = RegName.SourceRange; 1422 return true; 1423 } 1424 RegVal = TempReg; 1425 1426 return false; 1427 }; 1428 1429 auto parseOptionalRegister = [&](const yaml::StringValue &RegName, 1430 Register &RegVal) { 1431 return !RegName.Value.empty() && parseRegister(RegName, RegVal); 1432 }; 1433 1434 if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy)) 1435 return true; 1436 1437 auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { 1438 // Create a diagnostic for a the register string literal. 1439 const MemoryBuffer &Buffer = 1440 *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID()); 1441 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1442 RegName.Value.size(), SourceMgr::DK_Error, 1443 "incorrect register class for field", RegName.Value, 1444 None, None); 1445 SourceRange = RegName.SourceRange; 1446 return true; 1447 }; 1448 1449 if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) || 1450 parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) || 1451 parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg)) 1452 return true; 1453 1454 if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG && 1455 !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) { 1456 return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); 1457 } 1458 1459 if (MFI->FrameOffsetReg != AMDGPU::FP_REG && 1460 !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) { 1461 return diagnoseRegisterClass(YamlMFI.FrameOffsetReg); 1462 } 1463 1464 if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG && 1465 !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) { 1466 return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg); 1467 } 1468 1469 for (const auto &YamlReg : YamlMFI.WWMReservedRegs) { 1470 Register ParsedReg; 1471 if (parseRegister(YamlReg, ParsedReg)) 1472 return true; 1473 1474 MFI->reserveWWMRegister(ParsedReg); 1475 } 1476 1477 auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A, 1478 const TargetRegisterClass &RC, 1479 ArgDescriptor &Arg, unsigned UserSGPRs, 1480 unsigned SystemSGPRs) { 1481 // Skip parsing if it's not present. 1482 if (!A) 1483 return false; 1484 1485 if (A->IsRegister) { 1486 Register Reg; 1487 if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) { 1488 SourceRange = A->RegisterName.SourceRange; 1489 return true; 1490 } 1491 if (!RC.contains(Reg)) 1492 return diagnoseRegisterClass(A->RegisterName); 1493 Arg = ArgDescriptor::createRegister(Reg); 1494 } else 1495 Arg = ArgDescriptor::createStack(A->StackOffset); 1496 // Check and apply the optional mask. 1497 if (A->Mask) 1498 Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue()); 1499 1500 MFI->NumUserSGPRs += UserSGPRs; 1501 MFI->NumSystemSGPRs += SystemSGPRs; 1502 return false; 1503 }; 1504 1505 if (YamlMFI.ArgInfo && 1506 (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer, 1507 AMDGPU::SGPR_128RegClass, 1508 MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) || 1509 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr, 1510 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr, 1511 2, 0) || 1512 parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass, 1513 MFI->ArgInfo.QueuePtr, 2, 0) || 1514 parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr, 1515 AMDGPU::SReg_64RegClass, 1516 MFI->ArgInfo.KernargSegmentPtr, 2, 0) || 1517 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID, 1518 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID, 1519 2, 0) || 1520 parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit, 1521 AMDGPU::SReg_64RegClass, 1522 MFI->ArgInfo.FlatScratchInit, 2, 0) || 1523 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize, 1524 AMDGPU::SGPR_32RegClass, 1525 MFI->ArgInfo.PrivateSegmentSize, 0, 0) || 1526 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX, 1527 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX, 1528 0, 1) || 1529 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY, 1530 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY, 1531 0, 1) || 1532 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ, 1533 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ, 1534 0, 1) || 1535 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo, 1536 AMDGPU::SGPR_32RegClass, 1537 MFI->ArgInfo.WorkGroupInfo, 0, 1) || 1538 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset, 1539 AMDGPU::SGPR_32RegClass, 1540 MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) || 1541 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr, 1542 AMDGPU::SReg_64RegClass, 1543 MFI->ArgInfo.ImplicitArgPtr, 0, 0) || 1544 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr, 1545 AMDGPU::SReg_64RegClass, 1546 MFI->ArgInfo.ImplicitBufferPtr, 2, 0) || 1547 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX, 1548 AMDGPU::VGPR_32RegClass, 1549 MFI->ArgInfo.WorkItemIDX, 0, 0) || 1550 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY, 1551 AMDGPU::VGPR_32RegClass, 1552 MFI->ArgInfo.WorkItemIDY, 0, 0) || 1553 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ, 1554 AMDGPU::VGPR_32RegClass, 1555 MFI->ArgInfo.WorkItemIDZ, 0, 0))) 1556 return true; 1557 1558 MFI->Mode.IEEE = YamlMFI.Mode.IEEE; 1559 MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; 1560 MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals; 1561 MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals; 1562 MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals; 1563 MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals; 1564 1565 return false; 1566 } 1567