//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for VGPRs"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
  "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
  "fast", "fast register allocator", createFastVGPRRegisterAllocator);
}

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

// Option to disable vectorizer for tests.
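// The LoadStoreVectorizer merges adjacent loads and stores into wider vector
// accesses; keeping a switch for it lets lit tests pin down the scalar memory
// access patterns they want to inspect.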
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
              cl::init(true), cl::Hidden,
              cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLDSReplaceWithPointer(
    "amdgpu-enable-lds-replace-with-pointer",
    cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSIOptimizeVGPRLiveRangePass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUCtorDtorLoweringPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAttributorPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
  initializeAMDGPULowerModuleLDSPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertHardClausesPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIPreEmitPeepholePass(*PR);
  initializeSILateBranchLoweringPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeSIPostRABundlerPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisPass(*PR);
  initializeGCNNSAReassignPass(*PR);
  initializeGCNPreRAOptimizationsPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
         "-ni:7";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
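/// Returns true for global values that must survive internalization:
/// declarations, sanitizer runtime entry points, entry-point (kernel)
/// functions, and any other global that still has live uses.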
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().startswith("__asan_") ||
           F->getName().startswith("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
  bool PromoteKernelArguments =
      EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                               legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      PM.add(createAMDGPUPrintfRuntimeBinding());
      if (Internalize)
        PM.add(createInternalizePass(mustPreserveGV));
      PM.add(createAMDGPUPropagateAttributesLatePass(this));
      if (Internalize)
        PM.add(createGlobalDCEPass());
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
                                      legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
                                        legacy::PassManagerBase &PM) {
      // Add promote kernel arguments pass to the opt pipeline right before
      // infer address spaces which is needed to do actual address space
      // rewriting.
      if (PromoteKernelArguments)
        PM.add(createAMDGPUPromoteKernelArgumentsPass());

      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());

      // Promote alloca to vector before SROA and loop unroll. If we manage
      // to eliminate allocas before unroll we may choose to unroll less.
      if (EnableOpt)
        PM.add(createAMDGPUPromoteAllocaToVector());
  });
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, ModulePassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-propagate-attributes-late") {
          PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
          return true;
        }
        if (PassName == "amdgpu-unify-metadata") {
          PM.addPass(AMDGPUUnifyMetadataPass());
          return true;
        }
        if (PassName == "amdgpu-printf-runtime-binding") {
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
          return true;
        }
        if (PassName == "amdgpu-always-inline") {
          PM.addPass(AMDGPUAlwaysInlinePass());
          return true;
        }
        if (PassName == "amdgpu-replace-lds-use-with-pointer") {
          PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
          return true;
        }
        if (PassName == "amdgpu-lower-module-lds") {
          PM.addPass(AMDGPULowerModuleLDSPass());
          return true;
        }
        return false;
      });
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, FunctionPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-simplifylib") {
          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
          return true;
        }
        if (PassName == "amdgpu-usenative") {
          PM.addPass(AMDGPUUseNativeCallsPass());
          return true;
        }
        if (PassName == "amdgpu-promote-alloca") {
          PM.addPass(AMDGPUPromoteAllocaPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-alloca-to-vector") {
          PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
          return true;
        }
        if (PassName == "amdgpu-lower-kernel-attributes") {
          PM.addPass(AMDGPULowerKernelAttributesPass());
          return true;
        }
        if (PassName == "amdgpu-propagate-attributes-early") {
          PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-kernel-arguments") {
          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
          return true;
        }
        return false;
      });

  PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
    FAM.registerPass([&] { return AMDGPUAA(); });
  });

  PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
    if (AAName == "amdgpu-aa") {
      AAM.registerFunctionAnalysis<AMDGPUAA>();
      return true;
    }
    return false;
  });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
        }
        PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
        if (InternalizeSymbols) {
          PM.addPass(GlobalDCEPass());
        }
        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::make_pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logic 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
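  //
  // An illustrative (hypothetical) IR shape this pattern matches, with %ptr
  // then treated as a global pointer under the guarded condition:
  //   %shared  = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
  //   %private = call i1 @llvm.amdgcn.is.private(i8* %ptr)
  //   %ns      = xor i1 %shared, true
  //   %np      = xor i1 %private, true
  //   %cond    = and i1 %ns, %np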
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::make_pair(nullptr, -1);
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
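    // Requiring SCC order makes the CodeGen pipeline visit callees before
    // callers, so per-function register usage can be propagated bottom-up
    // through the call graph.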
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  addPass(createAMDGPUCtorDtorLoweringPass());

  // A call to propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Can increase LDS used by kernel so runs before PromoteAlloca
  if (EnableLowerModuleLDS) {
    // The "amdgpu-replace-lds-use-with-pointer" pass needs to run before the
    // "amdgpu-lower-module-lds" pass, and it is only required when the
    // "amdgpu-lower-module-lds" pass is enabled.
    if (EnableLDSReplaceWithPointer)
      addPass(createAMDGPUReplaceLDSUseWithPointerPass());

    addPass(createAMDGPULowerModuleLDSPass());
  }

  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createInferAddressSpacesPass());

  addPass(createAtomicExpandPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    addPass(createAMDGPUAttributorPass());

    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch pass may introduce unreachable blocks that can
  // cause unexpected behavior for subsequent passes. Placing it
  // here seems better; these blocks will get cleaned up by
  // UnreachableBlockElim, which is inserted next in the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
    MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
  insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
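  // (SIFormMemoryClauses groups nearby memory instructions into clauses, which
  // can lengthen live ranges and raise register pressure.)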
  if (TM->getOptLevel() > CodeGenOpt::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: when an instruction has a Killed operand and the instruction is
  // inside a bundle, it seems that only the BUNDLE instruction appears as the
  // kill of the register in LiveVariables; this would trigger a failure in the
  // verifier. We should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
  "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(false));
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
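  // (SILowerSGPRSpills lowers SGPR spills, e.g. into VGPR lanes, so it has to
  // run after SGPR allocation and before the VGPRs are allocated.)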
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
  MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
  MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
  MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;

  return false;
}