1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// The AMDGPU target machine contains all of the hardware specific 11 /// information needed to emit code for SI+ GPUs. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AMDGPUTargetMachine.h" 16 #include "AMDGPU.h" 17 #include "AMDGPUAliasAnalysis.h" 18 #include "AMDGPUExportClustering.h" 19 #include "AMDGPUMacroFusion.h" 20 #include "AMDGPUTargetObjectFile.h" 21 #include "AMDGPUTargetTransformInfo.h" 22 #include "GCNIterativeScheduler.h" 23 #include "GCNSchedStrategy.h" 24 #include "R600.h" 25 #include "R600TargetMachine.h" 26 #include "SIMachineFunctionInfo.h" 27 #include "SIMachineScheduler.h" 28 #include "TargetInfo/AMDGPUTargetInfo.h" 29 #include "llvm/Analysis/CGSCCPassManager.h" 30 #include "llvm/CodeGen/GlobalISel/CSEInfo.h" 31 #include "llvm/CodeGen/GlobalISel/IRTranslator.h" 32 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" 33 #include "llvm/CodeGen/GlobalISel/Legalizer.h" 34 #include "llvm/CodeGen/GlobalISel/Localizer.h" 35 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" 36 #include "llvm/CodeGen/MIRParser/MIParser.h" 37 #include "llvm/CodeGen/Passes.h" 38 #include "llvm/CodeGen/RegAllocRegistry.h" 39 #include "llvm/CodeGen/TargetPassConfig.h" 40 #include "llvm/IR/IntrinsicsAMDGPU.h" 41 #include "llvm/IR/LegacyPassManager.h" 42 #include "llvm/IR/PassManager.h" 43 #include "llvm/IR/PatternMatch.h" 44 #include "llvm/InitializePasses.h" 45 #include "llvm/MC/TargetRegistry.h" 46 #include "llvm/Passes/PassBuilder.h" 47 #include "llvm/Transforms/IPO.h" 48 #include "llvm/Transforms/IPO/AlwaysInliner.h" 49 #include "llvm/Transforms/IPO/GlobalDCE.h" 50 #include "llvm/Transforms/IPO/Internalize.h" 51 #include "llvm/Transforms/IPO/PassManagerBuilder.h" 52 #include "llvm/Transforms/Scalar.h" 53 #include "llvm/Transforms/Scalar/GVN.h" 54 #include "llvm/Transforms/Scalar/InferAddressSpaces.h" 55 #include "llvm/Transforms/Utils.h" 56 #include "llvm/Transforms/Utils/SimplifyLibCalls.h" 57 #include "llvm/Transforms/Vectorize.h" 58 59 using namespace llvm; 60 61 namespace { 62 class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> { 63 public: 64 SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 65 : RegisterRegAllocBase(N, D, C) {} 66 }; 67 68 class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> { 69 public: 70 VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) 71 : RegisterRegAllocBase(N, D, C) {} 72 }; 73 74 static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI, 75 const TargetRegisterClass &RC) { 76 return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); 77 } 78 79 static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI, 80 const TargetRegisterClass &RC) { 81 return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC); 82 } 83 84 85 /// -{sgpr|vgpr}-regalloc=... command line option. 86 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; } 87 88 /// A dummy default pass factory indicates whether the register allocator is 89 /// overridden on the command line. 
90 static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag; 91 static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag; 92 93 static SGPRRegisterRegAlloc 94 defaultSGPRRegAlloc("default", 95 "pick SGPR register allocator based on -O option", 96 useDefaultRegisterAllocator); 97 98 static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false, 99 RegisterPassParser<SGPRRegisterRegAlloc>> 100 SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), 101 cl::desc("Register allocator to use for SGPRs")); 102 103 static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false, 104 RegisterPassParser<VGPRRegisterRegAlloc>> 105 VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), 106 cl::desc("Register allocator to use for VGPRs")); 107 108 109 static void initializeDefaultSGPRRegisterAllocatorOnce() { 110 RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault(); 111 112 if (!Ctor) { 113 Ctor = SGPRRegAlloc; 114 SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc); 115 } 116 } 117 118 static void initializeDefaultVGPRRegisterAllocatorOnce() { 119 RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault(); 120 121 if (!Ctor) { 122 Ctor = VGPRRegAlloc; 123 VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc); 124 } 125 } 126 127 static FunctionPass *createBasicSGPRRegisterAllocator() { 128 return createBasicRegisterAllocator(onlyAllocateSGPRs); 129 } 130 131 static FunctionPass *createGreedySGPRRegisterAllocator() { 132 return createGreedyRegisterAllocator(onlyAllocateSGPRs); 133 } 134 135 static FunctionPass *createFastSGPRRegisterAllocator() { 136 return createFastRegisterAllocator(onlyAllocateSGPRs, false); 137 } 138 139 static FunctionPass *createBasicVGPRRegisterAllocator() { 140 return createBasicRegisterAllocator(onlyAllocateVGPRs); 141 } 142 143 static FunctionPass *createGreedyVGPRRegisterAllocator() { 144 return createGreedyRegisterAllocator(onlyAllocateVGPRs); 145 } 146 147 static FunctionPass *createFastVGPRRegisterAllocator() { 148 return createFastRegisterAllocator(onlyAllocateVGPRs, true); 149 } 150 151 static SGPRRegisterRegAlloc basicRegAllocSGPR( 152 "basic", "basic register allocator", createBasicSGPRRegisterAllocator); 153 static SGPRRegisterRegAlloc greedyRegAllocSGPR( 154 "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator); 155 156 static SGPRRegisterRegAlloc fastRegAllocSGPR( 157 "fast", "fast register allocator", createFastSGPRRegisterAllocator); 158 159 160 static VGPRRegisterRegAlloc basicRegAllocVGPR( 161 "basic", "basic register allocator", createBasicVGPRRegisterAllocator); 162 static VGPRRegisterRegAlloc greedyRegAllocVGPR( 163 "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator); 164 165 static VGPRRegisterRegAlloc fastRegAllocVGPR( 166 "fast", "fast register allocator", createFastVGPRRegisterAllocator); 167 } 168 169 static cl::opt<bool> EnableSROA( 170 "amdgpu-sroa", 171 cl::desc("Run SROA after promote alloca pass"), 172 cl::ReallyHidden, 173 cl::init(true)); 174 175 static cl::opt<bool> 176 EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, 177 cl::desc("Run early if-conversion"), 178 cl::init(false)); 179 180 static cl::opt<bool> 181 OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, 182 cl::desc("Run pre-RA exec mask optimizations"), 183 cl::init(true)); 184 185 // Option to disable vectorizer for tests. 
186 static cl::opt<bool> EnableLoadStoreVectorizer( 187 "amdgpu-load-store-vectorizer", 188 cl::desc("Enable load store vectorizer"), 189 cl::init(true), 190 cl::Hidden); 191 192 // Option to control global loads scalarization 193 static cl::opt<bool> ScalarizeGlobal( 194 "amdgpu-scalarize-global-loads", 195 cl::desc("Enable global load scalarization"), 196 cl::init(true), 197 cl::Hidden); 198 199 // Option to run internalize pass. 200 static cl::opt<bool> InternalizeSymbols( 201 "amdgpu-internalize-symbols", 202 cl::desc("Enable elimination of non-kernel functions and unused globals"), 203 cl::init(false), 204 cl::Hidden); 205 206 // Option to inline all early. 207 static cl::opt<bool> EarlyInlineAll( 208 "amdgpu-early-inline-all", 209 cl::desc("Inline all functions early"), 210 cl::init(false), 211 cl::Hidden); 212 213 static cl::opt<bool> EnableSDWAPeephole( 214 "amdgpu-sdwa-peephole", 215 cl::desc("Enable SDWA peepholer"), 216 cl::init(true)); 217 218 static cl::opt<bool> EnableDPPCombine( 219 "amdgpu-dpp-combine", 220 cl::desc("Enable DPP combiner"), 221 cl::init(true)); 222 223 // Enable address space based alias analysis 224 static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, 225 cl::desc("Enable AMDGPU Alias Analysis"), 226 cl::init(true)); 227 228 // Option to run late CFG structurizer 229 static cl::opt<bool, true> LateCFGStructurize( 230 "amdgpu-late-structurize", 231 cl::desc("Enable late CFG structurization"), 232 cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), 233 cl::Hidden); 234 235 // Enable lib calls simplifications 236 static cl::opt<bool> EnableLibCallSimplify( 237 "amdgpu-simplify-libcall", 238 cl::desc("Enable amdgpu library simplifications"), 239 cl::init(true), 240 cl::Hidden); 241 242 static cl::opt<bool> EnableLowerKernelArguments( 243 "amdgpu-ir-lower-kernel-arguments", 244 cl::desc("Lower kernel argument loads in IR pass"), 245 cl::init(true), 246 cl::Hidden); 247 248 static cl::opt<bool> EnableRegReassign( 249 "amdgpu-reassign-regs", 250 cl::desc("Enable register reassign optimizations on gfx10+"), 251 cl::init(true), 252 cl::Hidden); 253 254 static cl::opt<bool> OptVGPRLiveRange( 255 "amdgpu-opt-vgpr-liverange", 256 cl::desc("Enable VGPR liverange optimizations for if-else structure"), 257 cl::init(true), cl::Hidden); 258 259 // Enable atomic optimization 260 static cl::opt<bool> EnableAtomicOptimizations( 261 "amdgpu-atomic-optimizations", 262 cl::desc("Enable atomic optimizations"), 263 cl::init(false), 264 cl::Hidden); 265 266 // Enable Mode register optimization 267 static cl::opt<bool> EnableSIModeRegisterPass( 268 "amdgpu-mode-register", 269 cl::desc("Enable mode register pass"), 270 cl::init(true), 271 cl::Hidden); 272 273 // Option is used in lit tests to prevent deadcoding of patterns inspected. 
274 static cl::opt<bool> 275 EnableDCEInRA("amdgpu-dce-in-ra", 276 cl::init(true), cl::Hidden, 277 cl::desc("Enable machine DCE inside regalloc")); 278 279 static cl::opt<bool> EnableScalarIRPasses( 280 "amdgpu-scalar-ir-passes", 281 cl::desc("Enable scalar IR passes"), 282 cl::init(true), 283 cl::Hidden); 284 285 static cl::opt<bool> EnableStructurizerWorkarounds( 286 "amdgpu-enable-structurizer-workarounds", 287 cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), 288 cl::Hidden); 289 290 static cl::opt<bool> EnableLDSReplaceWithPointer( 291 "amdgpu-enable-lds-replace-with-pointer", 292 cl::desc("Enable LDS replace with pointer pass"), cl::init(false), 293 cl::Hidden); 294 295 static cl::opt<bool, true> EnableLowerModuleLDS( 296 "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), 297 cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), 298 cl::Hidden); 299 300 static cl::opt<bool> EnablePreRAOptimizations( 301 "amdgpu-enable-pre-ra-optimizations", 302 cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), 303 cl::Hidden); 304 305 static cl::opt<bool> EnablePromoteKernelArguments( 306 "amdgpu-enable-promote-kernel-arguments", 307 cl::desc("Enable promotion of flat kernel pointer arguments to global"), 308 cl::Hidden, cl::init(true)); 309 310 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { 311 // Register the target 312 RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); 313 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget()); 314 315 PassRegistry *PR = PassRegistry::getPassRegistry(); 316 initializeR600ClauseMergePassPass(*PR); 317 initializeR600ControlFlowFinalizerPass(*PR); 318 initializeR600PacketizerPass(*PR); 319 initializeR600ExpandSpecialInstrsPassPass(*PR); 320 initializeR600VectorRegMergerPass(*PR); 321 initializeGlobalISel(*PR); 322 initializeAMDGPUDAGToDAGISelPass(*PR); 323 initializeGCNDPPCombinePass(*PR); 324 initializeSILowerI1CopiesPass(*PR); 325 initializeSILowerSGPRSpillsPass(*PR); 326 initializeSIFixSGPRCopiesPass(*PR); 327 initializeSIFixVGPRCopiesPass(*PR); 328 initializeSIFoldOperandsPass(*PR); 329 initializeSIPeepholeSDWAPass(*PR); 330 initializeSIShrinkInstructionsPass(*PR); 331 initializeSIOptimizeExecMaskingPreRAPass(*PR); 332 initializeSIOptimizeVGPRLiveRangePass(*PR); 333 initializeSILoadStoreOptimizerPass(*PR); 334 initializeAMDGPUFixFunctionBitcastsPass(*PR); 335 initializeAMDGPUCtorDtorLoweringPass(*PR); 336 initializeAMDGPUAlwaysInlinePass(*PR); 337 initializeAMDGPUAttributorPass(*PR); 338 initializeAMDGPUAnnotateKernelFeaturesPass(*PR); 339 initializeAMDGPUAnnotateUniformValuesPass(*PR); 340 initializeAMDGPUArgumentUsageInfoPass(*PR); 341 initializeAMDGPUAtomicOptimizerPass(*PR); 342 initializeAMDGPULowerKernelArgumentsPass(*PR); 343 initializeAMDGPUPromoteKernelArgumentsPass(*PR); 344 initializeAMDGPULowerKernelAttributesPass(*PR); 345 initializeAMDGPULowerIntrinsicsPass(*PR); 346 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); 347 initializeAMDGPUPostLegalizerCombinerPass(*PR); 348 initializeAMDGPUPreLegalizerCombinerPass(*PR); 349 initializeAMDGPURegBankCombinerPass(*PR); 350 initializeAMDGPUPromoteAllocaPass(*PR); 351 initializeAMDGPUPromoteAllocaToVectorPass(*PR); 352 initializeAMDGPUCodeGenPreparePass(*PR); 353 initializeAMDGPULateCodeGenPreparePass(*PR); 354 initializeAMDGPUPropagateAttributesEarlyPass(*PR); 355 initializeAMDGPUPropagateAttributesLatePass(*PR); 356 initializeAMDGPUReplaceLDSUseWithPointerPass(*PR); 357 
initializeAMDGPULowerModuleLDSPass(*PR); 358 initializeAMDGPURewriteOutArgumentsPass(*PR); 359 initializeAMDGPUUnifyMetadataPass(*PR); 360 initializeSIAnnotateControlFlowPass(*PR); 361 initializeSIInsertHardClausesPass(*PR); 362 initializeSIInsertWaitcntsPass(*PR); 363 initializeSIModeRegisterPass(*PR); 364 initializeSIWholeQuadModePass(*PR); 365 initializeSILowerControlFlowPass(*PR); 366 initializeSIPreEmitPeepholePass(*PR); 367 initializeSILateBranchLoweringPass(*PR); 368 initializeSIMemoryLegalizerPass(*PR); 369 initializeSIOptimizeExecMaskingPass(*PR); 370 initializeSIPreAllocateWWMRegsPass(*PR); 371 initializeSIFormMemoryClausesPass(*PR); 372 initializeSIPostRABundlerPass(*PR); 373 initializeAMDGPUUnifyDivergentExitNodesPass(*PR); 374 initializeAMDGPUAAWrapperPassPass(*PR); 375 initializeAMDGPUExternalAAWrapperPass(*PR); 376 initializeAMDGPUUseNativeCallsPass(*PR); 377 initializeAMDGPUSimplifyLibCallsPass(*PR); 378 initializeAMDGPUPrintfRuntimeBindingPass(*PR); 379 initializeAMDGPUResourceUsageAnalysisPass(*PR); 380 initializeGCNNSAReassignPass(*PR); 381 initializeGCNPreRAOptimizationsPass(*PR); 382 } 383 384 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { 385 return std::make_unique<AMDGPUTargetObjectFile>(); 386 } 387 388 static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { 389 return new SIScheduleDAGMI(C); 390 } 391 392 static ScheduleDAGInstrs * 393 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { 394 ScheduleDAGMILive *DAG = 395 new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C)); 396 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 397 DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); 398 DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); 399 return DAG; 400 } 401 402 static ScheduleDAGInstrs * 403 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { 404 auto DAG = new GCNIterativeScheduler(C, 405 GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY); 406 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 407 return DAG; 408 } 409 410 static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { 411 return new GCNIterativeScheduler(C, 412 GCNIterativeScheduler::SCHEDULE_MINREGFORCED); 413 } 414 415 static ScheduleDAGInstrs * 416 createIterativeILPMachineScheduler(MachineSchedContext *C) { 417 auto DAG = new GCNIterativeScheduler(C, 418 GCNIterativeScheduler::SCHEDULE_ILP); 419 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); 420 DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); 421 return DAG; 422 } 423 424 static MachineSchedRegistry 425 SISchedRegistry("si", "Run SI's custom scheduler", 426 createSIMachineScheduler); 427 428 static MachineSchedRegistry 429 GCNMaxOccupancySchedRegistry("gcn-max-occupancy", 430 "Run GCN scheduler to maximize occupancy", 431 createGCNMaxOccupancyMachineScheduler); 432 433 static MachineSchedRegistry 434 IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental", 435 "Run GCN scheduler to maximize occupancy (experimental)", 436 createIterativeGCNMaxOccupancyMachineScheduler); 437 438 static MachineSchedRegistry 439 GCNMinRegSchedRegistry("gcn-minreg", 440 "Run GCN iterative scheduler for minimal register usage (experimental)", 441 createMinRegScheduler); 442 443 static MachineSchedRegistry 444 GCNILPSchedRegistry("gcn-ilp", 445 "Run GCN iterative scheduler for ILP scheduling (experimental)", 446 createIterativeILPMachineScheduler); 
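
// Illustrative usage only (example commands, not exercised by this file): the
// scheduler variants registered above are selected through llc's -misched
// option, and the per-class register allocators declared earlier in this file
// through -sgpr-regalloc / -vgpr-regalloc, e.g.
//
//   llc -march=amdgcn -mcpu=gfx900 -misched=gcn-max-occupancy input.ll
//   llc -march=amdgcn -mcpu=gfx900 -sgpr-regalloc=greedy -vgpr-regalloc=fast input.ll
//
// The option values follow the MachineSchedRegistry and RegisterRegAlloc
// registrations in this file; input.ll is a placeholder file name.
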
447 448 static StringRef computeDataLayout(const Triple &TT) { 449 if (TT.getArch() == Triple::r600) { 450 // 32-bit pointers. 451 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 452 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"; 453 } 454 455 // 32-bit private, local, and region pointers. 64-bit global, constant and 456 // flat, non-integral buffer fat pointers. 457 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" 458 "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 459 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1" 460 "-ni:7"; 461 } 462 463 LLVM_READNONE 464 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { 465 if (!GPU.empty()) 466 return GPU; 467 468 // Need to default to a target with flat support for HSA. 469 if (TT.getArch() == Triple::amdgcn) 470 return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic"; 471 472 return "r600"; 473 } 474 475 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { 476 // The AMDGPU toolchain only supports generating shared objects, so we 477 // must always use PIC. 478 return Reloc::PIC_; 479 } 480 481 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, 482 StringRef CPU, StringRef FS, 483 TargetOptions Options, 484 Optional<Reloc::Model> RM, 485 Optional<CodeModel::Model> CM, 486 CodeGenOpt::Level OptLevel) 487 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), 488 FS, Options, getEffectiveRelocModel(RM), 489 getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), 490 TLOF(createTLOF(getTargetTriple())) { 491 initAsmInfo(); 492 if (TT.getArch() == Triple::amdgcn) { 493 if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64")) 494 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64)); 495 else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32")) 496 MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32)); 497 } 498 } 499 500 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; 501 bool AMDGPUTargetMachine::EnableFunctionCalls = false; 502 bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; 503 504 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; 505 506 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { 507 Attribute GPUAttr = F.getFnAttribute("target-cpu"); 508 return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU(); 509 } 510 511 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { 512 Attribute FSAttr = F.getFnAttribute("target-features"); 513 514 return FSAttr.isValid() ? FSAttr.getValueAsString() 515 : getTargetFeatureString(); 516 } 517 518 /// Predicate for Internalize pass. 
519 static bool mustPreserveGV(const GlobalValue &GV) { 520 if (const Function *F = dyn_cast<Function>(&GV)) 521 return F->isDeclaration() || F->getName().startswith("__asan_") || 522 F->getName().startswith("__sanitizer_") || 523 AMDGPU::isEntryFunctionCC(F->getCallingConv()); 524 525 GV.removeDeadConstantUsers(); 526 return !GV.use_empty(); 527 } 528 529 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { 530 Builder.DivergentTarget = true; 531 532 bool EnableOpt = getOptLevel() > CodeGenOpt::None; 533 bool Internalize = InternalizeSymbols; 534 bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls; 535 bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; 536 bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; 537 bool PromoteKernelArguments = 538 EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less; 539 540 if (EnableFunctionCalls) { 541 delete Builder.Inliner; 542 Builder.Inliner = createFunctionInliningPass(); 543 } 544 545 Builder.addExtension( 546 PassManagerBuilder::EP_ModuleOptimizerEarly, 547 [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &, 548 legacy::PassManagerBase &PM) { 549 if (AMDGPUAA) { 550 PM.add(createAMDGPUAAWrapperPass()); 551 PM.add(createAMDGPUExternalAAWrapperPass()); 552 } 553 PM.add(createAMDGPUUnifyMetadataPass()); 554 PM.add(createAMDGPUPrintfRuntimeBinding()); 555 if (Internalize) 556 PM.add(createInternalizePass(mustPreserveGV)); 557 PM.add(createAMDGPUPropagateAttributesLatePass(this)); 558 if (Internalize) 559 PM.add(createGlobalDCEPass()); 560 if (EarlyInline) 561 PM.add(createAMDGPUAlwaysInlinePass(false)); 562 }); 563 564 Builder.addExtension( 565 PassManagerBuilder::EP_EarlyAsPossible, 566 [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &, 567 legacy::PassManagerBase &PM) { 568 if (AMDGPUAA) { 569 PM.add(createAMDGPUAAWrapperPass()); 570 PM.add(createAMDGPUExternalAAWrapperPass()); 571 } 572 PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this)); 573 PM.add(llvm::createAMDGPUUseNativeCallsPass()); 574 if (LibCallSimplify) 575 PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this)); 576 }); 577 578 Builder.addExtension( 579 PassManagerBuilder::EP_CGSCCOptimizerLate, 580 [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &, 581 legacy::PassManagerBase &PM) { 582 // Add promote kernel arguments pass to the opt pipeline right before 583 // infer address spaces which is needed to do actual address space 584 // rewriting. 585 if (PromoteKernelArguments) 586 PM.add(createAMDGPUPromoteKernelArgumentsPass()); 587 588 // Add infer address spaces pass to the opt pipeline after inlining 589 // but before SROA to increase SROA opportunities. 590 PM.add(createInferAddressSpacesPass()); 591 592 // This should run after inlining to have any chance of doing anything, 593 // and before other cleanup optimizations. 594 PM.add(createAMDGPULowerKernelAttributesPass()); 595 596 // Promote alloca to vector before SROA and loop unroll. If we manage 597 // to eliminate allocas before unroll we may choose to unroll less. 
598 if (EnableOpt) 599 PM.add(createAMDGPUPromoteAllocaToVector()); 600 }); 601 } 602 603 void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { 604 AAM.registerFunctionAnalysis<AMDGPUAA>(); 605 } 606 607 void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { 608 PB.registerPipelineParsingCallback( 609 [this](StringRef PassName, ModulePassManager &PM, 610 ArrayRef<PassBuilder::PipelineElement>) { 611 if (PassName == "amdgpu-propagate-attributes-late") { 612 PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); 613 return true; 614 } 615 if (PassName == "amdgpu-unify-metadata") { 616 PM.addPass(AMDGPUUnifyMetadataPass()); 617 return true; 618 } 619 if (PassName == "amdgpu-printf-runtime-binding") { 620 PM.addPass(AMDGPUPrintfRuntimeBindingPass()); 621 return true; 622 } 623 if (PassName == "amdgpu-always-inline") { 624 PM.addPass(AMDGPUAlwaysInlinePass()); 625 return true; 626 } 627 if (PassName == "amdgpu-replace-lds-use-with-pointer") { 628 PM.addPass(AMDGPUReplaceLDSUseWithPointerPass()); 629 return true; 630 } 631 if (PassName == "amdgpu-lower-module-lds") { 632 PM.addPass(AMDGPULowerModuleLDSPass()); 633 return true; 634 } 635 return false; 636 }); 637 PB.registerPipelineParsingCallback( 638 [this](StringRef PassName, FunctionPassManager &PM, 639 ArrayRef<PassBuilder::PipelineElement>) { 640 if (PassName == "amdgpu-simplifylib") { 641 PM.addPass(AMDGPUSimplifyLibCallsPass(*this)); 642 return true; 643 } 644 if (PassName == "amdgpu-usenative") { 645 PM.addPass(AMDGPUUseNativeCallsPass()); 646 return true; 647 } 648 if (PassName == "amdgpu-promote-alloca") { 649 PM.addPass(AMDGPUPromoteAllocaPass(*this)); 650 return true; 651 } 652 if (PassName == "amdgpu-promote-alloca-to-vector") { 653 PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); 654 return true; 655 } 656 if (PassName == "amdgpu-lower-kernel-attributes") { 657 PM.addPass(AMDGPULowerKernelAttributesPass()); 658 return true; 659 } 660 if (PassName == "amdgpu-propagate-attributes-early") { 661 PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); 662 return true; 663 } 664 if (PassName == "amdgpu-promote-kernel-arguments") { 665 PM.addPass(AMDGPUPromoteKernelArgumentsPass()); 666 return true; 667 } 668 return false; 669 }); 670 671 PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) { 672 FAM.registerPass([&] { return AMDGPUAA(); }); 673 }); 674 675 PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) { 676 if (AAName == "amdgpu-aa") { 677 AAM.registerFunctionAnalysis<AMDGPUAA>(); 678 return true; 679 } 680 return false; 681 }); 682 683 PB.registerPipelineStartEPCallback( 684 [this](ModulePassManager &PM, OptimizationLevel Level) { 685 FunctionPassManager FPM; 686 FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); 687 FPM.addPass(AMDGPUUseNativeCallsPass()); 688 if (EnableLibCallSimplify && Level != OptimizationLevel::O0) 689 FPM.addPass(AMDGPUSimplifyLibCallsPass(*this)); 690 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 691 }); 692 693 PB.registerPipelineEarlySimplificationEPCallback( 694 [this](ModulePassManager &PM, OptimizationLevel Level) { 695 if (Level == OptimizationLevel::O0) 696 return; 697 698 PM.addPass(AMDGPUUnifyMetadataPass()); 699 PM.addPass(AMDGPUPrintfRuntimeBindingPass()); 700 701 if (InternalizeSymbols) { 702 PM.addPass(InternalizePass(mustPreserveGV)); 703 } 704 PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); 705 if (InternalizeSymbols) { 706 PM.addPass(GlobalDCEPass()); 707 } 708 if (EarlyInlineAll && 
!EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a load of a generic pointer.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::make_pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
794 Value *Ptr; 795 if (match( 796 const_cast<Value *>(V), 797 m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))), 798 m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>( 799 m_Deferred(Ptr)))))) 800 return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS); 801 802 return std::make_pair(nullptr, -1); 803 } 804 805 //===----------------------------------------------------------------------===// 806 // GCN Target Machine (SI+) 807 //===----------------------------------------------------------------------===// 808 809 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, 810 StringRef CPU, StringRef FS, 811 TargetOptions Options, 812 Optional<Reloc::Model> RM, 813 Optional<CodeModel::Model> CM, 814 CodeGenOpt::Level OL, bool JIT) 815 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} 816 817 const TargetSubtargetInfo * 818 GCNTargetMachine::getSubtargetImpl(const Function &F) const { 819 StringRef GPU = getGPUName(F); 820 StringRef FS = getFeatureString(F); 821 822 SmallString<128> SubtargetKey(GPU); 823 SubtargetKey.append(FS); 824 825 auto &I = SubtargetMap[SubtargetKey]; 826 if (!I) { 827 // This needs to be done before we create a new subtarget since any 828 // creation will depend on the TM and the code generation flags on the 829 // function that reside in TargetOptions. 830 resetTargetOptions(F); 831 I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); 832 } 833 834 I->setScalarizeGlobalBehavior(ScalarizeGlobal); 835 836 return I.get(); 837 } 838 839 TargetTransformInfo 840 GCNTargetMachine::getTargetTransformInfo(const Function &F) const { 841 return TargetTransformInfo(GCNTTIImpl(this, F)); 842 } 843 844 //===----------------------------------------------------------------------===// 845 // AMDGPU Pass Setup 846 //===----------------------------------------------------------------------===// 847 848 std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const { 849 return getStandardCSEConfigForOpt(TM->getOptLevel()); 850 } 851 852 namespace { 853 854 class GCNPassConfig final : public AMDGPUPassConfig { 855 public: 856 GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) 857 : AMDGPUPassConfig(TM, PM) { 858 // It is necessary to know the register usage of the entire call graph. We 859 // allow calls without EnableAMDGPUFunctionCalls if they are marked 860 // noinline, so this is always required. 
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse; see the sketch after this function.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}
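
// Illustrative sketch referenced from addStraightLineScalarOptimizationPasses()
// above; the IR and value names are made up, not taken from a real test.
// Splitting constant GEP offsets exposes a common base that SLSR and
// GVN/EarlyCSE can then share:
//
//   %i1   = add i64 %i, 1
//   %gep0 = getelementptr float, float* %p, i64 %i
//   %gep1 = getelementptr float, float* %p, i64 %i1
//
// After SeparateConstOffsetFromGEP, %gep1 becomes %gep0 plus a constant
// offset, so StraightLineStrengthReduce and the following CSE/GVN run reuse
// the single "%p + %i" address computation instead of two address chains.
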
void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  addPass(createAMDGPUCtorDtorLoweringPass());

  // This must occur before inlining, as the inliner will not look through
  // bitcast calls.
  addPass(createAMDGPUFixFunctionBitcastsPass());

  // Run the propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Can increase LDS used by kernel so runs before PromoteAlloca.
  if (EnableLowerModuleLDS) {
    // The pass "amdgpu-replace-lds-use-with-pointer" needs to be run before
    // the pass "amdgpu-lower-module-lds", and it is only required when the
    // "amdgpu-lower-module-lds" pass is enabled.
    if (EnableLDSReplaceWithPointer)
      addPass(createAMDGPUReplaceLDSUseWithPointerPass());

    addPass(createAMDGPULowerModuleLDSPass());
  }

  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createInferAddressSpacesPass());

  addPass(createAtomicExpandPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}
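
// Illustrative sketch of what the InferAddressSpaces run added in
// addIRPasses() above does for optimized builds; the IR below is a
// simplified, made-up example:
//
//   %flat = addrspacecast i32 addrspace(3)* %lds to i32*
//   store i32 0, i32* %flat
//
// is rewritten to store directly through the original addrspace(3) pointer,
// which later lets instruction selection use DS instead of FLAT instructions.
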
void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    addPass(createAMDGPUAttributorPass());

    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch pass may introduce unreachable blocks that can
  // cause unexpected behavior for subsequent passes. Placing it
  // here seems better so that these blocks would get cleaned up by
  // UnreachableBlockElim inserted next in the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
1143 // 1144 // XXX - Can we get away without running DeadMachineInstructionElim again? 1145 addPass(&SIFoldOperandsID); 1146 if (EnableDPPCombine) 1147 addPass(&GCNDPPCombineID); 1148 addPass(&SILoadStoreOptimizerID); 1149 if (isPassEnabled(EnableSDWAPeephole)) { 1150 addPass(&SIPeepholeSDWAID); 1151 addPass(&EarlyMachineLICMID); 1152 addPass(&MachineCSEID); 1153 addPass(&SIFoldOperandsID); 1154 } 1155 addPass(&DeadMachineInstructionElimID); 1156 addPass(createSIShrinkInstructionsPass()); 1157 } 1158 1159 bool GCNPassConfig::addILPOpts() { 1160 if (EnableEarlyIfConversion) 1161 addPass(&EarlyIfConverterID); 1162 1163 TargetPassConfig::addILPOpts(); 1164 return false; 1165 } 1166 1167 bool GCNPassConfig::addInstSelector() { 1168 AMDGPUPassConfig::addInstSelector(); 1169 addPass(&SIFixSGPRCopiesID); 1170 addPass(createSILowerI1CopiesPass()); 1171 return false; 1172 } 1173 1174 bool GCNPassConfig::addIRTranslator() { 1175 addPass(new IRTranslator(getOptLevel())); 1176 return false; 1177 } 1178 1179 void GCNPassConfig::addPreLegalizeMachineIR() { 1180 bool IsOptNone = getOptLevel() == CodeGenOpt::None; 1181 addPass(createAMDGPUPreLegalizeCombiner(IsOptNone)); 1182 addPass(new Localizer()); 1183 } 1184 1185 bool GCNPassConfig::addLegalizeMachineIR() { 1186 addPass(new Legalizer()); 1187 return false; 1188 } 1189 1190 void GCNPassConfig::addPreRegBankSelect() { 1191 bool IsOptNone = getOptLevel() == CodeGenOpt::None; 1192 addPass(createAMDGPUPostLegalizeCombiner(IsOptNone)); 1193 } 1194 1195 bool GCNPassConfig::addRegBankSelect() { 1196 addPass(new RegBankSelect()); 1197 return false; 1198 } 1199 1200 void GCNPassConfig::addPreGlobalInstructionSelect() { 1201 bool IsOptNone = getOptLevel() == CodeGenOpt::None; 1202 addPass(createAMDGPURegBankCombiner(IsOptNone)); 1203 } 1204 1205 bool GCNPassConfig::addGlobalInstructionSelect() { 1206 addPass(new InstructionSelect(getOptLevel())); 1207 return false; 1208 } 1209 1210 void GCNPassConfig::addPreRegAlloc() { 1211 if (LateCFGStructurize) { 1212 addPass(createAMDGPUMachineCFGStructurizerPass()); 1213 } 1214 } 1215 1216 void GCNPassConfig::addFastRegAlloc() { 1217 // FIXME: We have to disable the verifier here because of PHIElimination + 1218 // TwoAddressInstructions disabling it. 1219 1220 // This must be run immediately after phi elimination and before 1221 // TwoAddressInstructions, otherwise the processing of the tied operand of 1222 // SI_ELSE will introduce a copy of the tied operand source after the else. 1223 insertPass(&PHIEliminationID, &SILowerControlFlowID); 1224 1225 insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID); 1226 insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID); 1227 1228 TargetPassConfig::addFastRegAlloc(); 1229 } 1230 1231 void GCNPassConfig::addOptimizedRegAlloc() { 1232 // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation 1233 // instructions that cause scheduling barriers. 1234 insertPass(&MachineSchedulerID, &SIWholeQuadModeID); 1235 insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID); 1236 1237 if (OptExecMaskPreRA) 1238 insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); 1239 1240 if (isPassEnabled(EnablePreRAOptimizations)) 1241 insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID); 1242 1243 // This is not an essential optimization and it has a noticeable impact on 1244 // compilation time, so we only enable it from O2. 
  if (TM->getOptLevel() > CodeGenOpt::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: When an instruction has a killed operand and the instruction is
  // inside a bundle, it seems that only the BUNDLE instruction appears as the
  // kill of the register in LiveVariables. This triggers a verifier failure,
  // so we should fix it and then enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(false));
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
1426 const MemoryBuffer &Buffer = 1427 *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID()); 1428 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1429 RegName.Value.size(), SourceMgr::DK_Error, 1430 "incorrect register class for field", RegName.Value, 1431 None, None); 1432 SourceRange = RegName.SourceRange; 1433 return true; 1434 }; 1435 1436 if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) || 1437 parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) || 1438 parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg)) 1439 return true; 1440 1441 if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG && 1442 !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) { 1443 return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); 1444 } 1445 1446 if (MFI->FrameOffsetReg != AMDGPU::FP_REG && 1447 !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) { 1448 return diagnoseRegisterClass(YamlMFI.FrameOffsetReg); 1449 } 1450 1451 if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG && 1452 !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) { 1453 return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg); 1454 } 1455 1456 auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A, 1457 const TargetRegisterClass &RC, 1458 ArgDescriptor &Arg, unsigned UserSGPRs, 1459 unsigned SystemSGPRs) { 1460 // Skip parsing if it's not present. 1461 if (!A) 1462 return false; 1463 1464 if (A->IsRegister) { 1465 Register Reg; 1466 if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) { 1467 SourceRange = A->RegisterName.SourceRange; 1468 return true; 1469 } 1470 if (!RC.contains(Reg)) 1471 return diagnoseRegisterClass(A->RegisterName); 1472 Arg = ArgDescriptor::createRegister(Reg); 1473 } else 1474 Arg = ArgDescriptor::createStack(A->StackOffset); 1475 // Check and apply the optional mask. 
1476 if (A->Mask) 1477 Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue()); 1478 1479 MFI->NumUserSGPRs += UserSGPRs; 1480 MFI->NumSystemSGPRs += SystemSGPRs; 1481 return false; 1482 }; 1483 1484 if (YamlMFI.ArgInfo && 1485 (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer, 1486 AMDGPU::SGPR_128RegClass, 1487 MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) || 1488 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr, 1489 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr, 1490 2, 0) || 1491 parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass, 1492 MFI->ArgInfo.QueuePtr, 2, 0) || 1493 parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr, 1494 AMDGPU::SReg_64RegClass, 1495 MFI->ArgInfo.KernargSegmentPtr, 2, 0) || 1496 parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID, 1497 AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID, 1498 2, 0) || 1499 parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit, 1500 AMDGPU::SReg_64RegClass, 1501 MFI->ArgInfo.FlatScratchInit, 2, 0) || 1502 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize, 1503 AMDGPU::SGPR_32RegClass, 1504 MFI->ArgInfo.PrivateSegmentSize, 0, 0) || 1505 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX, 1506 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX, 1507 0, 1) || 1508 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY, 1509 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY, 1510 0, 1) || 1511 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ, 1512 AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ, 1513 0, 1) || 1514 parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo, 1515 AMDGPU::SGPR_32RegClass, 1516 MFI->ArgInfo.WorkGroupInfo, 0, 1) || 1517 parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset, 1518 AMDGPU::SGPR_32RegClass, 1519 MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) || 1520 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr, 1521 AMDGPU::SReg_64RegClass, 1522 MFI->ArgInfo.ImplicitArgPtr, 0, 0) || 1523 parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr, 1524 AMDGPU::SReg_64RegClass, 1525 MFI->ArgInfo.ImplicitBufferPtr, 2, 0) || 1526 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX, 1527 AMDGPU::VGPR_32RegClass, 1528 MFI->ArgInfo.WorkItemIDX, 0, 0) || 1529 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY, 1530 AMDGPU::VGPR_32RegClass, 1531 MFI->ArgInfo.WorkItemIDY, 0, 0) || 1532 parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ, 1533 AMDGPU::VGPR_32RegClass, 1534 MFI->ArgInfo.WorkItemIDZ, 0, 0))) 1535 return true; 1536 1537 MFI->Mode.IEEE = YamlMFI.Mode.IEEE; 1538 MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp; 1539 MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals; 1540 MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals; 1541 MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals; 1542 MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals; 1543 1544 return false; 1545 } 1546
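
// Illustrative MIR input for parseMachineFunctionInfo() above. The exact key
// spellings are defined by yaml::SIMachineFunctionInfo (SIMachineFunctionInfo.h),
// so treat this sketch as an approximation rather than a reference:
//
//   machineFunctionInfo:
//     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
//     frameOffsetReg:    '$sgpr33'
//     stackPtrOffsetReg: '$sgpr32'
//     argumentInfo:
//       privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
//       kernargSegmentPtr:    { reg: '$sgpr4_sgpr5' }
//       workGroupIDX:         { reg: '$sgpr6' }
//       workItemIDX:          { reg: '$vgpr0' }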