//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}


/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for VGPRs"));


static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
  "fast", "fast register allocator", createFastSGPRRegisterAllocator);


static VGPRRegisterRegAlloc basicRegAllocVGPR(
  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
  "fast", "fast register allocator", createFastVGPRRegisterAllocator);
}

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// This option is used in lit tests to prevent dead-coding of the patterns
// being inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
              cl::init(true), cl::Hidden,
              cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLDSReplaceWithPointer(
    "amdgpu-enable-lds-replace-with-pointer",
    cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSIOptimizeVGPRLiveRangePass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUFixFunctionBitcastsPass(*PR);
  initializeAMDGPUCtorDtorLoweringPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAttributorPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
  initializeAMDGPULowerModuleLDSPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertHardClausesPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIPreEmitPeepholePass(*PR);
  initializeSILateBranchLoweringPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeSIPostRABundlerPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisPass(*PR);
  initializeGCNNSAReassignPass(*PR);
  initializeGCNPreRAOptimizationsPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);
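
// Note: each MachineSchedRegistry entry above hooks its factory into the
// generic -misched= machinery, so one of these strategies can be selected by
// name from the command line. Sketch of a typical invocation (illustrative
// only; the valid names are the first argument of each registry entry):
//
//   llc -mtriple=amdgcn -misched=gcn-max-occupancy input.ll
//   llc -mtriple=amdgcn -misched=gcn-ilp input.ll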

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
         "-ni:7";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().startswith("__asan_") ||
           F->getName().startswith("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
  bool PromoteKernelArguments =
      EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createFunctionInliningPass();
  }

  Builder.addExtension(
      PassManagerBuilder::EP_ModuleOptimizerEarly,
      [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                                 legacy::PassManagerBase &PM) {
        if (AMDGPUAA) {
          PM.add(createAMDGPUAAWrapperPass());
          PM.add(createAMDGPUExternalAAWrapperPass());
        }
        PM.add(createAMDGPUUnifyMetadataPass());
        PM.add(createAMDGPUPrintfRuntimeBinding());
        if (Internalize)
          PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createAMDGPUPropagateAttributesLatePass(this));
        if (Internalize)
          PM.add(createGlobalDCEPass());
        if (EarlyInline)
          PM.add(createAMDGPUAlwaysInlinePass(false));
      });

  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
                                        legacy::PassManagerBase &PM) {
        if (AMDGPUAA) {
          PM.add(createAMDGPUAAWrapperPass());
          PM.add(createAMDGPUExternalAAWrapperPass());
        }
        PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
        PM.add(llvm::createAMDGPUUseNativeCallsPass());
        if (LibCallSimplify)
          PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
      });

  Builder.addExtension(
      PassManagerBuilder::EP_CGSCCOptimizerLate,
      [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
                                          legacy::PassManagerBase &PM) {
        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (PromoteKernelArguments)
          PM.add(createAMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        PM.add(createInferAddressSpacesPass());

        // This should run after inlining to have any chance of doing anything,
        // and before other cleanup optimizations.
        PM.add(createAMDGPULowerKernelAttributesPass());

        // Promote alloca to vector before SROA and loop unroll. If we manage
        // to eliminate allocas before unroll we may choose to unroll less.
        if (EnableOpt)
          PM.add(createAMDGPUPromoteAllocaToVector());
      });
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, ModulePassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-propagate-attributes-late") {
          PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
          return true;
        }
        if (PassName == "amdgpu-unify-metadata") {
          PM.addPass(AMDGPUUnifyMetadataPass());
          return true;
        }
        if (PassName == "amdgpu-printf-runtime-binding") {
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
          return true;
        }
        if (PassName == "amdgpu-always-inline") {
          PM.addPass(AMDGPUAlwaysInlinePass());
          return true;
        }
        if (PassName == "amdgpu-replace-lds-use-with-pointer") {
          PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
          return true;
        }
        if (PassName == "amdgpu-lower-module-lds") {
          PM.addPass(AMDGPULowerModuleLDSPass());
          return true;
        }
        return false;
      });
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, FunctionPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-simplifylib") {
          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
          return true;
        }
        if (PassName == "amdgpu-usenative") {
          PM.addPass(AMDGPUUseNativeCallsPass());
          return true;
        }
        if (PassName == "amdgpu-promote-alloca") {
          PM.addPass(AMDGPUPromoteAllocaPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-alloca-to-vector") {
          PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
          return true;
        }
        if (PassName == "amdgpu-lower-kernel-attributes") {
          PM.addPass(AMDGPULowerKernelAttributesPass());
          return true;
        }
        if (PassName == "amdgpu-propagate-attributes-early") {
          PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-kernel-arguments") {
          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
          return true;
        }
        return false;
      });

  PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
    FAM.registerPass([&] { return AMDGPUAA(); });
  });

  PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
    if (AAName == "amdgpu-aa") {
      AAM.registerFunctionAnalysis<AMDGPUAA>();
      return true;
    }
    return false;
  });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
        }
        PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
        if (InternalizeSymbols) {
          PM.addPass(GlobalDCEPass());
        }
        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a load of a generic pointer.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::make_pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::make_pair(nullptr, -1);
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  addPass(createAMDGPUCtorDtorLoweringPass());

  // This must occur before inlining, as the inliner will not look through
  // bitcast calls.
  addPass(createAMDGPUFixFunctionBitcastsPass());

  // Run the propagate-attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Can increase LDS used by kernel so runs before PromoteAlloca
  if (EnableLowerModuleLDS) {
    // The pass "amdgpu-replace-lds-use-with-pointer" needs to run before
    // "amdgpu-lower-module-lds", and it should only be run when the
    // "amdgpu-lower-module-lds" pass is enabled.
    if (EnableLDSReplaceWithPointer)
      addPass(createAMDGPUReplaceLDSUseWithPointerPass());

    addPass(createAMDGPULowerModuleLDSPass());
  }

  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createInferAddressSpacesPass());

  addPass(createAtomicExpandPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    addPass(createAMDGPUAttributorPass());

    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here means any
  // unreachable blocks it creates get cleaned up by UnreachableBlockElim,
  // which is inserted next in the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
  insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOpt::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: when an instruction has a killed operand and the instruction is
  // inside a bundle, it seems that only the BUNDLE instruction appears as the
  // kill of the register in LiveVariables. This triggers a verifier failure;
  // we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
  "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(false));
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler is not
  // guaranteed to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
  MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
  MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
  MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;

  return false;
}