//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSchedStrategy.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIFixControlFlowLiveIntervalsPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new ScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // HSA only supports CI+, so change the default GPU to a CI for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         CodeModel::Model CM,
                                         CodeGenOpt::Level OptLevel)
  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                      FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
    TLOF(createTLOF(getTargetTriple())),
    IntrinsicInfo() {
  setRequiresStructuredCFG(true);
  initAsmInfo();
}

AMDGPUTargetMachine::~AMDGPUTargetMachine() { }

/// Return the GPU name to build a subtarget for \p F: the function's
/// "target-cpu" attribute if present, otherwise the target machine's default
/// CPU.
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

/// Return the feature string for \p F: the function's "target-features"
/// attribute if present, otherwise the target machine's default feature
/// string.
StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  // Subtargets are cached per (GPU, feature string) pair.
  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {
struct SIGISelActualAccessor : public GISelAccessor {
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  const AMDGPUCallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
};
} // End anonymous namespace.
#endif

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);

#ifndef LLVM_BUILD_GLOBAL_ISEL
    GISelAccessor *GISel = new GISelAccessor();
#else
    SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
    GISel->CallLoweringInfo.reset(
      new AMDGPUCallLowering(*I->getTargetLowering()));
#endif

    I->setGISelAccessor(*GISel);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {

    // Exceptions and StackMaps are not supported, so these passes will never
    // do anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  void addIRPasses() override;
  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
#endif
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // End of anonymous namespace

TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(AMDGPUTTIImpl(this, F));
  });
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN
  // or EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  addPass(createAMDGPUOpenCLImageTypeLoweringPass());

  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca(&TM));

    if (EnableSROA)
      addPass(createSROAPass());

    addStraightLineScalarOptimizationPasses();
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger(*TM));
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(*TM), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(*TM), false);
  addPass(createR600ControlFlowFinalizer(*TM), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(&AMDGPUAnnotateKernelFeaturesID);
  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  addPass(createSinkingPass());
  addPass(createSITypeRewriter());
  addPass(createAMDGPUAnnotateUniformValues());
  addPass(createSIAnnotateControlFlowPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies, making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions left over after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
}

void GCNPassConfig::addIRPasses() {
  // TODO: May want to move later or split into an early and late one.
  addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));

  AMDGPUPassConfig::addIRPasses();
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  return false;
}
#endif

void GCNPassConfig::addPreRegAlloc() {
  addPass(createSIShrinkInstructionsPass());
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  // This needs to be run directly before register allocation because earlier
  // passes might recompute live intervals.
  insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  // The hazard recognizer that runs as part of the post-RA scheduler does not
  // guarantee that it can handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(createSIInsertWaitsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(&SIInsertSkipsPassID);
  addPass(createSIDebuggerInsertNopsPass());
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(this, PM);
}