//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));
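
// Each cl::opt in this file registers an ordinary llc command-line flag, so
// the defaults above and below can be overridden per run; an illustrative
// invocation (file name is a placeholder):
//
//   llc -march=amdgcn -mcpu=tahiti -amdgpu-sroa=0 input.ll
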
// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(false),
  cl::Hidden);

extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
  RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIFixControlFlowLiveIntervalsPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);
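
// A note on the data layout strings returned below: "e" means little-endian;
// "pN:Size:Align" gives the size and ABI alignment, in bits, of pointers in
// address space N; "iSize:Align" and "vSize:Align" do the same for integer
// and vector types; and "n32:64" lists the natively supported integer widths.
// See the data layout section of the LLVM LangRef for the full grammar.
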
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // HSA only supports CI+, so change the default GPU to a CI for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         CodeModel::Model CM,
                                         CodeGenOpt::Level OptLevel)
  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                      FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
    TLOF(createTLOF(getTargetTriple())),
    IntrinsicInfo() {
  setRequiresStructuredCFG(true);
  initAsmInfo();
}

AMDGPUTargetMachine::~AMDGPUTargetMachine() { }

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {
struct SIGISelActualAccessor : public GISelAccessor {
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  const AMDGPUCallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
};
} // End anonymous namespace.
#endif
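
// When LLVM_BUILD_GLOBAL_ISEL is off, getSubtargetImpl below installs the
// plain GISelAccessor instead; its query methods (e.g. getCallLowering())
// are expected to return null, leaving SelectionDAG as the only instruction
// selector.
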
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);

#ifndef LLVM_BUILD_GLOBAL_ISEL
    GISelAccessor *GISel = new GISelAccessor();
#else
    SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
    GISel->CallLoweringInfo.reset(
      new AMDGPUCallLowering(*I->getTargetLowering()));
#endif

    I->setGISelAccessor(*GISel);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never
    // do anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  void addIRPasses() override;
  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
#endif
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // End of anonymous namespace

TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(AMDGPUTTIImpl(this, F));
  });
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}
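
// A rough sketch of what the straight-line scalar optimizations below do to
// address arithmetic (illustrative IR, not taken from a real test):
//
//   %j  = add i64 %i, 1
//   %p0 = getelementptr float, float* %base, i64 %i
//   %p1 = getelementptr float, float* %base, i64 %j
//
// SeparateConstOffsetFromGEP splits the constant offset out of the second
// GEP, StraightLineStrengthReduce can then rewrite %p1 in terms of %p0, and
// EarlyCSE/GVN clean up the common subexpressions those rewrites expose.
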
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN
  // or EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  addPass(createAMDGPUOpenCLImageTypeLoweringPass());

  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca(&TM));

    if (EnableSROA)
      addPass(createSROAPass());
  }

  addStraightLineScalarOptimizationPasses();

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
  return false;
}
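
// The base TargetPassConfig::addGCPasses schedules the GC machine-code
// analysis; the override below returns false so that no GC analysis is
// scheduled, since GC is not supported.
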
bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger(*TM));
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(*TM), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(*TM), false);
  addPass(createR600ControlFlowFinalizer(*TM), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return nullptr;
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(&AMDGPUAnnotateKernelFeaturesID);
  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  addPass(createSinkingPass());
  addPass(createSITypeRewriter());
  addPass(createAMDGPUAnnotateUniformValues());
  addPass(createSIAnnotateControlFlowPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
}
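
// Illustrative MIR for the SIFoldOperands ordering above (register names
// invented):
//
//   %a = V_MOV_B32 1.0
//   %b = COPY %a
//   %c = V_ADD_F32 %b, %x
//
// Once PeepholeOptimizer has forwarded %a through the COPY, SIFoldOperands
// can fold the 1.0 inline constant directly into the V_ADD_F32, and
// DeadMachineInstructionElim deletes the V_MOV_B32 and COPY left behind.
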
void GCNPassConfig::addIRPasses() {
  // TODO: May want to move later or split into an early and late one.
  addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));

  AMDGPUPassConfig::addIRPasses();
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  return false;
}
#endif

void GCNPassConfig::addPreRegAlloc() {
  if (getOptLevel() > CodeGenOpt::None) {
    // Don't do this with no optimizations since it throws away debug info by
    // merging nonadjacent loads.

    // This should be run after scheduling, but before register allocation. It
    // also needs extra copies to the address operand to be eliminated.

    // FIXME: Move pre-RA and remove extra reg coalescer run.
    insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
    insertPass(&MachineSchedulerID, &RegisterCoalescerID);
  }

  addPass(createSIShrinkInstructionsPass());
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.
  insertPass(&TwoAddressInstructionPassID, &SILowerControlFlowID, false);

  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  // This needs to be run directly before register allocation because earlier
  // passes might recompute live intervals.
  insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);

  // TODO: It might be better to run this right after phi elimination, but for
  // now that would require not running the verifier.
  insertPass(&RenameIndependentSubregsID, &SILowerControlFlowID);

  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(createSIInsertWaitsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(&SIInsertSkipsPassID);
  addPass(createSIDebuggerInsertNopsPass());
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(this, PM);
}