//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"

#include "llvm/Analysis/Passes.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));
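// These cl::opt declarations are registered as llc/opt command-line flags.
// An illustrative invocation (the flag spellings match the declarations in
// this file; the input file name is hypothetical):
//   llc -march=amdgcn -amdgpu-sroa=0 -amdgpu-load-store-vectorizer=1 kernel.ll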
// Option to control the load/store vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(false),
  cl::Hidden);

extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the targets.
  RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
  RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIFixControlFlowLiveIntervalsPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
  initializeSIInsertWaitsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // HSA only supports CI+, so change the default GPU to a CI for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         CodeModel::Model CM,
                                         CodeGenOpt::Level OptLevel)
  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                      FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
    TLOF(createTLOF(getTargetTriple())),
    IntrinsicInfo() {
  setRequiresStructuredCFG(true);
  initAsmInfo();
}

AMDGPUTargetMachine::~AMDGPUTargetMachine() { }
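// Prefer the per-function "target-cpu" attribute when one is present;
// otherwise fall back to the CPU this TargetMachine was constructed with.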
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {
struct SIGISelActualAccessor : public GISelAccessor {
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  const AMDGPUCallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
};
} // End anonymous namespace.
#endif

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
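// Subtargets are cached by (GPU name + feature string), so functions carrying
// different "target-cpu"/"target-features" attributes each get a distinct
// SISubtarget. The GlobalISel accessor is attached once, when the subtarget
// is first created.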
const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);

#ifndef LLVM_BUILD_GLOBAL_ISEL
    GISelAccessor *GISel = new GISelAccessor();
#else
    SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
    GISel->CallLoweringInfo.reset(
      new AMDGPUCallLowering(*I->getTargetLowering()));
#endif

    I->setGISelAccessor(*GISel);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {

    // Exceptions and StackMaps are not supported, so these passes will never
    // do anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  void addIRPasses() override;
  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
#endif
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // End of anonymous namespace

TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(AMDGPUTTIImpl(this, F));
  });
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}
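// Scalar optimizations that mostly benefit straight-line code: split constant
// offsets out of GEPs, speculate cheap instructions, run straight-line
// strength reduction, and then clean up the redundancies those passes expose.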
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  addPass(createAMDGPUOpenCLImageTypeLoweringPass());

  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca(&TM));

    if (EnableSROA)
      addPass(createSROAPass());
  }

  addStraightLineScalarOptimizationPasses();

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
  return false;
}
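// GC is not supported; returning false reports to TargetPassConfig that no
// GC passes were added.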
bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger(*TM));
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(*TM), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(*TM), false);
  addPass(createR600ControlFlowFinalizer(*TM), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return nullptr;
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(&AMDGPUAnnotateKernelFeaturesID);
  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  addPass(createSinkingPass());
  addPass(createSITypeRewriter());
  addPass(createAMDGPUAnnotateUniformValues());
  addPass(createSIAnnotateControlFlowPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
}

void GCNPassConfig::addIRPasses() {
  // TODO: May want to move later or split into an early and late one.
  addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));

  AMDGPUPassConfig::addIRPasses();
}
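// After the shared AMDGPU DAG instruction selector runs, lower i1 copies and
// repair illegal VGPR-to-SGPR copies that selection can introduce.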
bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  return false;
}
#endif

void GCNPassConfig::addPreRegAlloc() {
  // This needs to be run directly before register allocation because
  // earlier passes might recompute live intervals.
  // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the
  // pass.
  if (getOptLevel() > CodeGenOpt::None) {
    insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
  }

  if (getOptLevel() > CodeGenOpt::None) {
    // Don't do this with no optimizations since it throws away debug info by
    // merging nonadjacent loads.

    // This should be run after scheduling, but before register allocation. It
    // also needs extra copies to the address operand to be eliminated.

    // FIXME: Move pre-RA and remove extra reg coalescer run.
    insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
    insertPass(&MachineSchedulerID, &RegisterCoalescerID);
  }

  addPass(createSIShrinkInstructionsPass());
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(createSIInsertWaitsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSILowerControlFlowPass());
  addPass(createSIDebuggerInsertNopsPass());
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(this, PM);
}