//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"

#include "llvm/Analysis/Passes.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the targets.
  RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
  RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIFixControlFlowLiveIntervalsPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
  initializeSIInsertWaitsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);
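
// The registries above hook both strategies into the generic machine
// scheduler's -misched=<name> selection, so (assuming the standard llc
// driver) either can be requested explicitly, e.g.:
//
//   llc -march=amdgcn -mcpu=tahiti -misched=si input.ll
//
// The SI scheduler is also chosen automatically when the subtarget reports
// enableSIScheduler() (see GCNPassConfig::createMachineScheduler below).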

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // HSA only supports CI+, so change the default GPU to a CI for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  if (!RM.hasValue())
    return Reloc::PIC_;
  return *RM;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         CodeModel::Model CM,
                                         CodeGenOpt::Level OptLevel)
  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                      FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
    TLOF(createTLOF(getTargetTriple())),
    IntrinsicInfo() {
  setRequiresStructuredCFG(true);
  initAsmInfo();
}

AMDGPUTargetMachine::~AMDGPUTargetMachine() {}

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {
struct SIGISelActualAccessor : public GISelAccessor {
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  const AMDGPUCallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
};
} // End anonymous namespace.
#endif

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);

#ifndef LLVM_BUILD_GLOBAL_ISEL
    GISelAccessor *GISel = new GISelAccessor();
#else
    SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
    GISel->CallLoweringInfo.reset(
      new AMDGPUCallLowering(*I->getTargetLowering()));
#endif

    I->setGISelAccessor(*GISel);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never
    // do anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
  bool addIRTranslator() override;
  bool addRegBankSelect() override;
#endif
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // End of anonymous namespace

TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(AMDGPUTTIImpl(this, F));
  });
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  // Function calls are not supported, so make sure we inline everything.
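  // AMDGPUAlwaysInline only marks functions with the alwaysinline attribute;
  // the generic AlwaysInliner pass below performs the actual inlining.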
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  addPass(createAMDGPUOpenCLImageTypeLoweringPass());

  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca(&TM));

    if (EnableSROA)
      addPass(createSROAPass());
  }

  addStraightLineScalarOptimizationPasses();

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  // %0 = add %a, %b
  // %1 = add %b, %a
  //
  // and
  //
  // %0 = shl nsw %a, 2
  // %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  addPass(createR600TextureIntrinsicsReplacer());
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger(*TM));
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(*TM), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(*TM), false);
  addPass(createR600ControlFlowFinalizer(*TM), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return nullptr;
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
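  // AMDGPUAnnotateKernelFeatures adds function attributes describing which
  // hardware features (e.g. special input registers) each kernel requires.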
  addPass(&AMDGPUAnnotateKernelFeaturesID);
  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  addPass(createSinkingPass());
  addPass(createSITypeRewriter());
  addPass(createAMDGPUAnnotateUniformValues());
  addPass(createSIAnnotateControlFlowPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions left over after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  return false;
}
#endif

void GCNPassConfig::addPreRegAlloc() {
  // This needs to be run directly before register allocation because
  // earlier passes might recompute live intervals.
  // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the
  // pass.
  if (getOptLevel() > CodeGenOpt::None) {
    insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
  }

  if (getOptLevel() > CodeGenOpt::None) {
    // Don't do this with no optimizations since it throws away debug info by
    // merging nonadjacent loads.

    // This should be run after scheduling, but before register allocation. It
    // also needs extra copies to the address operand to be eliminated.

    // FIXME: Move pre-RA and remove extra reg coalescer run.
    insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
    insertPass(&MachineSchedulerID, &RegisterCoalescerID);
  }

  addPass(createSIShrinkInstructionsPass());
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee that it can handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(createSIInsertWaitsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSILowerControlFlowPass());
  addPass(createSIDebuggerInsertNopsPass());
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(this, PM);
}