//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the targets.
  RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
  RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIFixControlFlowLiveIntervalsPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
  initializeSIInsertWaitsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);
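
// Both registries above hook into the generic -misched command line option,
// so e.g. "llc -march=amdgcn -misched=si" selects the SI scheduler explicitly.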

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}
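
// A quick legend for the datalayout strings above: "e" = little endian,
// "p<AS>:<size>:<align>" = pointer size and alignment, optionally for a
// specific address space <AS>, "i64:64" and "v<size>:<align>" = integer and
// vector alignments, and "n32:64" = natively supported integer widths.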

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // HSA only supports CI+, so change the default GPU to a CI for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  if (!RM.hasValue())
    return Reloc::PIC_;
  return *RM;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         CodeModel::Model CM,
                                         CodeGenOpt::Level OptLevel)
  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                      FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
    TLOF(createTLOF(getTargetTriple())),
    IntrinsicInfo() {
  setRequiresStructuredCFG(true);
  initAsmInfo();
}

AMDGPUTargetMachine::~AMDGPUTargetMachine() { }

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
    Subtarget(TT, getTargetCPU(), FS, *this) {}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
    Subtarget(TT, getTargetCPU(), FS, *this) {}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never
    // do anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    const SISubtarget *ST = getGCNTargetMachine().getSubtargetImpl();
    if (ST->enableSIScheduler())
      return createSIMachineScheduler(C);
    // Returning null falls back to the default machine scheduler.
    return nullptr;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
  bool addIRTranslator() override;
  bool addRegBankSelect() override;
#endif
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // End of anonymous namespace

TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(
        AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
  });
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  addPass(createAMDGPUOpenCLImageTypeLoweringPass());

  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
  const AMDGPUSubtarget &ST = *TM.getSubtargetImpl();
  if (TM.getOptLevel() > CodeGenOpt::None && ST.isPromoteAllocaEnabled()) {
    addPass(createAMDGPUPromoteAlloca(&TM));
    addPass(createSROAPass());
  }

  addStraightLineScalarOptimizationPasses();

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  addPass(createR600TextureIntrinsicsReplacer());
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger(*TM));
}
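
// In the two hooks below, the trailing 'false' passed to addPass disables the
// machine verifier run that would otherwise be scheduled after each pass.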
void R600PassConfig::addPreSched2() {
  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
  addPass(createR600EmitClauseMarkers(), false);
  if (ST.isIfCvtEnabled())
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(*TM), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(*TM), false);
  addPass(createR600ControlFlowFinalizer(*TM), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(&AMDGPUAnnotateKernelFeaturesID);
  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  addPass(createSinkingPass());
  addPass(createSITypeRewriter());
  addPass(createAMDGPUAnnotateUniformValues());
  addPass(createSIAnnotateControlFlowPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  return false;
}
#endif

void GCNPassConfig::addPreRegAlloc() {
  const SISubtarget &ST = *getGCNTargetMachine().getSubtargetImpl();

  // This needs to be run directly before register allocation because earlier
  // passes might recompute live intervals.
  // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the
  // pass.
  if (getOptLevel() > CodeGenOpt::None) {
    insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
  }

  if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
    // Don't do this with no optimizations since it throws away debug info by
    // merging nonadjacent loads.
    //
    // This should be run after scheduling, but before register allocation. It
    // also needs the extra copies to the address operand to be eliminated.
    insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
    insertPass(&MachineSchedulerID, &RegisterCoalescerID);
  }
  addPass(createSIShrinkInstructionsPass());
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  // The hazard recognizer that runs as part of the post-RA scheduler does not
  // guarantee that it can handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(createSIInsertWaitsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSILowerControlFlowPass());
  addPass(createSIDebuggerInsertNopsPass());
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(this, PM);
}