1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief The AMDGPU target machine contains all of the hardware specific 12 /// information needed to emit code for R600 and SI GPUs. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPUTargetMachine.h" 17 #include "AMDGPUTargetObjectFile.h" 18 #include "AMDGPU.h" 19 #include "AMDGPUTargetTransformInfo.h" 20 #include "R600ISelLowering.h" 21 #include "R600InstrInfo.h" 22 #include "R600MachineScheduler.h" 23 #include "SIISelLowering.h" 24 #include "SIInstrInfo.h" 25 #include "llvm/Analysis/Passes.h" 26 #include "llvm/CodeGen/GlobalISel/IRTranslator.h" 27 #include "llvm/CodeGen/MachineFunctionAnalysis.h" 28 #include "llvm/CodeGen/MachineModuleInfo.h" 29 #include "llvm/CodeGen/Passes.h" 30 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" 31 #include "llvm/CodeGen/TargetPassConfig.h" 32 #include "llvm/IR/Verifier.h" 33 #include "llvm/MC/MCAsmInfo.h" 34 #include "llvm/IR/LegacyPassManager.h" 35 #include "llvm/Support/TargetRegistry.h" 36 #include "llvm/Support/raw_os_ostream.h" 37 #include "llvm/Transforms/IPO.h" 38 #include "llvm/Transforms/Scalar.h" 39 #include "llvm/Transforms/Scalar/GVN.h" 40 #include "llvm/CodeGen/Passes.h" 41 42 using namespace llvm; 43 44 extern "C" void LLVMInitializeAMDGPUTarget() { 45 // Register the target 46 RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); 47 RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); 48 49 PassRegistry *PR = PassRegistry::getPassRegistry(); 50 initializeSILowerI1CopiesPass(*PR); 51 initializeSIFixSGPRCopiesPass(*PR); 52 initializeSIFoldOperandsPass(*PR); 53 
initializeSIShrinkInstructionsPass(*PR); 54 initializeSIFixControlFlowLiveIntervalsPass(*PR); 55 initializeSILoadStoreOptimizerPass(*PR); 56 initializeAMDGPUAnnotateKernelFeaturesPass(*PR); 57 initializeAMDGPUAnnotateUniformValuesPass(*PR); 58 initializeAMDGPUPromoteAllocaPass(*PR); 59 initializeSIAnnotateControlFlowPass(*PR); 60 initializeSIDebuggerInsertNopsPass(*PR); 61 initializeSIInsertWaitsPass(*PR); 62 initializeSIWholeQuadModePass(*PR); 63 initializeSILowerControlFlowPass(*PR); 64 initializeSIDebuggerInsertNopsPass(*PR); 65 } 66 67 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { 68 return make_unique<AMDGPUTargetObjectFile>(); 69 } 70 71 static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { 72 return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>()); 73 } 74 75 static MachineSchedRegistry 76 R600SchedRegistry("r600", "Run R600's custom scheduler", 77 createR600MachineScheduler); 78 79 static MachineSchedRegistry 80 SISchedRegistry("si", "Run SI's custom scheduler", 81 createSIMachineScheduler); 82 83 static StringRef computeDataLayout(const Triple &TT) { 84 if (TT.getArch() == Triple::r600) { 85 // 32-bit pointers. 86 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 87 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; 88 } 89 90 // 32-bit private, local, and region pointers. 64-bit global, constant and 91 // flat. 92 return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" 93 "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 94 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; 95 } 96 97 LLVM_READNONE 98 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { 99 if (!GPU.empty()) 100 return GPU; 101 102 // HSA only supports CI+, so change the default GPU to a CI for HSA. 103 if (TT.getArch() == Triple::amdgcn) 104 return (TT.getOS() == Triple::AMDHSA) ? 
"kaveri" : "tahiti"; 105 106 return "r600"; 107 } 108 109 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { 110 if (!RM.hasValue()) 111 return Reloc::PIC_; 112 return *RM; 113 } 114 115 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, 116 StringRef CPU, StringRef FS, 117 TargetOptions Options, 118 Optional<Reloc::Model> RM, 119 CodeModel::Model CM, 120 CodeGenOpt::Level OptLevel) 121 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), 122 FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), 123 TLOF(createTLOF(getTargetTriple())), 124 Subtarget(TT, getTargetCPU(), FS, *this), IntrinsicInfo() { 125 setRequiresStructuredCFG(true); 126 initAsmInfo(); 127 } 128 129 AMDGPUTargetMachine::~AMDGPUTargetMachine() { } 130 131 //===----------------------------------------------------------------------===// 132 // R600 Target Machine (R600 -> Cayman) 133 //===----------------------------------------------------------------------===// 134 135 R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, 136 StringRef CPU, StringRef FS, 137 TargetOptions Options, 138 Optional<Reloc::Model> RM, 139 CodeModel::Model CM, CodeGenOpt::Level OL) 140 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} 141 142 //===----------------------------------------------------------------------===// 143 // GCN Target Machine (SI+) 144 //===----------------------------------------------------------------------===// 145 146 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, 147 StringRef CPU, StringRef FS, 148 TargetOptions Options, 149 Optional<Reloc::Model> RM, 150 CodeModel::Model CM, CodeGenOpt::Level OL) 151 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} 152 153 //===----------------------------------------------------------------------===// 154 // AMDGPU Pass Setup 155 //===----------------------------------------------------------------------===// 156 157 
namespace {

/// Pass configuration shared by the R600 and GCN targets: selects the
/// machine scheduler, runs the common IR pass pipeline, and disables
/// codegen passes the hardware cannot use.
class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {

    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  // Pre-SI (<= Northern Islands) subtargets get the R600 scheduler; SI+
  // gets the SI scheduler only when the subtarget opts in, otherwise the
  // generic scheduler (nullptr) is used.
  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
      return createR600MachineScheduler(C);
    else if (ST.enableSIScheduler())
      return createSIMachineScheduler(C);
    return nullptr;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

/// Pass configuration for the R600 family (R600 -> Cayman).
class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  bool addPreISel() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

/// Pass configuration for GCN subtargets (SI and newer).
class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }
  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
  bool addIRTranslator() override;
  bool addRegBankSelect() override;
#endif
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // End of anonymous namespace

TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(
        AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
  });
}

// GVN only at -O3; EarlyCSE is the cheaper substitute at lower levels.
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  // There is no reason to run these.
  // NOTE(review): StackMapLiveness and FuncletLayout are already disabled in
  // the AMDGPUPassConfig constructor above; the repeats here look redundant.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  addPass(createAMDGPUOpenCLImageTypeLoweringPass());

  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
  const AMDGPUSubtarget &ST = *TM.getSubtargetImpl();
  if (TM.getOptLevel() > CodeGenOpt::None && ST.isPromoteAllocaEnabled()) {
    addPass(createAMDGPUPromoteAlloca(&TM));
    // SROA cleans up whatever PromoteAlloca left in private memory.
    addPass(createSROAPass());
  }

  addStraightLineScalarOptimizationPasses();

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

bool
AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();
  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
  if (ST.IsIRStructurizerEnabled())
    addPass(createStructurizeCFGPass());
  addPass(createR600TextureIntrinsicsReplacer());
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger(*TM));
}

void R600PassConfig::addPreSched2() {
  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
  addPass(createR600EmitClauseMarkers(), false);
  if (ST.isIfCvtEnabled())
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(*TM), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(*TM), false);
  addPass(createR600ControlFlowFinalizer(*TM), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(&AMDGPUAnnotateKernelFeaturesID);
  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  addPass(createSinkingPass());
  addPass(createSITypeRewriter());
  addPass(createAMDGPUAnnotateUniformValues());
  addPass(createSIAnnotateControlFlowPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

// Register bank selection is not implemented for AMDGPU yet.
bool GCNPassConfig::addRegBankSelect() {
  return false;
}
#endif

void GCNPassConfig::addPreRegAlloc() {
  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();

  // This needs to be run directly before register allocation because
  // earlier passes might recompute live intervals.
  // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
  if (getOptLevel() > CodeGenOpt::None) {
    insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
  }

  if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
    // Don't do this with no optimizations since it throws away debug info by
    // merging nonadjacent loads.

    // This should be run after scheduling, but before register allocation. It
    // also needs extra copies to the address operand to be eliminated.
    insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
    insertPass(&MachineSchedulerID, &RegisterCoalescerID);
  }
  addPass(createSIShrinkInstructionsPass(), false);
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because
  // if there are multiple scheduling regions in a basic block, the regions
  // are scheduled bottom up, so when we begin to schedule a region we don't
  // know what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(createSIInsertWaitsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSILowerControlFlowPass(), false);
  addPass(createSIDebuggerInsertNopsPass(), false);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(this, PM);
}