1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// \brief The AMDGPU target machine contains all of the hardware specific 12 /// information needed to emit code for R600 and SI GPUs. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPUTargetMachine.h" 17 #include "AMDGPUTargetObjectFile.h" 18 #include "AMDGPU.h" 19 #include "AMDGPUTargetTransformInfo.h" 20 #include "R600ISelLowering.h" 21 #include "R600InstrInfo.h" 22 #include "R600MachineScheduler.h" 23 #include "SIISelLowering.h" 24 #include "SIInstrInfo.h" 25 #include "llvm/Analysis/Passes.h" 26 #include "llvm/CodeGen/GlobalISel/IRTranslator.h" 27 #include "llvm/CodeGen/MachineFunctionAnalysis.h" 28 #include "llvm/CodeGen/MachineModuleInfo.h" 29 #include "llvm/CodeGen/Passes.h" 30 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" 31 #include "llvm/CodeGen/TargetPassConfig.h" 32 #include "llvm/IR/Verifier.h" 33 #include "llvm/MC/MCAsmInfo.h" 34 #include "llvm/IR/LegacyPassManager.h" 35 #include "llvm/Support/TargetRegistry.h" 36 #include "llvm/Support/raw_os_ostream.h" 37 #include "llvm/Transforms/IPO.h" 38 #include "llvm/Transforms/Scalar.h" 39 #include <llvm/CodeGen/Passes.h> 40 41 using namespace llvm; 42 43 extern "C" void LLVMInitializeAMDGPUTarget() { 44 // Register the target 45 RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); 46 RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); 47 48 PassRegistry *PR = PassRegistry::getPassRegistry(); 49 initializeSILowerI1CopiesPass(*PR); 50 initializeSIFixSGPRCopiesPass(*PR); 51 initializeSIFoldOperandsPass(*PR); 52 initializeSIShrinkInstructionsPass(*PR); 53 initializeSIFixControlFlowLiveIntervalsPass(*PR); 54 initializeSILoadStoreOptimizerPass(*PR); 55 initializeAMDGPUAnnotateKernelFeaturesPass(*PR); 56 initializeAMDGPUAnnotateUniformValuesPass(*PR); 57 initializeAMDGPUPromoteAllocaPass(*PR); 58 initializeSIAnnotateControlFlowPass(*PR); 59 initializeSIDebuggerInsertNopsPass(*PR); 60 initializeSIInsertWaitsPass(*PR); 61 initializeSIWholeQuadModePass(*PR); 62 initializeSILowerControlFlowPass(*PR); 63 initializeSIDebuggerInsertNopsPass(*PR); 64 } 65 66 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { 67 return make_unique<AMDGPUTargetObjectFile>(); 68 } 69 70 static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { 71 return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>()); 72 } 73 74 static MachineSchedRegistry 75 R600SchedRegistry("r600", "Run R600's custom scheduler", 76 createR600MachineScheduler); 77 78 static MachineSchedRegistry 79 SISchedRegistry("si", "Run SI's custom scheduler", 80 createSIMachineScheduler); 81 82 static StringRef computeDataLayout(const Triple &TT) { 83 if (TT.getArch() == Triple::r600) { 84 // 32-bit pointers. 85 return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 86 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; 87 } 88 89 // 32-bit private, local, and region pointers. 64-bit global, constant and 90 // flat. 91 return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" 92 "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" 93 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; 94 } 95 96 LLVM_READNONE 97 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { 98 if (!GPU.empty()) 99 return GPU; 100 101 // HSA only supports CI+, so change the default GPU to a CI for HSA. 102 if (TT.getArch() == Triple::amdgcn) 103 return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti"; 104 105 return "r600"; 106 } 107 108 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { 109 if (!RM.hasValue()) 110 return Reloc::PIC_; 111 return *RM; 112 } 113 114 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, 115 StringRef CPU, StringRef FS, 116 TargetOptions Options, 117 Optional<Reloc::Model> RM, 118 CodeModel::Model CM, 119 CodeGenOpt::Level OptLevel) 120 : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), 121 FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), 122 TLOF(createTLOF(getTargetTriple())), 123 Subtarget(TT, getTargetCPU(), FS, *this), IntrinsicInfo() { 124 setRequiresStructuredCFG(true); 125 initAsmInfo(); 126 } 127 128 AMDGPUTargetMachine::~AMDGPUTargetMachine() { } 129 130 //===----------------------------------------------------------------------===// 131 // R600 Target Machine (R600 -> Cayman) 132 //===----------------------------------------------------------------------===// 133 134 R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, 135 StringRef CPU, StringRef FS, 136 TargetOptions Options, 137 Optional<Reloc::Model> RM, 138 CodeModel::Model CM, CodeGenOpt::Level OL) 139 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} 140 141 //===----------------------------------------------------------------------===// 142 // GCN Target Machine (SI+) 143 //===----------------------------------------------------------------------===// 144 145 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, 146 StringRef CPU, StringRef FS, 147 TargetOptions Options, 148 Optional<Reloc::Model> RM, 149 CodeModel::Model CM, CodeGenOpt::Level OL) 150 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} 151 152 //===----------------------------------------------------------------------===// 153 // AMDGPU Pass Setup 154 //===----------------------------------------------------------------------===// 155 156 namespace { 157 158 class AMDGPUPassConfig : public TargetPassConfig { 159 public: 160 AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) 161 : TargetPassConfig(TM, PM) { 162 163 // Exceptions and StackMaps are not supported, so these passes will never do 164 // anything. 165 disablePass(&StackMapLivenessID); 166 disablePass(&FuncletLayoutID); 167 } 168 169 AMDGPUTargetMachine &getAMDGPUTargetMachine() const { 170 return getTM<AMDGPUTargetMachine>(); 171 } 172 173 ScheduleDAGInstrs * 174 createMachineScheduler(MachineSchedContext *C) const override { 175 const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); 176 if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) 177 return createR600MachineScheduler(C); 178 else if (ST.enableSIScheduler()) 179 return createSIMachineScheduler(C); 180 return nullptr; 181 } 182 183 void addIRPasses() override; 184 void addCodeGenPrepare() override; 185 bool addPreISel() override; 186 bool addInstSelector() override; 187 bool addGCPasses() override; 188 }; 189 190 class R600PassConfig final : public AMDGPUPassConfig { 191 public: 192 R600PassConfig(TargetMachine *TM, PassManagerBase &PM) 193 : AMDGPUPassConfig(TM, PM) { } 194 195 bool addPreISel() override; 196 void addPreRegAlloc() override; 197 void addPreSched2() override; 198 void addPreEmitPass() override; 199 }; 200 201 class GCNPassConfig final : public AMDGPUPassConfig { 202 public: 203 GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) 204 : AMDGPUPassConfig(TM, PM) { } 205 bool addPreISel() override; 206 void addMachineSSAOptimization() override; 207 bool addInstSelector() override; 208 #ifdef LLVM_BUILD_GLOBAL_ISEL 209 bool addIRTranslator() override; 210 bool addRegBankSelect() override; 211 #endif 212 void addFastRegAlloc(FunctionPass *RegAllocPass) override; 213 void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; 214 void addPreRegAlloc() override; 215 void addPreSched2() override; 216 void addPreEmitPass() override; 217 }; 218 219 } // End of anonymous namespace 220 221 TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { 222 return TargetIRAnalysis([this](const Function &F) { 223 return TargetTransformInfo( 224 AMDGPUTTIImpl(this, F.getParent()->getDataLayout())); 225 }); 226 } 227 228 void AMDGPUPassConfig::addIRPasses() { 229 // There is no reason to run these. 230 disablePass(&StackMapLivenessID); 231 disablePass(&FuncletLayoutID); 232 disablePass(&PatchableFunctionID); 233 234 // Function calls are not supported, so make sure we inline everything. 235 addPass(createAMDGPUAlwaysInlinePass()); 236 addPass(createAlwaysInlinerPass()); 237 // We need to add the barrier noop pass, otherwise adding the function 238 // inlining pass will cause all of the PassConfigs passes to be run 239 // one function at a time, which means if we have a nodule with two 240 // functions, then we will generate code for the first function 241 // without ever running any passes on the second. 242 addPass(createBarrierNoopPass()); 243 244 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. 245 addPass(createAMDGPUOpenCLImageTypeLoweringPass()); 246 247 TargetPassConfig::addIRPasses(); 248 } 249 250 void AMDGPUPassConfig::addCodeGenPrepare() { 251 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); 252 const AMDGPUSubtarget &ST = *TM.getSubtargetImpl(); 253 if (TM.getOptLevel() > CodeGenOpt::None && ST.isPromoteAllocaEnabled()) { 254 addPass(createAMDGPUPromoteAlloca(&TM)); 255 addPass(createSROAPass()); 256 } 257 TargetPassConfig::addCodeGenPrepare(); 258 } 259 260 bool 261 AMDGPUPassConfig::addPreISel() { 262 addPass(createFlattenCFGPass()); 263 return false; 264 } 265 266 bool AMDGPUPassConfig::addInstSelector() { 267 addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); 268 return false; 269 } 270 271 bool AMDGPUPassConfig::addGCPasses() { 272 // Do nothing. GC is not supported. 273 return false; 274 } 275 276 //===----------------------------------------------------------------------===// 277 // R600 Pass Setup 278 //===----------------------------------------------------------------------===// 279 280 bool R600PassConfig::addPreISel() { 281 AMDGPUPassConfig::addPreISel(); 282 const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); 283 if (ST.IsIRStructurizerEnabled()) 284 addPass(createStructurizeCFGPass()); 285 addPass(createR600TextureIntrinsicsReplacer()); 286 return false; 287 } 288 289 void R600PassConfig::addPreRegAlloc() { 290 addPass(createR600VectorRegMerger(*TM)); 291 } 292 293 void R600PassConfig::addPreSched2() { 294 const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); 295 addPass(createR600EmitClauseMarkers(), false); 296 if (ST.isIfCvtEnabled()) 297 addPass(&IfConverterID, false); 298 addPass(createR600ClauseMergePass(*TM), false); 299 } 300 301 void R600PassConfig::addPreEmitPass() { 302 addPass(createAMDGPUCFGStructurizerPass(), false); 303 addPass(createR600ExpandSpecialInstrsPass(*TM), false); 304 addPass(&FinalizeMachineBundlesID, false); 305 addPass(createR600Packetizer(*TM), false); 306 addPass(createR600ControlFlowFinalizer(*TM), false); 307 } 308 309 TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { 310 return new R600PassConfig(this, PM); 311 } 312 313 //===----------------------------------------------------------------------===// 314 // GCN Pass Setup 315 //===----------------------------------------------------------------------===// 316 317 bool GCNPassConfig::addPreISel() { 318 AMDGPUPassConfig::addPreISel(); 319 320 // FIXME: We need to run a pass to propagate the attributes when calls are 321 // supported. 322 addPass(&AMDGPUAnnotateKernelFeaturesID); 323 addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions 324 addPass(createSinkingPass()); 325 addPass(createSITypeRewriter()); 326 addPass(createAMDGPUAnnotateUniformValues()); 327 addPass(createSIAnnotateControlFlowPass()); 328 329 return false; 330 } 331 332 void GCNPassConfig::addMachineSSAOptimization() { 333 TargetPassConfig::addMachineSSAOptimization(); 334 335 // We want to fold operands after PeepholeOptimizer has run (or as part of 336 // it), because it will eliminate extra copies making it easier to fold the 337 // real source operand. We want to eliminate dead instructions after, so that 338 // we see fewer uses of the copies. We then need to clean up the dead 339 // instructions leftover after the operands are folded as well. 340 // 341 // XXX - Can we get away without running DeadMachineInstructionElim again? 342 addPass(&SIFoldOperandsID); 343 addPass(&DeadMachineInstructionElimID); 344 } 345 346 bool GCNPassConfig::addInstSelector() { 347 AMDGPUPassConfig::addInstSelector(); 348 addPass(createSILowerI1CopiesPass()); 349 addPass(&SIFixSGPRCopiesID); 350 return false; 351 } 352 353 #ifdef LLVM_BUILD_GLOBAL_ISEL 354 bool GCNPassConfig::addIRTranslator() { 355 addPass(new IRTranslator()); 356 return false; 357 } 358 359 bool GCNPassConfig::addRegBankSelect() { 360 return false; 361 } 362 #endif 363 364 void GCNPassConfig::addPreRegAlloc() { 365 const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); 366 367 // This needs to be run directly before register allocation because 368 // earlier passes might recompute live intervals. 369 // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass 370 if (getOptLevel() > CodeGenOpt::None) { 371 insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); 372 } 373 374 if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { 375 // Don't do this with no optimizations since it throws away debug info by 376 // merging nonadjacent loads. 377 378 // This should be run after scheduling, but before register allocation. It 379 // also need extra copies to the address operand to be eliminated. 380 insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); 381 insertPass(&MachineSchedulerID, &RegisterCoalescerID); 382 } 383 addPass(createSIShrinkInstructionsPass(), false); 384 addPass(createSIWholeQuadModePass()); 385 } 386 387 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { 388 TargetPassConfig::addFastRegAlloc(RegAllocPass); 389 } 390 391 void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { 392 TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); 393 } 394 395 void GCNPassConfig::addPreSched2() { 396 } 397 398 void GCNPassConfig::addPreEmitPass() { 399 400 // The hazard recognizer that runs as part of the post-ra scheduler does not 401 // gaurantee to be able handle all hazards correctly. This is because 402 // if there are multiple scheduling regions in a basic block, the regions 403 // are scheduled bottom up, so when we begin to schedule a region we don't 404 // know what instructions were emitted directly before it. 405 // 406 // Here we add a stand-alone hazard recognizer pass which can handle all cases. 407 // hazard recognizer pass. 408 addPass(&PostRAHazardRecognizerID); 409 410 addPass(createSIInsertWaitsPass()); 411 addPass(createSIShrinkInstructionsPass()); 412 addPass(createSILowerControlFlowPass(), false); 413 addPass(createSIDebuggerInsertNopsPass(), false); 414 } 415 416 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { 417 return new GCNPassConfig(this, PM); 418 } 419