//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"

using namespace llvm;

extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
  RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIFixControlFlowLiveIntervalsPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertNopsPass(*PR);
  initializeSIInsertWaitsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  if (TT.getOS() == Triple::AMDHSA)
    return make_unique<AMDGPUHSATargetObjectFile>();

  return make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static std::string computeDataLayout(const Triple &TT) {
  std::string Ret = "e-p:32:32";

  if (TT.getArch() == Triple::amdgcn) {
    // 32-bit private, local, and region pointers. 64-bit global and constant.
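    // Each "pN:<size>:<abi>" component below gives the pointer size and ABI
    // alignment, in bits, for address space N; the "e-p:32:32" prefix above
    // selects little-endian layout with 32-bit pointers in the default
    // address space.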
88 Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; 89 } 90 91 Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" 92 "-v512:512-v1024:1024-v2048:2048-n32:64"; 93 94 return Ret; 95 } 96 97 LLVM_READNONE 98 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { 99 if (!GPU.empty()) 100 return GPU; 101 102 // HSA only supports CI+, so change the default GPU to a CI for HSA. 103 if (TT.getArch() == Triple::amdgcn) 104 return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti"; 105 106 return ""; 107 } 108 109 AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, 110 StringRef CPU, StringRef FS, 111 TargetOptions Options, Reloc::Model RM, 112 CodeModel::Model CM, 113 CodeGenOpt::Level OptLevel) 114 : LLVMTargetMachine(T, computeDataLayout(TT), TT, 115 getGPUOrDefault(TT, CPU), FS, Options, RM, CM, 116 OptLevel), 117 TLOF(createTLOF(getTargetTriple())), 118 Subtarget(TT, getTargetCPU(), FS, *this), 119 IntrinsicInfo() { 120 setRequiresStructuredCFG(true); 121 initAsmInfo(); 122 } 123 124 AMDGPUTargetMachine::~AMDGPUTargetMachine() { } 125 126 //===----------------------------------------------------------------------===// 127 // R600 Target Machine (R600 -> Cayman) 128 //===----------------------------------------------------------------------===// 129 130 R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, 131 StringRef CPU, StringRef FS, 132 TargetOptions Options, Reloc::Model RM, 133 CodeModel::Model CM, CodeGenOpt::Level OL) 134 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} 135 136 //===----------------------------------------------------------------------===// 137 // GCN Target Machine (SI+) 138 //===----------------------------------------------------------------------===// 139 140 GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, 141 StringRef CPU, StringRef FS, 142 TargetOptions Options, Reloc::Model RM, 143 CodeModel::Model CM, CodeGenOpt::Level OL) 144 : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} 145 146 //===----------------------------------------------------------------------===// 147 // AMDGPU Pass Setup 148 //===----------------------------------------------------------------------===// 149 150 namespace { 151 152 cl::opt<bool> InsertNops( 153 "amdgpu-insert-nops", 154 cl::desc("Insert two nop instructions for each high level source statement"), 155 cl::init(false)); 156 157 class AMDGPUPassConfig : public TargetPassConfig { 158 public: 159 AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) 160 : TargetPassConfig(TM, PM) { 161 162 // Exceptions and StackMaps are not supported, so these passes will never do 163 // anything. 
164 disablePass(&StackMapLivenessID); 165 disablePass(&FuncletLayoutID); 166 } 167 168 AMDGPUTargetMachine &getAMDGPUTargetMachine() const { 169 return getTM<AMDGPUTargetMachine>(); 170 } 171 172 ScheduleDAGInstrs * 173 createMachineScheduler(MachineSchedContext *C) const override { 174 const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); 175 if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) 176 return createR600MachineScheduler(C); 177 else if (ST.enableSIScheduler()) 178 return createSIMachineScheduler(C); 179 return nullptr; 180 } 181 182 void addIRPasses() override; 183 void addCodeGenPrepare() override; 184 bool addPreISel() override; 185 bool addInstSelector() override; 186 bool addGCPasses() override; 187 }; 188 189 class R600PassConfig final : public AMDGPUPassConfig { 190 public: 191 R600PassConfig(TargetMachine *TM, PassManagerBase &PM) 192 : AMDGPUPassConfig(TM, PM) { } 193 194 bool addPreISel() override; 195 void addPreRegAlloc() override; 196 void addPreSched2() override; 197 void addPreEmitPass() override; 198 }; 199 200 class GCNPassConfig final : public AMDGPUPassConfig { 201 public: 202 GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) 203 : AMDGPUPassConfig(TM, PM) { } 204 bool addPreISel() override; 205 void addMachineSSAOptimization() override; 206 bool addInstSelector() override; 207 #ifdef LLVM_BUILD_GLOBAL_ISEL 208 bool addIRTranslator() override; 209 bool addRegBankSelect() override; 210 #endif 211 void addFastRegAlloc(FunctionPass *RegAllocPass) override; 212 void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; 213 void addPreRegAlloc() override; 214 void addPostRegAlloc() override; 215 void addPreSched2() override; 216 void addPreEmitPass() override; 217 }; 218 219 } // End of anonymous namespace 220 221 TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { 222 return TargetIRAnalysis([this](const Function &F) { 223 return TargetTransformInfo( 224 AMDGPUTTIImpl(this, F.getParent()->getDataLayout())); 225 }); 226 } 227 228 void AMDGPUPassConfig::addIRPasses() { 229 // Function calls are not supported, so make sure we inline everything. 230 addPass(createAMDGPUAlwaysInlinePass()); 231 addPass(createAlwaysInlinerPass()); 232 // We need to add the barrier noop pass, otherwise adding the function 233 // inlining pass will cause all of the PassConfigs passes to be run 234 // one function at a time, which means if we have a nodule with two 235 // functions, then we will generate code for the first function 236 // without ever running any passes on the second. 237 addPass(createBarrierNoopPass()); 238 239 // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. 240 addPass(createAMDGPUOpenCLImageTypeLoweringPass()); 241 242 TargetPassConfig::addIRPasses(); 243 } 244 245 void AMDGPUPassConfig::addCodeGenPrepare() { 246 const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); 247 const AMDGPUSubtarget &ST = *TM.getSubtargetImpl(); 248 if (TM.getOptLevel() > CodeGenOpt::None && ST.isPromoteAllocaEnabled()) { 249 addPass(createAMDGPUPromoteAlloca(&TM)); 250 addPass(createSROAPass()); 251 } 252 TargetPassConfig::addCodeGenPrepare(); 253 } 254 255 bool 256 AMDGPUPassConfig::addPreISel() { 257 addPass(createFlattenCFGPass()); 258 return false; 259 } 260 261 bool AMDGPUPassConfig::addInstSelector() { 262 addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); 263 return false; 264 } 265 266 bool AMDGPUPassConfig::addGCPasses() { 267 // Do nothing. GC is not supported. 
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();
  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
  if (ST.IsIRStructurizerEnabled())
    addPass(createStructurizeCFGPass());
  addPass(createR600TextureIntrinsicsReplacer());
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger(*TM));
}

void R600PassConfig::addPreSched2() {
  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
  addPass(createR600EmitClauseMarkers(), false);
  if (ST.isIfCvtEnabled())
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(*TM), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(*TM), false);
  addPass(createR600ControlFlowFinalizer(*TM), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(&AMDGPUAnnotateKernelFeaturesID);
  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  addPass(createSinkingPass());
  addPass(createSITypeRewriter());
  addPass(createAMDGPUAnnotateUniformValues());
  addPass(createSIAnnotateControlFlowPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  return false;
}
#endif

void GCNPassConfig::addPreRegAlloc() {
  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();

  // This needs to be run directly before register allocation because
  // earlier passes might recompute live intervals.
  // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the
  // pass.
  if (getOptLevel() > CodeGenOpt::None) {
    insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
  }

  if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
    // Don't do this with no optimizations since it throws away debug info by
    // merging nonadjacent loads.

    // This should be run after scheduling, but before register allocation. It
    // also needs extra copies to the address operand to be eliminated.
    insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
    insertPass(&MachineSchedulerID, &RegisterCoalescerID);
  }
  addPass(createSIShrinkInstructionsPass(), false);
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(createSIShrinkInstructionsPass(), false);
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIInsertWaitsPass(), false);
  addPass(createSILowerControlFlowPass(), false);
  if (InsertNops) {
    addPass(createSIInsertNopsPass(), false);
  }
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(this, PM);
}