1 //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // Top-level implementation for the NVPTX target. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "NVPTX.h" 15 #include "NVPTXAllocaHoisting.h" 16 #include "NVPTXLowerAggrCopies.h" 17 #include "NVPTXTargetMachine.h" 18 #include "NVPTXTargetObjectFile.h" 19 #include "NVPTXTargetTransformInfo.h" 20 #include "llvm/ADT/STLExtras.h" 21 #include "llvm/ADT/Triple.h" 22 #include "llvm/Analysis/TargetTransformInfo.h" 23 #include "llvm/CodeGen/Passes.h" 24 #include "llvm/CodeGen/TargetPassConfig.h" 25 #include "llvm/IR/LegacyPassManager.h" 26 #include "llvm/Pass.h" 27 #include "llvm/Support/CommandLine.h" 28 #include "llvm/Support/TargetRegistry.h" 29 #include "llvm/Target/TargetMachine.h" 30 #include "llvm/Target/TargetOptions.h" 31 #include "llvm/Transforms/Scalar.h" 32 #include "llvm/Transforms/Scalar/GVN.h" 33 #include "llvm/Transforms/Vectorize.h" 34 #include <cassert> 35 #include <string> 36 37 using namespace llvm; 38 39 // LSV is still relatively new; this switch lets us turn it off in case we 40 // encounter (or suspect) a bug. 
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

namespace llvm {

// Forward declarations of the initializers for the NVPTX-specific passes
// registered in LLVMInitializeNVPTXTarget() below.
void initializeNVVMIntrRangePass(PassRegistry&);
void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);

} // end namespace llvm

/// Entry point invoked by LLVM's target-registration machinery: registers the
/// 32- and 64-bit NVPTX target machines and initializes the NVPTX IR passes
/// with the global pass registry.
extern "C" void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXInferAddressSpacesPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
}

/// Build the NVPTX data-layout string: little-endian ("e"), 32-bit pointers
/// unless targeting the 64-bit variant, 64-bit-aligned i64, v16/v32 vector
/// alignments, and native integer widths of 16/32/64.
static std::string computeDataLayout(bool is64Bit) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";

  Ret += "-i64:64-v16:16-v32:32-n16:32:64";

  return Ret;
}

/// Common constructor for both the 32- and 64-bit target machines.
/// Note: the \p RM argument is accepted for interface compatibility but is
/// deliberately ignored (see the comment on the initializer list).
NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       CodeModel::Model CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options,
                        Reloc::PIC_, CM, OL),
      is64bit(is64bit),
      TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, CPU, FS, *this) {
  // Select the driver interface from the OS component of the triple:
  // NVCL for OpenCL-style drivers, CUDA otherwise.
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

// Out-of-line virtual method anchor to pin the vtable to this file.
void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           CodeModel::Model CM,
                                           CodeGenOpt::Level OL)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

// Out-of-line virtual method anchor to pin the vtable to this file.
void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           CodeModel::Model CM,
                                           CodeGenOpt::Level OL)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

/// Configures the codegen pass pipeline for NVPTX. NVPTX performs no
/// register allocation (createTargetRegisterAllocator() returns null below),
/// so the overrides here both add NVPTX-specific passes and replace or
/// disable standard passes that assume physical registers exist.
class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(this, PM);
}

/// Add NVPTX passes that should run as early as possible in the IR
/// optimization pipeline (NVVM reflection and intrinsic range annotation).
void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
  PM.add(createNVVMReflectPass());
  PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
}

/// Return a per-function TargetTransformInfo analysis backed by NVPTXTTIImpl.
TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(NVPTXTTIImpl(this, F));
  });
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits alloca for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createNVPTXInferAddressSpacesPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunites for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here.  We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing.  But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  addPass(createNVVMReflectPass());

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addStraightLineScalarOptimizationPasses();
  }

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  // On subtargets without image-handle support, replace image handles after
  // instruction selection.
  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass(), false);
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offset and replace frame
    // index with VRFrame register. NVPTXPeephole need to be run after that and
    // will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

/// NVPTX uses virtual registers all the way to PTX emission, so no register
/// allocator is created; returning null tells TargetPassConfig to skip it.
FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

/// Replacement for the -O0 regalloc pipeline: since there is no allocator,
/// only the SSA-lowering passes that normally precede it are scheduled.
void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  assert(!RegAllocPass && "NVPTX uses no regalloc!");
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

/// Replacement for the optimized regalloc pipeline: runs the pre-allocation
/// analyses and cleanups (liveness, PHI elimination, coalescing, scheduling,
/// stack-slot coloring) but no actual allocator.
void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  assert(!RegAllocPass && "NVPTX uses no regalloc!");

  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  //addPass(&PostRAMachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&MachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}