//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <cassert>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);

namespace llvm {

// Forward declarations of the pass initializers registered in
// LLVMInitializeNVPTXTarget() below; the definitions live in the
// corresponding NVPTX pass implementation files.
void initializeNVVMIntrRangePass(PassRegistry&);
void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);

} // end namespace llvm

// Entry point called by the target registry machinery; registers the two
// NVPTX target machines (32- and 64-bit) and the NVPTX-specific IR passes.
extern "C" void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
}

// Builds the data-layout string for the target. Pointers are 32-bit in
// 32-bit mode; in 64-bit mode with UseShortPointers, only address spaces
// 3/4/5 get 32-bit pointers (per the -nvptx-short-ptr flag description,
// these are the const/local/shared spaces). The target is little-endian
// ("e") with natural alignment for i64/i128 and small vectors.
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       Optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, CPU, FS, *this) {
  // The driver interface (NVCL vs. CUDA) is selected from the triple's OS.
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  // NVPTX requires a structured CFG unless the transitional escape-hatch
  // flag above is set.
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

// Anchor the vtable for the 32-bit target machine in this file.
void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

// Anchor the vtable for the 64-bit target machine in this file.
void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

// Configures the NVPTX codegen pass pipeline. Note that NVPTX never runs
// a real register allocator (see createTargetRegisterAllocator below), so
// several standard post-RA passes are disabled or replaced.
class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  // Register assignment never happens for NVPTX (no register allocator is
  // created), so these hooks must not be reached.
  bool addRegAssignmentFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignmentOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

// Hook NVPTX-specific IR passes into the (legacy) optimizer pipeline as
// early as possible, so later generic optimizations see their results.
void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
        PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
        PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
      });
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits alloca for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here.  We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing.  But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addStraightLineScalarOptimizationPasses();
  }

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  // On subtargets without native image handles, replace handle uses with
  // direct references (see NVPTXReplaceImageHandles).
  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass(), false);
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offset and replace frame
    // index with VRFrame register. NVPTXPeephole need to be run after that and
    // will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

// Returning null tells the generic pass config that NVPTX performs no
// register allocation: virtual registers survive into the emitted PTX.
FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

// With no register allocator, only run the SSA-elimination passes that the
// fast-regalloc path would otherwise have run before allocation.
void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

// Optimized path: run the standard pre-allocation machine passes (PHI
// elimination, coalescing, scheduling, stack slot coloring) but no actual
// register assignment.
void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");


  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  //addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}