//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <cassert>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug. When set, addIRPasses() skips
// createLoadStoreVectorizerPass().
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
// Escape hatch: when set, the target machine does NOT call
// setRequiresStructuredCFG(true) in its constructor.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

// Feeds computeDataLayout(): shrinks pointers in address spaces 3/4/5
// (const/local/shared) to 32 bits in the data layout string.
static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);

namespace llvm {

// Forward declarations of the legacy-PM initializers for the NVPTX-specific
// passes registered in LLVMInitializeNVPTXTarget() below.
void initializeNVVMIntrRangePass(PassRegistry&);
void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);

} // end namespace llvm

// Entry point called by the target registry when the NVPTX backend is linked
// in: registers both the 32- and 64-bit target machines and the NVPTX IR
// passes.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXAtomicLowerPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
}

/// Build the NVPTX data-layout string.
///
/// Always little-endian ("e"). In 32-bit mode all pointers are 32-bit; in
/// 64-bit mode, pointers default to 64-bit unless \p UseShortPointers is set,
/// in which case address spaces 3, 4 and 5 get 32-bit pointers. The trailing
/// component fixes integer/vector alignments and native integer widths.
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       Optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this) {
  // Driver interface is derived from the OS component of the triple:
  // NVCL for OpenCL-style drivers, CUDA otherwise.
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  // NVPTX normally requires a structured CFG; -disable-nvptx-require-
  // structured-cfg is a transitional escape hatch (see flag above).
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

// Anchor the vtable for the 32-bit subclass in this TU.
void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

// Anchor the vtable for the 64-bit subclass in this TU.
void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

/// Customizes the codegen pass pipeline for NVPTX: IR lowering passes,
/// instruction selection, and a register-allocator-free backend (virtual
/// registers survive to emission).
class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;
  // NVPTX performs no register allocation (createTargetRegisterAllocator
  // returns nullptr), so the assign-and-rewrite hooks must never be reached.
  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

// Legacy pass-manager hook: inject NVVMReflect and NVVMIntrRange as early as
// possible in the IR optimization pipeline.
void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
        PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
        PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
      });
}

// New pass-manager hooks: make "nvvm-reflect" and "nvvm-intr-range" parseable
// in -passes= pipelines, and run NVVMReflect at pipeline start.
void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [](StringRef PassName, FunctionPassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "nvvm-reflect") {
          PM.addPass(NVVMReflectPass());
          return true;
        }
        if (PassName == "nvvm-intr-range") {
          PM.addPass(NVVMIntrRangePass());
          return true;
        }
        return false;
      });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
        // investigate and re-enable.
        // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  // GVN is stronger but more expensive; reserve it for -O3.
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits alloca for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
  addPass(createNVPTXAtomicLowerPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunites for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here.
  // We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing. But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None) {
    addEarlyCSEOrGVNPass();
    // LSV can be disabled via -disable-nvptx-load-store-vectorizer (see flag
    // at the top of this file).
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addPass(createSROAPass());
  }
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  // Without native image-handle support, image references must be rewritten.
  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass());
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offset and replace frame
    // index with VRFrame register. NVPTXPeephole need to be run after that and
    // will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

// With no register allocator, only the SSA-deconstruction passes that normally
// precede regalloc are run.
void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");


  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  //addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}