1 //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Top-level implementation for the NVPTX target. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "NVPTXTargetMachine.h" 14 #include "NVPTX.h" 15 #include "NVPTXAllocaHoisting.h" 16 #include "NVPTXAtomicLower.h" 17 #include "NVPTXLowerAggrCopies.h" 18 #include "NVPTXTargetObjectFile.h" 19 #include "NVPTXTargetTransformInfo.h" 20 #include "TargetInfo/NVPTXTargetInfo.h" 21 #include "llvm/ADT/STLExtras.h" 22 #include "llvm/ADT/Triple.h" 23 #include "llvm/Analysis/TargetTransformInfo.h" 24 #include "llvm/CodeGen/Passes.h" 25 #include "llvm/CodeGen/TargetPassConfig.h" 26 #include "llvm/IR/LegacyPassManager.h" 27 #include "llvm/Pass.h" 28 #include "llvm/Passes/PassBuilder.h" 29 #include "llvm/Support/CommandLine.h" 30 #include "llvm/Support/TargetRegistry.h" 31 #include "llvm/Target/TargetMachine.h" 32 #include "llvm/Target/TargetOptions.h" 33 #include "llvm/Transforms/IPO/PassManagerBuilder.h" 34 #include "llvm/Transforms/Scalar.h" 35 #include "llvm/Transforms/Scalar/GVN.h" 36 #include "llvm/Transforms/Vectorize.h" 37 #include <cassert> 38 #include <string> 39 40 using namespace llvm; 41 42 // LSV is still relatively new; this switch lets us turn it off in case we 43 // encounter (or suspect) a bug. 44 static cl::opt<bool> 45 DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer", 46 cl::desc("Disable load/store vectorizer"), 47 cl::init(false), cl::Hidden); 48 49 // TODO: Remove this flag when we are confident with no regressions. 50 static cl::opt<bool> DisableRequireStructuredCFG( 51 "disable-nvptx-require-structured-cfg", 52 cl::desc("Transitional flag to turn off NVPTX's requirement on preserving " 53 "structured CFG. The requirement should be disabled only when " 54 "unexpected regressions happen."), 55 cl::init(false), cl::Hidden); 56 57 static cl::opt<bool> UseShortPointersOpt( 58 "nvptx-short-ptr", 59 cl::desc( 60 "Use 32-bit pointers for accessing const/local/shared address spaces."), 61 cl::init(false), cl::Hidden); 62 63 namespace llvm { 64 65 void initializeNVVMIntrRangePass(PassRegistry&); 66 void initializeNVVMReflectPass(PassRegistry&); 67 void initializeGenericToNVVMPass(PassRegistry&); 68 void initializeNVPTXAllocaHoistingPass(PassRegistry &); 69 void initializeNVPTXAtomicLowerPass(PassRegistry &); 70 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&); 71 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); 72 void initializeNVPTXLowerArgsPass(PassRegistry &); 73 void initializeNVPTXLowerAllocaPass(PassRegistry &); 74 void initializeNVPTXProxyRegErasurePass(PassRegistry &); 75 76 } // end namespace llvm 77 78 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { 79 // Register the target. 80 RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32()); 81 RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64()); 82 83 // FIXME: This pass is really intended to be invoked during IR optimization, 84 // but it's very NVPTX-specific. 85 PassRegistry &PR = *PassRegistry::getPassRegistry(); 86 initializeNVVMReflectPass(PR); 87 initializeNVVMIntrRangePass(PR); 88 initializeGenericToNVVMPass(PR); 89 initializeNVPTXAllocaHoistingPass(PR); 90 initializeNVPTXAssignValidGlobalNamesPass(PR); 91 initializeNVPTXAtomicLowerPass(PR); 92 initializeNVPTXLowerArgsPass(PR); 93 initializeNVPTXLowerAllocaPass(PR); 94 initializeNVPTXLowerAggrCopiesPass(PR); 95 initializeNVPTXProxyRegErasurePass(PR); 96 } 97 98 static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { 99 std::string Ret = "e"; 100 101 if (!is64Bit) 102 Ret += "-p:32:32"; 103 else if (UseShortPointers) 104 Ret += "-p3:32:32-p4:32:32-p5:32:32"; 105 106 Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; 107 108 return Ret; 109 } 110 111 NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT, 112 StringRef CPU, StringRef FS, 113 const TargetOptions &Options, 114 Optional<Reloc::Model> RM, 115 Optional<CodeModel::Model> CM, 116 CodeGenOpt::Level OL, bool is64bit) 117 // The pic relocation model is used regardless of what the client has 118 // specified, as it is the only relocation model currently supported. 119 : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT, 120 CPU, FS, Options, Reloc::PIC_, 121 getEffectiveCodeModel(CM, CodeModel::Small), OL), 122 is64bit(is64bit), UseShortPointers(UseShortPointersOpt), 123 TLOF(std::make_unique<NVPTXTargetObjectFile>()), 124 Subtarget(TT, std::string(CPU), std::string(FS), *this) { 125 if (TT.getOS() == Triple::NVCL) 126 drvInterface = NVPTX::NVCL; 127 else 128 drvInterface = NVPTX::CUDA; 129 if (!DisableRequireStructuredCFG) 130 setRequiresStructuredCFG(true); 131 initAsmInfo(); 132 } 133 134 NVPTXTargetMachine::~NVPTXTargetMachine() = default; 135 136 void NVPTXTargetMachine32::anchor() {} 137 138 NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT, 139 StringRef CPU, StringRef FS, 140 const TargetOptions &Options, 141 Optional<Reloc::Model> RM, 142 Optional<CodeModel::Model> CM, 143 CodeGenOpt::Level OL, bool JIT) 144 : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} 145 146 void NVPTXTargetMachine64::anchor() {} 147 148 NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT, 149 StringRef CPU, StringRef FS, 150 const TargetOptions &Options, 151 Optional<Reloc::Model> RM, 152 Optional<CodeModel::Model> CM, 153 CodeGenOpt::Level OL, bool JIT) 154 : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} 155 156 namespace { 157 158 class NVPTXPassConfig : public TargetPassConfig { 159 public: 160 NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM) 161 : TargetPassConfig(TM, PM) {} 162 163 NVPTXTargetMachine &getNVPTXTargetMachine() const { 164 return getTM<NVPTXTargetMachine>(); 165 } 166 167 void addIRPasses() override; 168 bool addInstSelector() override; 169 void addPreRegAlloc() override; 170 void addPostRegAlloc() override; 171 void addMachineSSAOptimization() override; 172 173 FunctionPass *createTargetRegisterAllocator(bool) override; 174 void addFastRegAlloc() override; 175 void addOptimizedRegAlloc() override; 176 177 bool addRegAssignAndRewriteFast() override { 178 llvm_unreachable("should not be used"); 179 } 180 181 bool addRegAssignAndRewriteOptimized() override { 182 llvm_unreachable("should not be used"); 183 } 184 185 private: 186 // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This 187 // function is only called in opt mode. 188 void addEarlyCSEOrGVNPass(); 189 190 // Add passes that propagate special memory spaces. 191 void addAddressSpaceInferencePasses(); 192 193 // Add passes that perform straight-line scalar optimizations. 194 void addStraightLineScalarOptimizationPasses(); 195 }; 196 197 } // end anonymous namespace 198 199 TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { 200 return new NVPTXPassConfig(*this, PM); 201 } 202 203 void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { 204 Builder.addExtension( 205 PassManagerBuilder::EP_EarlyAsPossible, 206 [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) { 207 PM.add(createNVVMReflectPass(Subtarget.getSmVersion())); 208 PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion())); 209 }); 210 } 211 212 void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB, 213 bool DebugPassManager) { 214 PB.registerPipelineParsingCallback( 215 [](StringRef PassName, FunctionPassManager &PM, 216 ArrayRef<PassBuilder::PipelineElement>) { 217 if (PassName == "nvvm-reflect") { 218 PM.addPass(NVVMReflectPass()); 219 return true; 220 } 221 if (PassName == "nvvm-intr-range") { 222 PM.addPass(NVVMIntrRangePass()); 223 return true; 224 } 225 return false; 226 }); 227 228 PB.registerPipelineStartEPCallback( 229 [this, DebugPassManager](ModulePassManager &PM, 230 PassBuilder::OptimizationLevel Level) { 231 FunctionPassManager FPM(DebugPassManager); 232 FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); 233 // FIXME: NVVMIntrRangePass is causing numerical discrepancies, 234 // investigate and re-enable. 235 // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion())); 236 PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 237 }); 238 } 239 240 TargetTransformInfo 241 NVPTXTargetMachine::getTargetTransformInfo(const Function &F) { 242 return TargetTransformInfo(NVPTXTTIImpl(this, F)); 243 } 244 245 void NVPTXPassConfig::addEarlyCSEOrGVNPass() { 246 if (getOptLevel() == CodeGenOpt::Aggressive) 247 addPass(createGVNPass()); 248 else 249 addPass(createEarlyCSEPass()); 250 } 251 252 void NVPTXPassConfig::addAddressSpaceInferencePasses() { 253 // NVPTXLowerArgs emits alloca for byval parameters which can often 254 // be eliminated by SROA. 255 addPass(createSROAPass()); 256 addPass(createNVPTXLowerAllocaPass()); 257 addPass(createInferAddressSpacesPass()); 258 addPass(createNVPTXAtomicLowerPass()); 259 } 260 261 void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() { 262 addPass(createSeparateConstOffsetFromGEPPass()); 263 addPass(createSpeculativeExecutionPass()); 264 // ReassociateGEPs exposes more opportunites for SLSR. See 265 // the example in reassociate-geps-and-slsr.ll. 266 addPass(createStraightLineStrengthReducePass()); 267 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or 268 // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE 269 // for some of our benchmarks. 270 addEarlyCSEOrGVNPass(); 271 // Run NaryReassociate after EarlyCSE/GVN to be more effective. 272 addPass(createNaryReassociatePass()); 273 // NaryReassociate on GEPs creates redundant common expressions, so run 274 // EarlyCSE after it. 275 addPass(createEarlyCSEPass()); 276 } 277 278 void NVPTXPassConfig::addIRPasses() { 279 // The following passes are known to not play well with virtual regs hanging 280 // around after register allocation (which in our case, is *all* registers). 281 // We explicitly disable them here. We do, however, need some functionality 282 // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the 283 // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). 284 disablePass(&PrologEpilogCodeInserterID); 285 disablePass(&MachineCopyPropagationID); 286 disablePass(&TailDuplicateID); 287 disablePass(&StackMapLivenessID); 288 disablePass(&LiveDebugValuesID); 289 disablePass(&PostRAMachineSinkingID); 290 disablePass(&PostRASchedulerID); 291 disablePass(&FuncletLayoutID); 292 disablePass(&PatchableFunctionID); 293 disablePass(&ShrinkWrapID); 294 295 // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running 296 // it here does nothing. But since we need it for correctness when lowering 297 // to NVPTX, run it here too, in case whoever built our pass pipeline didn't 298 // call addEarlyAsPossiblePasses. 299 const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl(); 300 addPass(createNVVMReflectPass(ST.getSmVersion())); 301 302 if (getOptLevel() != CodeGenOpt::None) 303 addPass(createNVPTXImageOptimizerPass()); 304 addPass(createNVPTXAssignValidGlobalNamesPass()); 305 addPass(createGenericToNVVMPass()); 306 307 // NVPTXLowerArgs is required for correctness and should be run right 308 // before the address space inference passes. 309 addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine())); 310 if (getOptLevel() != CodeGenOpt::None) { 311 addAddressSpaceInferencePasses(); 312 addStraightLineScalarOptimizationPasses(); 313 } 314 315 // === LSR and other generic IR passes === 316 TargetPassConfig::addIRPasses(); 317 // EarlyCSE is not always strong enough to clean up what LSR produces. For 318 // example, GVN can combine 319 // 320 // %0 = add %a, %b 321 // %1 = add %b, %a 322 // 323 // and 324 // 325 // %0 = shl nsw %a, 2 326 // %1 = shl %a, 2 327 // 328 // but EarlyCSE can do neither of them. 329 if (getOptLevel() != CodeGenOpt::None) { 330 addEarlyCSEOrGVNPass(); 331 if (!DisableLoadStoreVectorizer) 332 addPass(createLoadStoreVectorizerPass()); 333 } 334 } 335 336 bool NVPTXPassConfig::addInstSelector() { 337 const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl(); 338 339 addPass(createLowerAggrCopies()); 340 addPass(createAllocaHoisting()); 341 addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel())); 342 343 if (!ST.hasImageHandles()) 344 addPass(createNVPTXReplaceImageHandlesPass()); 345 346 return false; 347 } 348 349 void NVPTXPassConfig::addPreRegAlloc() { 350 // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive. 351 addPass(createNVPTXProxyRegErasurePass()); 352 } 353 354 void NVPTXPassConfig::addPostRegAlloc() { 355 addPass(createNVPTXPrologEpilogPass(), false); 356 if (getOptLevel() != CodeGenOpt::None) { 357 // NVPTXPrologEpilogPass calculates frame object offset and replace frame 358 // index with VRFrame register. NVPTXPeephole need to be run after that and 359 // will replace VRFrame with VRFrameLocal when possible. 360 addPass(createNVPTXPeephole()); 361 } 362 } 363 364 FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) { 365 return nullptr; // No reg alloc 366 } 367 368 void NVPTXPassConfig::addFastRegAlloc() { 369 addPass(&PHIEliminationID); 370 addPass(&TwoAddressInstructionPassID); 371 } 372 373 void NVPTXPassConfig::addOptimizedRegAlloc() { 374 addPass(&ProcessImplicitDefsID); 375 addPass(&LiveVariablesID); 376 addPass(&MachineLoopInfoID); 377 addPass(&PHIEliminationID); 378 379 addPass(&TwoAddressInstructionPassID); 380 addPass(&RegisterCoalescerID); 381 382 // PreRA instruction scheduling. 383 if (addPass(&MachineSchedulerID)) 384 printAndVerify("After Machine Scheduling"); 385 386 387 addPass(&StackSlotColoringID); 388 389 // FIXME: Needs physical registers 390 //addPass(&MachineLICMID); 391 392 printAndVerify("After StackSlotColoring"); 393 } 394 395 void NVPTXPassConfig::addMachineSSAOptimization() { 396 // Pre-ra tail duplication. 397 if (addPass(&EarlyTailDuplicateID)) 398 printAndVerify("After Pre-RegAlloc TailDuplicate"); 399 400 // Optimize PHIs before DCE: removing dead PHI cycles may make more 401 // instructions dead. 402 addPass(&OptimizePHIsID); 403 404 // This pass merges large allocas. StackSlotColoring is a different pass 405 // which merges spill slots. 406 addPass(&StackColoringID); 407 408 // If the target requests it, assign local variables to stack slots relative 409 // to one another and simplify frame index references where possible. 410 addPass(&LocalStackSlotAllocationID); 411 412 // With optimization, dead code should already be eliminated. However 413 // there is one known exception: lowered code for arguments that are only 414 // used by tail calls, where the tail calls reuse the incoming stack 415 // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). 416 addPass(&DeadMachineInstructionElimID); 417 printAndVerify("After codegen DCE pass"); 418 419 // Allow targets to insert passes that improve instruction level parallelism, 420 // like if-conversion. Such passes will typically need dominator trees and 421 // loop info, just like LICM and CSE below. 422 if (addILPOpts()) 423 printAndVerify("After ILP optimizations"); 424 425 addPass(&EarlyMachineLICMID); 426 addPass(&MachineCSEID); 427 428 addPass(&MachineSinkingID); 429 printAndVerify("After Machine LICM, CSE and Sinking passes"); 430 431 addPass(&PeepholeOptimizerID); 432 printAndVerify("After codegen peephole optimization pass"); 433 } 434