//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <cassert>
#include <string>

using namespace llvm;

// LSV (the LoadStoreVectorizer) is still relatively new; this switch lets us
// turn it off in case we encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);

namespace llvm {

// Forward declarations of the NVPTX pass initializers so that
// LLVMInitializeNVPTXTarget (below) can register them with the PassRegistry
// without pulling in each pass's header.
void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);
void initializeGenericToNVVMPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);

} // end namespace llvm

/// Target-registry entry point: registers the 32-bit and 64-bit NVPTX target
/// machines and the NVPTX-specific IR passes listed above.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
  // Register the target.
  RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
  RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());

  // FIXME: This pass is really intended to be invoked during IR optimization,
  // but it's very NVPTX-specific.
  PassRegistry &PR = *PassRegistry::getPassRegistry();
  initializeNVVMReflectPass(PR);
  initializeNVVMIntrRangePass(PR);
  initializeGenericToNVVMPass(PR);
  initializeNVPTXAllocaHoistingPass(PR);
  initializeNVPTXAssignValidGlobalNamesPass(PR);
  initializeNVPTXLowerArgsPass(PR);
  initializeNVPTXLowerAllocaPass(PR);
  initializeNVPTXLowerAggrCopiesPass(PR);
  initializeNVPTXProxyRegErasurePass(PR);
}

/// Build the LLVM data-layout string for NVPTX.
///
/// \param is64Bit          true for the 64-bit target; when false, all
///                         pointers are 32-bit ("-p:32:32").
/// \param UseShortPointers when true (64-bit only), address spaces 3, 4, and 5
///                         use 32-bit pointers (the const/local/shared address
///                         spaces; see the -nvptx-short-ptr flag above).
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e"; // Little endian.

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       Optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    // Note that RM is accepted for interface compatibility but ignored.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this) {
  // Select the driver interface (NVCL vs. CUDA) from the OS component of the
  // target triple.
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  // NVPTX requires a structured CFG unless the transitional escape-hatch flag
  // -disable-nvptx-require-structured-cfg is set.
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

// 32-bit variant: forwards to the common constructor with is64bit = false.
NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

// 64-bit variant: forwards to the common constructor with is64bit = true.
NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

/// Customizes the codegen pass pipeline for NVPTX.  NVPTX performs no real
/// register allocation (createTargetRegisterAllocator returns nullptr below),
/// so several standard post-RA passes are disabled or replaced.
class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  // With no register allocator, the assign-and-rewrite hooks must never be
  // reached.
  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace

TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

/// Legacy pass-manager hook: schedules NVVMReflect and NVVMIntrRange as early
/// as possible in the optimization pipeline.
void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
        PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
        PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
      });
}

/// New pass-manager hook: makes "nvvm-reflect" and "nvvm-intr-range" parseable
/// in -passes= pipeline strings, and schedules NVVMReflect at the start of the
/// default pipeline.
void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
                                                      bool DebugPassManager) {
  PB.registerPipelineParsingCallback(
      [](StringRef PassName, FunctionPassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "nvvm-reflect") {
          PM.addPass(NVVMReflectPass());
          return true;
        }
        if (PassName == "nvvm-intr-range") {
          PM.addPass(NVVMIntrRangePass());
          return true;
        }
        return false;
      });

  PB.registerPipelineStartEPCallback(
      [this, DebugPassManager](ModulePassManager &PM,
                               PassBuilder::OptimizationLevel Level) {
        FunctionPassManager FPM(DebugPassManager);
        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
        // investigate and re-enable.
        // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  // GVN is the stronger (and more expensive) cleanup; reserve it for
  // aggressive optimization levels.
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits alloca for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here.  We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing. But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
  }
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  // NOTE(review): presumably NVPTXReplaceImageHandles rewrites image accesses
  // for subtargets without native image-handle support — confirm against
  // NVPTXReplaceImageHandles.cpp.
  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass(), false);
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offset and replace frame
    // index with VRFrame register. NVPTXPeephole need to be run after that and
    // will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

// Fast (-O0) path: no register allocator, so only run the passes needed to
// get out of SSA form.
void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

// Optimized path: same idea as addFastRegAlloc, plus the pre-RA analysis and
// optimization passes that are still useful without a real allocator.
void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  // addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}