//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Top-level implementation for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXAtomicLower.h"
#include "NVPTXLowerAggrCopies.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "TargetInfo/NVPTXTargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Pass.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <cassert>
#include <string>

using namespace llvm;

// LSV is still relatively new; this switch lets us turn it off in case we
// encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);
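// Hidden cl::opts remain settable on the command line; an illustrative way to
// flip this switch (triple string assumed for the example):
//   llc -mtriple=nvptx64-nvidia-cuda -disable-nvptx-load-store-vectorizer in.ll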

// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);
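// In the NVPTX addressing model, const, local, and shared correspond to
// address spaces 4, 5, and 3 respectively; computeDataLayout() below narrows
// exactly those pointer widths when this flag is set.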

namespace llvm {

void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);
void initializeGenericToNVVMPass(PassRegistry &);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAtomicLowerPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);

} // end namespace llvm

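// A minimal sketch of how a client reaches this entry point via the
// TargetRegistry (triple string illustrative; error handling elided):
//   LLVMInitializeNVPTXTargetInfo();
//   LLVMInitializeNVPTXTarget();
//   std::string Err;
//   const Target *T = TargetRegistry::lookupTarget("nvptx64-nvidia-cuda", Err);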
LLVMInitializeNVPTXTarget()79 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
80 // Register the target.
81 RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
82 RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());
83
84 // FIXME: This pass is really intended to be invoked during IR optimization,
85 // but it's very NVPTX-specific.
86 PassRegistry &PR = *PassRegistry::getPassRegistry();
87 initializeNVVMReflectPass(PR);
88 initializeNVVMIntrRangePass(PR);
89 initializeGenericToNVVMPass(PR);
90 initializeNVPTXAllocaHoistingPass(PR);
91 initializeNVPTXAssignValidGlobalNamesPass(PR);
92 initializeNVPTXAtomicLowerPass(PR);
93 initializeNVPTXLowerArgsPass(PR);
94 initializeNVPTXLowerAllocaPass(PR);
95 initializeNVPTXLowerAggrCopiesPass(PR);
96 initializeNVPTXProxyRegErasurePass(PR);
97 }
98
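// For reference, the layout strings produced below: the 64-bit default is
//   "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
// 32-bit mode adds "-p:32:32" after the leading "e", and -nvptx-short-ptr
// instead narrows address spaces 3/4/5 via "-p3:32:32-p4:32:32-p5:32:32".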
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
  std::string Ret = "e";

  if (!is64Bit)
    Ret += "-p:32:32";
  else if (UseShortPointers)
    Ret += "-p3:32:32-p4:32:32-p5:32:32";

  Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";

  return Ret;
}

NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       Optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The PIC relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, std::string(CPU), std::string(FS), *this) {
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}

NVPTXTargetMachine::~NVPTXTargetMachine() = default;

void NVPTXTargetMachine32::anchor() {}

NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}

void NVPTXTargetMachine64::anchor() {}

NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}

namespace {

class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignAndRewriteFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignAndRewriteOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace


TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}

void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.addExtension(
      PassManagerBuilder::EP_EarlyAsPossible,
      [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
        PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
        PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
      });
}

void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [](StringRef PassName, FunctionPassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "nvvm-reflect") {
          PM.addPass(NVVMReflectPass());
          return true;
        }
        if (PassName == "nvvm-intr-range") {
          PM.addPass(NVVMIntrRangePass());
          return true;
        }
        return false;
      });
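  // The parsing callback above makes these passes addressable by name in
  // textual new-PM pipelines, e.g. (illustrative invocation):
  //   opt -passes=nvvm-reflect in.ll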

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion()));
        // FIXME: NVVMIntrRangePass is causing numerical discrepancies,
        // investigate and re-enable.
        // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion()));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });
}

TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}

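// A sketch of the IR shape this hook recognizes (value names made up; the
// llvm.nvvm.isspacep.* intrinsics are real):
//   %is.global = call i1 @llvm.nvvm.isspacep.global(i8* %p)
//   br i1 %is.global, label %yes, label %no
// On the %yes edge, InferAddressSpaces may use the (pointer, AS) pair
// returned here to treat %p as pointing into the predicated address space.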
std::pair<const Value *, unsigned>
NVPTXTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::nvvm_isspacep_const:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_CONST);
    case Intrinsic::nvvm_isspacep_global:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_GLOBAL);
    case Intrinsic::nvvm_isspacep_local:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_LOCAL);
    case Intrinsic::nvvm_isspacep_shared:
      return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
    default:
      break;
    }
  }
  return std::make_pair(nullptr, -1);
}

void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits allocas for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
  addPass(createNVPTXAtomicLowerPass());
}

void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
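  // Illustrative SLSR rewrite (value names made up): given
  //   %x = mul i64 %i, %stride
  //   %y = mul i64 %i1, %stride   ; where %i1 = add i64 %i, 1
  // SLSR can rewrite %y as %x + %stride, creating the common subexpressions
  // that the CSE pass below then exploits.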
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here. We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing. But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));
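  // (NVVMReflect folds calls such as __nvvm_reflect("__CUDA_ARCH") into
  // constants, e.g. 700 for sm_70, letting later passes eliminate
  // arch-dependent branches.)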

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  addPass(createAtomicExpandPass());

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None) {
    addEarlyCSEOrGVNPass();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
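    // (The vectorizer above merges adjacent accesses, e.g. four consecutive
    // i32 loads into a single <4 x i32> load, which lowers to one vector PTX
    // instruction such as ld.v4.u32.)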
    addPass(createSROAPass());
  }
}

bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}

void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}

void NVPTXPassConfig::addPostRegAlloc() {
  addPass(createNVPTXPrologEpilogPass());
  if (getOptLevel() != CodeGenOpt::None) {
    // NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
    // indices with the VRFrame register. NVPTXPeephole needs to run after that
    // and will replace VRFrame with VRFrameLocal when possible.
    addPass(createNVPTXPeephole());
  }
}

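// PTX is a virtual-register ISA: the real register allocation happens in
// ptxas, so the backend deliberately keeps virtual registers all the way
// through (hence no allocator here and the llvm_unreachable overrides above).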
FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}

void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}

void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // Pre-RA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");

  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  // addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}

void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-RA tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}