1 //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Top-level implementation for the NVPTX target.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "NVPTXTargetMachine.h"
14 #include "NVPTX.h"
15 #include "NVPTXAllocaHoisting.h"
16 #include "NVPTXLowerAggrCopies.h"
17 #include "NVPTXTargetObjectFile.h"
18 #include "NVPTXTargetTransformInfo.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/Triple.h"
21 #include "llvm/Analysis/TargetTransformInfo.h"
22 #include "llvm/CodeGen/Passes.h"
23 #include "llvm/CodeGen/TargetPassConfig.h"
24 #include "llvm/IR/LegacyPassManager.h"
25 #include "llvm/Pass.h"
26 #include "llvm/Support/CommandLine.h"
27 #include "llvm/Support/TargetRegistry.h"
28 #include "llvm/Target/TargetMachine.h"
29 #include "llvm/Target/TargetOptions.h"
30 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
31 #include "llvm/Transforms/Scalar.h"
32 #include "llvm/Transforms/Scalar/GVN.h"
33 #include "llvm/Transforms/Vectorize.h"
34 #include <cassert>
35 #include <string>
36 
37 using namespace llvm;
38 
// LSV (the LoadStoreVectorizer) is still relatively new; this switch lets us
// turn it off in case we encounter (or suspect) a bug.
static cl::opt<bool>
    DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
                               cl::desc("Disable load/store vectorizer"),
                               cl::init(false), cl::Hidden);

// Escape hatch for the structured-CFG requirement set in the
// NVPTXTargetMachine constructor below.
// TODO: Remove this flag when we are confident with no regressions.
static cl::opt<bool> DisableRequireStructuredCFG(
    "disable-nvptx-require-structured-cfg",
    cl::desc("Transitional flag to turn off NVPTX's requirement on preserving "
             "structured CFG. The requirement should be disabled only when "
             "unexpected regressions happen."),
    cl::init(false), cl::Hidden);

// Read by computeDataLayout (via the NVPTXTargetMachine constructor) to shrink
// pointers in address spaces 3, 4 and 5 to 32 bits on 64-bit targets.
static cl::opt<bool> UseShortPointersOpt(
    "nvptx-short-ptr",
    cl::desc(
        "Use 32-bit pointers for accessing const/local/shared address spaces."),
    cl::init(false), cl::Hidden);
59 
namespace llvm {

// Forward declarations of the NVPTX pass initializers, which are defined in
// the respective pass implementation files. They are invoked from
// LLVMInitializeNVPTXTarget below so the passes are known to the registry.
void initializeNVVMIntrRangePass(PassRegistry&);
void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);

} // end namespace llvm
73 
74 extern "C" void LLVMInitializeNVPTXTarget() {
75   // Register the target.
76   RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
77   RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());
78 
79   // FIXME: This pass is really intended to be invoked during IR optimization,
80   // but it's very NVPTX-specific.
81   PassRegistry &PR = *PassRegistry::getPassRegistry();
82   initializeNVVMReflectPass(PR);
83   initializeNVVMIntrRangePass(PR);
84   initializeGenericToNVVMPass(PR);
85   initializeNVPTXAllocaHoistingPass(PR);
86   initializeNVPTXAssignValidGlobalNamesPass(PR);
87   initializeNVPTXLowerArgsPass(PR);
88   initializeNVPTXLowerAllocaPass(PR);
89   initializeNVPTXLowerAggrCopiesPass(PR);
90   initializeNVPTXProxyRegErasurePass(PR);
91 }
92 
93 static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
94   std::string Ret = "e";
95 
96   if (!is64Bit)
97     Ret += "-p:32:32";
98   else if (UseShortPointers)
99     Ret += "-p3:32:32-p4:32:32-p5:32:32";
100 
101   Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
102 
103   return Ret;
104 }
105 
// Construct an NVPTX target machine. The caller-provided relocation model
// (RM) is deliberately ignored (see comment below); the code model defaults
// to Small when none is given.
NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                                       StringRef CPU, StringRef FS,
                                       const TargetOptions &Options,
                                       Optional<Reloc::Model> RM,
                                       Optional<CodeModel::Model> CM,
                                       CodeGenOpt::Level OL, bool is64bit)
    // The pic relocation model is used regardless of what the client has
    // specified, as it is the only relocation model currently supported.
    : LLVMTargetMachine(T, computeDataLayout(is64bit, UseShortPointersOpt), TT,
                        CPU, FS, Options, Reloc::PIC_,
                        getEffectiveCodeModel(CM, CodeModel::Small), OL),
      is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
      TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
      Subtarget(TT, CPU, FS, *this) {
  // The driver interface is keyed off the OS component of the triple:
  // NVCL for OpenCL-style drivers, CUDA for everything else.
  if (TT.getOS() == Triple::NVCL)
    drvInterface = NVPTX::NVCL;
  else
    drvInterface = NVPTX::CUDA;
  // Require a structured CFG unless the transitional
  // -disable-nvptx-require-structured-cfg escape hatch is set.
  if (!DisableRequireStructuredCFG)
    setRequiresStructuredCFG(true);
  initAsmInfo();
}
128 
// Defaulted out of line — presumably so the unique_ptr<NVPTXTargetObjectFile>
// member can be destroyed where the type is complete.
NVPTXTargetMachine::~NVPTXTargetMachine() = default;
130 
// Out-of-line anchor to pin the vtable to this translation unit.
void NVPTXTargetMachine32::anchor() {}

// 32-bit flavor: forwards to the common constructor with is64bit = false.
// The JIT flag is unused by NVPTX.
NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
140 
// Out-of-line anchor to pin the vtable to this translation unit.
void NVPTXTargetMachine64::anchor() {}

// 64-bit flavor: forwards to the common constructor with is64bit = true.
// The JIT flag is unused by NVPTX.
NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
                                           StringRef CPU, StringRef FS,
                                           const TargetOptions &Options,
                                           Optional<Reloc::Model> RM,
                                           Optional<CodeModel::Model> CM,
                                           CodeGenOpt::Level OL, bool JIT)
    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
150 
namespace {

/// Codegen pass pipeline configuration for NVPTX. NVPTX emits code with
/// virtual registers still live (PTX is a virtual ISA), so the register
/// allocation hooks are stubbed out below and several late machine passes
/// are disabled in addIRPasses.
class NVPTXPassConfig : public TargetPassConfig {
public:
  NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  NVPTXTargetMachine &getNVPTXTargetMachine() const {
    return getTM<NVPTXTargetMachine>();
  }

  void addIRPasses() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addMachineSSAOptimization() override;

  // createTargetRegisterAllocator returns nullptr (no register allocation
  // happens), so the register-assignment hooks below must never be reached.
  FunctionPass *createTargetRegisterAllocator(bool) override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  bool addRegAssignmentFast() override {
    llvm_unreachable("should not be used");
  }

  bool addRegAssignmentOptimized() override {
    llvm_unreachable("should not be used");
  }

private:
  // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
  // function is only called in opt mode.
  void addEarlyCSEOrGVNPass();

  // Add passes that propagate special memory spaces.
  void addAddressSpaceInferencePasses();

  // Add passes that perform straight-line scalar optimizations.
  void addStraightLineScalarOptimizationPasses();
};

} // end anonymous namespace
193 
// Hand codegen pipeline construction over to the NVPTX-specific pass config.
// Ownership of the returned object passes to the caller.
TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new NVPTXPassConfig(*this, PM);
}
197 
198 void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
199   Builder.addExtension(
200     PassManagerBuilder::EP_EarlyAsPossible,
201     [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
202       PM.add(createNVVMReflectPass(Subtarget.getSmVersion()));
203       PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
204     });
205 }
206 
// Produce a TargetTransformInfo backed by the NVPTX-specific TTI
// implementation for the given function.
TargetTransformInfo
NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(NVPTXTTIImpl(this, F));
}
211 
212 void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
213   if (getOptLevel() == CodeGenOpt::Aggressive)
214     addPass(createGVNPass());
215   else
216     addPass(createEarlyCSEPass());
217 }
218 
// Schedule the passes that propagate address-space information. Order
// matters: SROA first to clean up byval allocas, then alloca lowering,
// then the inference itself.
void NVPTXPassConfig::addAddressSpaceInferencePasses() {
  // NVPTXLowerArgs emits alloca for byval parameters which can often
  // be eliminated by SROA.
  addPass(createSROAPass());
  addPass(createNVPTXLowerAllocaPass());
  addPass(createInferAddressSpacesPass());
}
226 
// Schedule straight-line scalar optimizations (SeparateConstOffsetFromGEP,
// SLSR, NaryReassociate) interleaved with CSE/GVN cleanup. The pass order
// here is deliberate — see the inline comments.
void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunites for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
  // for some of our benchmarks.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}
243 
// Build the IR-level portion of the NVPTX codegen pipeline: disable machine
// passes that cannot cope with virtual registers surviving "register
// allocation", run NVPTX lowering/normalization passes, then (at -O1+) the
// address-space inference and straight-line scalar optimizations, and
// finally the generic IR passes (LSR etc.) plus cleanup.
void NVPTXPassConfig::addIRPasses() {
  // The following passes are known to not play well with virtual regs hanging
  // around after register allocation (which in our case, is *all* registers).
  // We explicitly disable them here.  We do, however, need some functionality
  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
  disablePass(&PrologEpilogCodeInserterID);
  disablePass(&MachineCopyPropagationID);
  disablePass(&TailDuplicateID);
  disablePass(&StackMapLivenessID);
  disablePass(&LiveDebugValuesID);
  disablePass(&PostRAMachineSinkingID);
  disablePass(&PostRASchedulerID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);
  disablePass(&ShrinkWrapID);

  // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
  // it here does nothing.  But since we need it for correctness when lowering
  // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
  // call addEarlyAsPossiblePasses.
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
  addPass(createNVVMReflectPass(ST.getSmVersion()));

  if (getOptLevel() != CodeGenOpt::None)
    addPass(createNVPTXImageOptimizerPass());
  addPass(createNVPTXAssignValidGlobalNamesPass());
  addPass(createGenericToNVVMPass());

  // NVPTXLowerArgs is required for correctness and should be run right
  // before the address space inference passes.
  addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    // The load/store vectorizer can be disabled via
    // -disable-nvptx-load-store-vectorizer while it matures.
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addStraightLineScalarOptimizationPasses();
  }

  // === LSR and other generic IR passes ===
  TargetPassConfig::addIRPasses();
  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}
300 
// Set up instruction selection: lower constructs that ISel cannot handle,
// then run the NVPTX SelectionDAG instruction selector. Returns false to
// indicate selection was successfully configured.
bool NVPTXPassConfig::addInstSelector() {
  const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();

  // Lower aggregate copies and hoist allocas before running ISel.
  addPass(createLowerAggrCopies());
  addPass(createAllocaHoisting());
  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));

  // NOTE(review): on subtargets without image-handle support, this pass
  // presumably rewrites image handle uses into a supported form — confirm
  // against NVPTXReplaceImageHandles.cpp.
  if (!ST.hasImageHandles())
    addPass(createNVPTXReplaceImageHandlesPass());

  return false;
}
313 
void NVPTXPassConfig::addPreRegAlloc() {
  // Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
  addPass(createNVPTXProxyRegErasurePass());
}
318 
319 void NVPTXPassConfig::addPostRegAlloc() {
320   addPass(createNVPTXPrologEpilogPass(), false);
321   if (getOptLevel() != CodeGenOpt::None) {
322     // NVPTXPrologEpilogPass calculates frame object offset and replace frame
323     // index with VRFrame register. NVPTXPeephole need to be run after that and
324     // will replace VRFrame with VRFrameLocal when possible.
325     addPass(createNVPTXPeephole());
326   }
327 }
328 
// PTX is a virtual ISA: returning nullptr means no register allocator runs
// and values stay in virtual registers through emission.
FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
  return nullptr; // No reg alloc
}
332 
// Minimal -O0 "register allocation" pipeline: only the SSA-deconstruction
// passes (PHI elimination, two-address rewriting) — no actual allocator.
void NVPTXPassConfig::addFastRegAlloc() {
  addPass(&PHIEliminationID);
  addPass(&TwoAddressInstructionPassID);
}
337 
// Optimized "register allocation" pipeline: runs the analyses and
// SSA-deconstruction/scheduling passes an allocator would normally need,
// but (per createTargetRegisterAllocator above) no allocator itself.
void NVPTXPassConfig::addOptimizedRegAlloc() {
  addPass(&ProcessImplicitDefsID);
  addPass(&LiveVariablesID);
  addPass(&MachineLoopInfoID);
  addPass(&PHIEliminationID);

  addPass(&TwoAddressInstructionPassID);
  addPass(&RegisterCoalescerID);

  // PreRA instruction scheduling.
  if (addPass(&MachineSchedulerID))
    printAndVerify("After Machine Scheduling");


  addPass(&StackSlotColoringID);

  // FIXME: Needs physical registers
  //addPass(&MachineLICMID);

  printAndVerify("After StackSlotColoring");
}
359 
// Machine-SSA optimization pipeline. NOTE(review): this appears to mirror
// the generic TargetPassConfig::addMachineSSAOptimization sequence — confirm
// against TargetPassConfig.cpp before changing pass order.
void NVPTXPassConfig::addMachineSSAOptimization() {
  // Pre-ra tail duplication.
  if (addPass(&EarlyTailDuplicateID))
    printAndVerify("After Pre-RegAlloc TailDuplicate");

  // Optimize PHIs before DCE: removing dead PHI cycles may make more
  // instructions dead.
  addPass(&OptimizePHIsID);

  // This pass merges large allocas. StackSlotColoring is a different pass
  // which merges spill slots.
  addPass(&StackColoringID);

  // If the target requests it, assign local variables to stack slots relative
  // to one another and simplify frame index references where possible.
  addPass(&LocalStackSlotAllocationID);

  // With optimization, dead code should already be eliminated. However
  // there is one known exception: lowered code for arguments that are only
  // used by tail calls, where the tail calls reuse the incoming stack
  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
  addPass(&DeadMachineInstructionElimID);
  printAndVerify("After codegen DCE pass");

  // Allow targets to insert passes that improve instruction level parallelism,
  // like if-conversion. Such passes will typically need dominator trees and
  // loop info, just like LICM and CSE below.
  if (addILPOpts())
    printAndVerify("After ILP optimizations");

  addPass(&EarlyMachineLICMID);
  addPass(&MachineCSEID);

  addPass(&MachineSinkingID);
  printAndVerify("After Machine LICM, CSE and Sinking passes");

  addPass(&PeepholeOptimizerID);
  printAndVerify("After codegen peephole optimization pass");
}
399