//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"

using namespace llvm;

extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the target machines.
  RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
  RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);

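  // Register the backend passes with the global PassRegistry so they can be
  // looked up by name and their analysis dependencies resolved.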
  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIFixControlFlowLiveIntervalsPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
  initializeSIInsertWaitsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
}

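// Note: the triple is currently unused; every AMDGPU target uses the same
// TargetLoweringObjectFile implementation.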
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                   createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
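  //
  // In LLVM data layout syntax: "e" means little endian, "pN:Size:Align"
  // gives the size and ABI alignment of pointers in address space N,
  // "iN:Align"/"vN:Align" give integer and vector alignments, and "n32:64"
  // lists the natively supported integer widths.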
  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}

LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // HSA only supports CI+, so change the default GPU to a CI for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";

  return "r600";
}

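// Default to PIC when the user does not specify a relocation model.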
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  if (!RM.hasValue())
    return Reloc::PIC_;
  return *RM;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         CodeModel::Model CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
      TLOF(createTLOF(getTargetTriple())),
      Subtarget(TT, getTargetCPU(), FS, *this), IntrinsicInfo() {
  setRequiresStructuredCFG(true);
  initAsmInfo();
}

AMDGPUTargetMachine::~AMDGPUTargetMachine() { }

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     CodeModel::Model CM, CodeGenOpt::Level OL)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   CodeModel::Model CM, CodeGenOpt::Level OL)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {

    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

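  // Pre-GCN subtargets (up through Northern Islands) get the R600 scheduler;
  // GCN subtargets get the SI scheduler only when it is explicitly enabled.
  // Returning nullptr selects the default machine scheduler.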
  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
      return createR600MachineScheduler(C);
    else if (ST.enableSIScheduler())
      return createSIMachineScheduler(C);
    return nullptr;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  bool addPreISel() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }
  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
  bool addIRTranslator() override;
  bool addRegBankSelect() override;
#endif
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // End of anonymous namespace

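// Expose the AMDGPU TargetTransformInfo to the middle-end so IR passes query
// target-specific costs instead of the defaults.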
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(
        AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
  });
}

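// GVN is a stronger (and more expensive) form of CSE, so reserve it for the
// most aggressive optimization level and use the cheaper EarlyCSE otherwise.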
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  addPass(createAMDGPUOpenCLImageTypeLoweringPass());

  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
  const AMDGPUSubtarget &ST = *TM.getSubtargetImpl();
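  // Promote allocas to LDS or vector registers where possible, then let SROA
  // break up whatever remains.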
  if (TM.getOptLevel() > CodeGenOpt::None && ST.isPromoteAllocaEnabled()) {
    addPass(createAMDGPUPromoteAlloca(&TM));
    addPass(createSROAPass());
  }

  addStraightLineScalarOptimizationPasses();

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();
  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
  if (ST.isIRStructurizerEnabled())
    addPass(createStructurizeCFGPass());
  addPass(createR600TextureIntrinsicsReplacer());
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger(*TM));
}

void R600PassConfig::addPreSched2() {
  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
  addPass(createR600EmitClauseMarkers(), false);
  if (ST.isIfCvtEnabled())
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(*TM), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(*TM), false);
  addPass(createR600ControlFlowFinalizer(*TM), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(&AMDGPUAnnotateKernelFeaturesID);
  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  addPass(createSinkingPass());
  addPass(createSITypeRewriter());
  addPass(createAMDGPUAnnotateUniformValues());
  addPass(createSIAnnotateControlFlowPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

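// FIXME: Register bank selection is not implemented for AMDGPU yet; this
// stub only lets the GlobalISel pipeline be configured.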
bool GCNPassConfig::addRegBankSelect() {
  return false;
}
#endif

void GCNPassConfig::addPreRegAlloc() {
  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();

  // This needs to be run directly before register allocation because
  // earlier passes might recompute live intervals.
  // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
  if (getOptLevel() > CodeGenOpt::None) {
    insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
  }

  if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
    // Don't do this with no optimizations since it throws away debug info by
    // merging nonadjacent loads.

    // This should be run after scheduling, but before register allocation. It
    // also needs extra copies to the address operand to be eliminated.
    insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
    insertPass(&MachineSchedulerID, &RegisterCoalescerID);
  }
  addPass(createSIShrinkInstructionsPass(), false);
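  // Force instructions that compute derivatives, and everything they
  // transitively depend on, to execute in whole quad mode.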
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee that it can handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(createSIInsertWaitsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSILowerControlFlowPass(), false);
  addPass(createSIDebuggerInsertNopsPass(), false);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(this, PM);
}