1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
/// \brief The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUTargetObjectFile.h"
18 #include "AMDGPU.h"
19 #include "AMDGPUTargetTransformInfo.h"
20 #include "R600ISelLowering.h"
21 #include "R600InstrInfo.h"
22 #include "R600MachineScheduler.h"
23 #include "SIISelLowering.h"
24 #include "SIInstrInfo.h"
25 #include "llvm/Analysis/Passes.h"
26 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
27 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
28 #include "llvm/CodeGen/MachineModuleInfo.h"
29 #include "llvm/CodeGen/Passes.h"
30 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
31 #include "llvm/CodeGen/TargetPassConfig.h"
32 #include "llvm/IR/Verifier.h"
33 #include "llvm/MC/MCAsmInfo.h"
34 #include "llvm/IR/LegacyPassManager.h"
35 #include "llvm/Support/TargetRegistry.h"
36 #include "llvm/Support/raw_os_ostream.h"
37 #include "llvm/Transforms/IPO.h"
38 #include "llvm/Transforms/Scalar.h"
39 #include <llvm/CodeGen/Passes.h>
40 
41 using namespace llvm;
42 
43 extern "C" void LLVMInitializeAMDGPUTarget() {
44   // Register the target
45   RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
46   RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);
47 
48   PassRegistry *PR = PassRegistry::getPassRegistry();
49   initializeSILowerI1CopiesPass(*PR);
50   initializeSIFixSGPRCopiesPass(*PR);
51   initializeSIFoldOperandsPass(*PR);
52   initializeSIFixControlFlowLiveIntervalsPass(*PR);
53   initializeSILoadStoreOptimizerPass(*PR);
54   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
55   initializeAMDGPUAnnotateUniformValuesPass(*PR);
56   initializeAMDGPUPromoteAllocaPass(*PR);
57   initializeSIAnnotateControlFlowPass(*PR);
58   initializeSIDebuggerInsertNopsPass(*PR);
59   initializeSIInsertWaitsPass(*PR);
60   initializeSIWholeQuadModePass(*PR);
61   initializeSILowerControlFlowPass(*PR);
62 }
63 
64 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
65   return make_unique<AMDGPUTargetObjectFile>();
66 }
67 
68 static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
69   return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
70 }
71 
// Make the custom schedulers selectable by name on the command line
// (e.g. -misched=r600 / -misched=si).
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                   createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);
79 
80 static std::string computeDataLayout(const Triple &TT) {
81   std::string Ret = "e-p:32:32";
82 
83   if (TT.getArch() == Triple::amdgcn) {
84     // 32-bit private, local, and region pointers. 64-bit global and constant.
85     Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
86   }
87 
88   Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
89          "-v512:512-v1024:1024-v2048:2048-n32:64";
90 
91   return Ret;
92 }
93 
94 LLVM_READNONE
95 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
96   if (!GPU.empty())
97     return GPU;
98 
99   // HSA only supports CI+, so change the default GPU to a CI for HSA.
100   if (TT.getArch() == Triple::amdgcn)
101     return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";
102 
103   return "";
104 }
105 
106 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
107   if (!RM.hasValue())
108     return Reloc::PIC_;
109   return *RM;
110 }
111 
// Base target machine shared by the R600 and GCN variants. Fills in a
// per-OS default GPU when none was given and defaults the relocation model
// to PIC (see the helpers above).
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         CodeModel::Model CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
      TLOF(createTLOF(getTargetTriple())),
      Subtarget(TT, getTargetCPU(), FS, *this), IntrinsicInfo() {
  // Request a structured CFG; the pass pipelines below add CFG
  // structurization passes that the lowering relies on.
  setRequiresStructuredCFG(true);
  initAsmInfo();
}
125 
126 AMDGPUTargetMachine::~AMDGPUTargetMachine() { }
127 
128 //===----------------------------------------------------------------------===//
129 // R600 Target Machine (R600 -> Cayman)
130 //===----------------------------------------------------------------------===//
131 
// R600 shares all of its state with the base AMDGPUTargetMachine; only the
// pass configuration differs (see R600TargetMachine::createPassConfig).
R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     CodeModel::Model CM, CodeGenOpt::Level OL)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
138 
139 //===----------------------------------------------------------------------===//
140 // GCN Target Machine (SI+)
141 //===----------------------------------------------------------------------===//
142 
// GCN shares all of its state with the base AMDGPUTargetMachine; only the
// pass configuration differs (see GCNTargetMachine::createPassConfig).
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   CodeModel::Model CM, CodeGenOpt::Level OL)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
149 
150 //===----------------------------------------------------------------------===//
151 // AMDGPU Pass Setup
152 //===----------------------------------------------------------------------===//
153 
154 namespace {
155 
/// Pass configuration shared by the R600 and GCN pipelines.
class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {

    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  // Convenience accessor; TargetPassConfig only stores a plain TargetMachine.
  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  // Choose a machine scheduler from the subtarget: the custom R600 scheduler
  // for pre-GCN generations (<= NORTHERN_ISLANDS), the SI scheduler when the
  // subtarget enables it, and nullptr (the default scheduler) otherwise.
  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
      return createR600MachineScheduler(C);
    else if (ST.enableSIScheduler())
      return createSIMachineScheduler(C);
    return nullptr;
  }

  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};
187 
/// Pass pipeline customizations for the R600 family (R600 -> Cayman).
class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  bool addPreISel() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};
198 
/// Pass pipeline customizations for the GCN family (SI and newer).
class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }
  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
  bool addIRTranslator() override;
  bool addRegBankSelect() override;
#endif
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};
216 
217 } // End of anonymous namespace
218 
// Return an analysis that builds an AMDGPU-specific TTI implementation per
// function, using that function's module data layout.
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(
        AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
  });
}
225 
226 void AMDGPUPassConfig::addIRPasses() {
227   // There is no reason to run these.
228   disablePass(&StackMapLivenessID);
229   disablePass(&FuncletLayoutID);
230   disablePass(&PatchableFunctionID);
231 
232   // Function calls are not supported, so make sure we inline everything.
233   addPass(createAMDGPUAlwaysInlinePass());
234   addPass(createAlwaysInlinerPass());
235   // We need to add the barrier noop pass, otherwise adding the function
236   // inlining pass will cause all of the PassConfigs passes to be run
237   // one function at a time, which means if we have a nodule with two
238   // functions, then we will generate code for the first function
239   // without ever running any passes on the second.
240   addPass(createBarrierNoopPass());
241 
242   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
243   addPass(createAMDGPUOpenCLImageTypeLoweringPass());
244 
245   TargetPassConfig::addIRPasses();
246 }
247 
248 void AMDGPUPassConfig::addCodeGenPrepare() {
249   const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
250   const AMDGPUSubtarget &ST = *TM.getSubtargetImpl();
251   if (TM.getOptLevel() > CodeGenOpt::None && ST.isPromoteAllocaEnabled()) {
252     addPass(createAMDGPUPromoteAlloca(&TM));
253     addPass(createSROAPass());
254   }
255   TargetPassConfig::addCodeGenPrepare();
256 }
257 
258 bool
259 AMDGPUPassConfig::addPreISel() {
260   addPass(createFlattenCFGPass());
261   return false;
262 }
263 
// Install the AMDGPU SelectionDAG instruction selector.
bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
  return false;
}
268 
// Override the default GC lowering hook with a no-op.
bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}
273 
274 //===----------------------------------------------------------------------===//
275 // R600 Pass Setup
276 //===----------------------------------------------------------------------===//
277 
278 bool R600PassConfig::addPreISel() {
279   AMDGPUPassConfig::addPreISel();
280   const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
281   if (ST.IsIRStructurizerEnabled())
282     addPass(createStructurizeCFGPass());
283   addPass(createR600TextureIntrinsicsReplacer());
284   return false;
285 }
286 
// Merge R600 vector registers before register allocation.
void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger(*TM));
}
290 
291 void R600PassConfig::addPreSched2() {
292   const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
293   addPass(createR600EmitClauseMarkers(), false);
294   if (ST.isIfCvtEnabled())
295     addPass(&IfConverterID, false);
296   addPass(createR600ClauseMergePass(*TM), false);
297 }
298 
// Final R600 lowering sequence: CFG structurization, special-instruction
// expansion, bundle finalization, packetizing, and control-flow
// finalization. The order of these passes is significant.
void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(*TM), false);
  addPass(createR600ControlFlowFinalizer(*TM), false);
}
306 
// Hook the R600-specific pass pipeline into the codegen driver.
TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(this, PM);
}
310 
311 //===----------------------------------------------------------------------===//
312 // GCN Pass Setup
313 //===----------------------------------------------------------------------===//
314 
bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(&AMDGPUAnnotateKernelFeaturesID);
  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  addPass(createSinkingPass());
  addPass(createSITypeRewriter());
  addPass(createAMDGPUAnnotateUniformValues());
  // NOTE(review): SIAnnotateControlFlow appears to depend on the uniformity
  // annotations added just above — confirm before reordering.
  addPass(createSIAnnotateControlFlowPass());

  return false;
}
329 
// Extend the generic machine-SSA optimization pipeline with SI operand
// folding and a dead-code cleanup afterwards.
void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
}
343 
// Run the shared selector, then lower i1 copies and fix up SGPR copies
// after selection.
bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}
350 
#ifdef LLVM_BUILD_GLOBAL_ISEL
// GlobalISel hooks; only compiled when LLVM_BUILD_GLOBAL_ISEL is enabled.
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

// No register-bank selection pass is installed yet; returning false reports
// success without adding anything.
bool GCNPassConfig::addRegBankSelect() {
  return false;
}
#endif
361 
void GCNPassConfig::addPreRegAlloc() {
  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();

  // This needs to be run directly before register allocation because
  // earlier passes might recompute live intervals.
  // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
  if (getOptLevel() > CodeGenOpt::None) {
    insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
  }

  if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
    // Don't do this with no optimizations since it throws away debug info by
    // merging nonadjacent loads.

    // This should be run after scheduling, but before register allocation. It
    // also needs extra copies to the address operand to be eliminated.
    insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
    insertPass(&MachineSchedulerID, &RegisterCoalescerID);
  }
  addPass(createSIShrinkInstructionsPass(), false);
  addPass(createSIWholeQuadModePass());
}
384 
// Both regalloc hooks currently just forward to the default implementations;
// they are kept as insertion points for GCN-specific passes around regalloc.
void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
392 
// No GCN-specific passes before the second scheduling pass.
void GCNPassConfig::addPreSched2() {
}
395 
void GCNPassConfig::addPreEmitPass() {

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee that it can handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(createSIInsertWaitsPass(), false);
  addPass(createSIShrinkInstructionsPass());
  addPass(createSILowerControlFlowPass(), false);
  addPass(createSIDebuggerInsertNopsPass(), false);
}
413 
// Hook the GCN-specific pass pipeline into the codegen driver.
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(this, PM);
}
417