1 //===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief The AMDGPU target machine contains all of the hardware specific
12 /// information  needed to emit code for R600 and SI GPUs.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPU.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUTargetObjectFile.h"
20 #include "AMDGPUTargetTransformInfo.h"
21 #include "R600ISelLowering.h"
22 #include "R600InstrInfo.h"
23 #include "R600MachineScheduler.h"
24 #include "SIISelLowering.h"
25 #include "SIInstrInfo.h"
26 
27 #include "llvm/Analysis/Passes.h"
28 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
29 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
30 #include "llvm/CodeGen/MachineModuleInfo.h"
31 #include "llvm/CodeGen/Passes.h"
32 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
33 #include "llvm/CodeGen/TargetPassConfig.h"
34 #include "llvm/IR/Verifier.h"
35 #include "llvm/MC/MCAsmInfo.h"
36 #include "llvm/IR/LegacyPassManager.h"
37 #include "llvm/Support/TargetRegistry.h"
38 #include "llvm/Support/raw_os_ostream.h"
39 #include "llvm/Transforms/IPO.h"
40 #include "llvm/Transforms/Scalar.h"
41 #include "llvm/Transforms/Scalar/GVN.h"
42 
43 using namespace llvm;
44 
// Command-line flags gating optional pieces of the AMDGPU pass pipeline.

// Whether the R600 pipeline runs the StructurizeCFG IR pass (default: on).
static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

// Whether SROA runs after the promote-alloca pass (default: on). Hidden;
// intended for debugging and triage rather than end users.
static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

// Whether the R600 pipeline runs the if-conversion pass (default: on).
static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));
61 
62 extern "C" void LLVMInitializeAMDGPUTarget() {
63   // Register the target
64   RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
65   RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget);
66 
67   PassRegistry *PR = PassRegistry::getPassRegistry();
68   initializeSILowerI1CopiesPass(*PR);
69   initializeSIFixSGPRCopiesPass(*PR);
70   initializeSIFoldOperandsPass(*PR);
71   initializeSIShrinkInstructionsPass(*PR);
72   initializeSIFixControlFlowLiveIntervalsPass(*PR);
73   initializeSILoadStoreOptimizerPass(*PR);
74   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
75   initializeAMDGPUAnnotateUniformValuesPass(*PR);
76   initializeAMDGPUPromoteAllocaPass(*PR);
77   initializeAMDGPUCodeGenPreparePass(*PR);
78   initializeSIAnnotateControlFlowPass(*PR);
79   initializeSIDebuggerInsertNopsPass(*PR);
80   initializeSIInsertWaitsPass(*PR);
81   initializeSIWholeQuadModePass(*PR);
82   initializeSILowerControlFlowPass(*PR);
83   initializeSIDebuggerInsertNopsPass(*PR);
84 }
85 
// Create the TargetLoweringObjectFile shared by all AMDGPU subtargets. The
// triple parameter is currently unused: one object-file lowering serves all.
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return make_unique<AMDGPUTargetObjectFile>();
}
89 
// Factory for R600's machine scheduler: a standard live-interval machine
// scheduler driven by R600's custom scheduling strategy.
static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
}
93 
// Register the custom schedulers so they can be requested by name on the
// command line (e.g. -misched=r600 / -misched=si).
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                   createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);
101 
102 static StringRef computeDataLayout(const Triple &TT) {
103   if (TT.getArch() == Triple::r600) {
104     // 32-bit pointers.
105     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
106             "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
107   }
108 
109   // 32-bit private, local, and region pointers. 64-bit global, constant and
110   // flat.
111   return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
112          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
113          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
114 }
115 
116 LLVM_READNONE
117 static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
118   if (!GPU.empty())
119     return GPU;
120 
121   // HSA only supports CI+, so change the default GPU to a CI for HSA.
122   if (TT.getArch() == Triple::amdgcn)
123     return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";
124 
125   return "r600";
126 }
127 
128 static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
129   if (!RM.hasValue())
130     return Reloc::PIC_;
131   return *RM;
132 }
133 
/// Common constructor for both the R600 and GCN target machines. Computes
/// the data layout, default CPU and effective relocation model before
/// delegating to LLVMTargetMachine.
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         CodeModel::Model CM,
                                         CodeGenOpt::Level OptLevel)
  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                      FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
    TLOF(createTLOF(getTargetTriple())),
    IntrinsicInfo() {
  // AMDGPU code generation relies on structured control flow.
  setRequiresStructuredCFG(true);
  initAsmInfo();
}
147 
148 AMDGPUTargetMachine::~AMDGPUTargetMachine() { }
149 
150 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
151   Attribute GPUAttr = F.getFnAttribute("target-cpu");
152   return GPUAttr.hasAttribute(Attribute::None) ?
153     getTargetCPU() : GPUAttr.getValueAsString();
154 }
155 
156 StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
157   Attribute FSAttr = F.getFnAttribute("target-features");
158 
159   return FSAttr.hasAttribute(Attribute::None) ?
160     getTargetFeatureString() :
161     FSAttr.getValueAsString();
162 }
163 
164 //===----------------------------------------------------------------------===//
165 // R600 Target Machine (R600 -> Cayman)
166 //===----------------------------------------------------------------------===//
167 
/// Construct the R600-family target machine; all work is done by the common
/// AMDGPUTargetMachine constructor.
R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
174 
175 const R600Subtarget *R600TargetMachine::getSubtargetImpl(
176   const Function &F) const {
177   StringRef GPU = getGPUName(F);
178   StringRef FS = getFeatureString(F);
179 
180   SmallString<128> SubtargetKey(GPU);
181   SubtargetKey.append(FS);
182 
183   auto &I = SubtargetMap[SubtargetKey];
184   if (!I) {
185     // This needs to be done before we create a new subtarget since any
186     // creation will depend on the TM and the code generation flags on the
187     // function that reside in TargetOptions.
188     resetTargetOptions(F);
189     I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
190   }
191 
192   return I.get();
193 }
194 
195 //===----------------------------------------------------------------------===//
196 // GCN Target Machine (SI+)
197 //===----------------------------------------------------------------------===//
198 
#ifdef LLVM_BUILD_GLOBAL_ISEL
namespace {
/// GISelAccessor implementation for SI that owns the GlobalISel
/// call-lowering object and hands out a non-owning pointer to it.
struct SIGISelActualAccessor : public GISelAccessor {
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  const AMDGPUCallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }
};
} // End anonymous namespace.
#endif
209 
/// Construct the GCN (SI+) target machine; all work is done by the common
/// AMDGPUTargetMachine constructor.
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
216 
/// Return the (cached) SI subtarget for \p F, keyed on the function's GPU
/// name concatenated with its feature string. On first creation, also
/// attach the GlobalISel accessor (a stub when GlobalISel is not built).
const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);

#ifndef LLVM_BUILD_GLOBAL_ISEL
    // Without GlobalISel, install an empty accessor stub.
    GISelAccessor *GISel = new GISelAccessor();
#else
    // With GlobalISel, install an accessor that owns the call-lowering
    // implementation built from this subtarget's TargetLowering.
    SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
    GISel->CallLoweringInfo.reset(
      new AMDGPUCallLowering(*I->getTargetLowering()));
#endif

    // NOTE(review): GISel is heap-allocated here and passed by reference;
    // presumably the subtarget takes ownership in setGISelAccessor — verify.
    I->setGISelAccessor(*GISel);
  }

  return I.get();
}
245 
246 //===----------------------------------------------------------------------===//
247 // AMDGPU Pass Setup
248 //===----------------------------------------------------------------------===//
249 
250 namespace {
251 
/// Pass configuration shared by the R600 and GCN backends.
class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {

    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  /// Convenience accessor for the owning target machine as its AMDGPU type.
  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  // Shared IR-level pipeline pieces used by both subclasses.
  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};
274 
/// Pass configuration for the R600 family (R600 -> Cayman).
class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  /// R600 always uses its custom machine scheduler.
  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};
290 
/// Pass configuration for the GCN family (SI and newer).
class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) { }

  /// Convenience accessor for the owning target machine as its GCN type.
  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
  // GlobalISel hooks, only compiled in when GlobalISel is built.
  bool addIRTranslator() override;
  bool addRegBankSelect() override;
#endif
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};
316 
317 } // End of anonymous namespace
318 
/// Return a TargetIRAnalysis that produces the AMDGPU TargetTransformInfo
/// implementation for each queried function.
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(AMDGPUTTIImpl(this, F));
  });
}
324 
325 void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
326   if (getOptLevel() == CodeGenOpt::Aggressive)
327     addPass(createGVNPass());
328   else
329     addPass(createEarlyCSEPass());
330 }
331 
/// Add the straight-line scalar optimization pipeline (GEP splitting,
/// speculative execution, SLSR, NaryReassociate) with CSE/GVN cleanups
/// interleaved. The pass ordering below is deliberate.
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}
347 
/// Build the IR-level pass pipeline common to R600 and GCN: force full
/// inlining, lower OpenCL image types, promote allocas, then run the
/// straight-line scalar optimizations and the generic IR passes.
void AMDGPUPassConfig::addIRPasses() {
  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  addPass(createAMDGPUOpenCLImageTypeLoweringPass());

  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca(&TM));

    // SROA is optional so its effect on the pipeline can be bisected.
    if (EnableSROA)
      addPass(createSROAPass());
  }

  addStraightLineScalarOptimizationPasses();

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}
394 
/// Common pre-instruction-selection step: flatten the CFG. Subclasses add
/// their own structurization passes on top of this.
bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}
399 
/// Install the AMDGPU SelectionDAG instruction selector.
bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
  return false;
}
404 
/// Override to skip garbage-collection lowering entirely.
bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}
409 
410 //===----------------------------------------------------------------------===//
411 // R600 Pass Setup
412 //===----------------------------------------------------------------------===//
413 
/// R600 pre-ISel: run the common steps, optionally structurize the CFG
/// (controlled by -r600-ir-structurize), then replace texture intrinsics.
bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  addPass(createR600TextureIntrinsicsReplacer());
  return false;
}
422 
/// Before register allocation, merge vector registers on R600.
void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger(*TM));
}
426 
/// Before the second scheduling pass: emit clause markers, optionally run
/// if-conversion (-r600-if-convert), and merge clauses. The `false` second
/// argument suppresses the machine verifier after each pass.
void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(*TM), false);
}
433 
/// Final R600 lowering before emission: CFG structurization, special
/// instruction expansion, bundle finalization, packetizing, and control
/// flow finalization — in this exact order.
void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(*TM), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(*TM), false);
  addPass(createR600ControlFlowFinalizer(*TM), false);
}
441 
/// Create the R600-specific pass configuration. Caller takes ownership.
TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(this, PM);
}
445 
446 //===----------------------------------------------------------------------===//
447 // GCN Pass Setup
448 //===----------------------------------------------------------------------===//
449 
450 ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
451   MachineSchedContext *C) const {
452   const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
453   if (ST.enableSIScheduler())
454     return createSIMachineScheduler(C);
455   return nullptr;
456 }
457 
/// GCN pre-ISel: common steps plus kernel-feature annotation, CFG
/// structurization (skipping uniform regions), sinking, SI type rewriting,
/// and uniformity/control-flow annotation — in this exact order.
bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(&AMDGPUAnnotateKernelFeaturesID);
  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  addPass(createSinkingPass());
  addPass(createSITypeRewriter());
  addPass(createAMDGPUAnnotateUniformValues());
  addPass(createSIAnnotateControlFlowPass());

  return false;
}
472 
/// Extend the generic machine-SSA optimizations with SI operand folding
/// followed by dead-instruction cleanup.
void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
}
486 
/// GCN instruction selection: the common SelectionDAG selector, then lower
/// i1 copies and fix SGPR copies produced by selection.
bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}
493 
#ifdef LLVM_BUILD_GLOBAL_ISEL
/// GlobalISel: install the generic IR translator.
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

/// GlobalISel: register-bank selection is not implemented yet for GCN;
/// no pass is added.
bool GCNPassConfig::addRegBankSelect() {
  return false;
}
#endif
504 
505 void GCNPassConfig::addPreRegAlloc() {
506   // This needs to be run directly before register allocation because
507   // earlier passes might recompute live intervals.
508   // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
509   if (getOptLevel() > CodeGenOpt::None) {
510     insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
511   }
512 
513   if (getOptLevel() > CodeGenOpt::None) {
514     // Don't do this with no optimizations since it throws away debug info by
515     // merging nonadjacent loads.
516 
517     // This should be run after scheduling, but before register allocation. It
518     // also need extra copies to the address operand to be eliminated.
519 
520     // FIXME: Move pre-RA and remove extra reg coalescer run.
521     insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
522     insertPass(&MachineSchedulerID, &RegisterCoalescerID);
523   }
524 
525   addPass(createSIShrinkInstructionsPass());
526   addPass(createSIWholeQuadModePass());
527 }
528 
/// No GCN-specific additions for the fast register allocator; delegate to
/// the base implementation.
void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
532 
/// No GCN-specific additions for the optimized register allocator; delegate
/// to the base implementation.
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
536 
/// Intentionally empty: GCN adds no passes before the second scheduler.
void GCNPassConfig::addPreSched2() {
}
539 
/// Final GCN lowering before emission: hazard handling, wait-state
/// insertion, instruction shrinking, control-flow lowering, and debugger
/// nop insertion — in this exact order.
void GCNPassConfig::addPreEmitPass() {
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able handle all hazards correctly. This is because if there
  // are multiple scheduling regions in a basic block, the regions are scheduled
  // bottom up, so when we begin to schedule a region we don't know what
  // instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  addPass(createSIInsertWaitsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSILowerControlFlowPass());
  addPass(createSIDebuggerInsertNopsPass());
}
556 
/// Create the GCN-specific pass configuration. Caller takes ownership.
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(this, PM);
}
560