//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
// there is at most one ret and one unreachable instruction, it ensures there
// is at most one divergent exiting block.
//
// StructurizeCFG can't deal with multi-exit regions formed by branches to
// multiple return nodes. It is not desirable to structurize regions with
// uniform branches, and unifying uniformly reached exits into the same return
// block as divergently reached ones would inhibit the use of scalar branching,
// so only divergently reached exits are unified. StructurizeCFG also can't
// deal with the case where one branch goes to a return and another to an
// unreachable; in that case the unreachable is replaced with a return.
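//
// For example (a sketch, not taken from a real test case): two ret blocks
// that are only reached through divergent branches are rewritten to branch
// to a single new block that holds the only ret:
//
//     entry                      entry
//    /     \                    /     \
//   A       B        =>        A       B
//   |       |                    \     /
//  ret     ret              UnifiedReturnBlock
//                                  ret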
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "SIDefines.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"

namespace {

class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
    initializeAMDGPUUnifyDivergentExitNodesPass(
        *PassRegistry::getPassRegistry());
  }

  // We can preserve non-critical-edgeness when we unify function exit nodes.
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnFunction(Function &F) override;
};

} // end anonymous namespace

char AMDGPUUnifyDivergentExitNodes::ID = 0;

char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;

INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                      "Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                    "Unify divergent function exit nodes", false, false)

void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
  if (RequireAndPreserveDomTree)
    AU.addRequired<DominatorTreeWrapperPass>();

  AU.addRequired<PostDominatorTreeWrapperPass>();

  AU.addRequired<LegacyDivergenceAnalysis>();

  if (RequireAndPreserveDomTree) {
    AU.addPreserved<DominatorTreeWrapperPass>();
    // FIXME: preserve PostDominatorTreeWrapperPass
  }

  // No divergent values are changed, only blocks and branch edges.
  AU.addPreserved<LegacyDivergenceAnalysis>();

  // We preserve the non-critical-edgeness property
  AU.addPreservedID(BreakCriticalEdgesID);

  // This is a cluster of orthogonal Transforms
  AU.addPreservedID(LowerSwitchID);
  FunctionPass::getAnalysisUsage(AU);

  AU.addRequired<TargetTransformInfoWrapperPass>();
}

/// \returns true if \p BB is reachable through only uniform branches.
/// XXX - Is there a more efficient way to find this?
static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
                               BasicBlock &BB) {
  SmallVector<BasicBlock *, 8> Stack(predecessors(&BB));
  SmallPtrSet<BasicBlock *, 8> Visited;

  while (!Stack.empty()) {
    BasicBlock *Top = Stack.pop_back_val();
    if (!DA.isUniform(Top->getTerminator()))
      return false;

    for (BasicBlock *Pred : predecessors(Top)) {
      if (Visited.insert(Pred).second)
        Stack.push_back(Pred);
    }
  }

  return true;
}

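/// Clear the "done" bit on every llvm.amdgcn.exp / llvm.amdgcn.exp.compr call
/// in \p F so that the null export later added to the unified return block is
/// the only export with the "done" bit set.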
static void removeDoneExport(Function &F) {
  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (IntrinsicInst *Intrin = dyn_cast<IntrinsicInst>(&I)) {
        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
          Intrin->setArgOperand(6, BoolFalse); // done
        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
          Intrin->setArgOperand(4, BoolFalse); // done
        }
      }
    }
  }
}

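/// Create a single new return block named \p Name, rewrite every block in
/// \p ReturningBlocks to branch to it, and return the new block. If
/// \p InsertExport is set, a null "done" export is emitted in the new block
/// and the "done" bit is cleared on all existing exports (see removeDoneExport
/// above).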
static BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
                                       ArrayRef<BasicBlock *> ReturningBlocks,
                                       bool InsertExport,
                                       const TargetTransformInfo &TTI,
                                       StringRef Name) {
  // Insert a new basic block into the function, add a PHI node (if the
  // function returns a value), and convert all of the return instructions
  // into unconditional branches to it.
  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
  IRBuilder<> B(NewRetBlock);

  if (InsertExport) {
    // Ensure that there's only one "done" export in the shader by removing the
    // "done" bit set on the original final export. More than one "done" export
    // can lead to undefined behavior.
    removeDoneExport(F);

    Value *Undef = UndefValue::get(B.getFloatTy());
    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
                      {
                        B.getInt32(AMDGPU::Exp::ET_NULL),
                        B.getInt32(0), // enabled channels
                        Undef, Undef, Undef, Undef, // values
                        B.getTrue(), // done
                        B.getTrue(), // valid mask
                      });
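    // The call above corresponds to IR roughly of this shape (a sketch; the
    // exact target immediate comes from AMDGPU::Exp::ET_NULL in SIDefines.h):
    //   call void @llvm.amdgcn.exp.f32(i32 <null target>, i32 0,
    //                                  float undef, float undef,
    //                                  float undef, float undef,
    //                                  i1 true /*done*/, i1 true /*vm*/)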
  }

  PHINode *PN = nullptr;
  if (F.getReturnType()->isVoidTy()) {
    B.CreateRetVoid();
  } else {
    // If the function doesn't return void, add a PHI node to merge the
    // incoming return values.
    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                     "UnifiedRetVal");
    assert(!InsertExport);
    B.CreateRet(PN);
  }

  // Loop over all of the blocks, replacing the return instruction with an
  // unconditional branch.
  std::vector<DominatorTree::UpdateType> Updates;
  Updates.reserve(ReturningBlocks.size());
  for (BasicBlock *BB : ReturningBlocks) {
    // Add an incoming element to the PHI node for every return instruction
    // that is merging into this new block.
    if (PN)
      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);

    // Remove and delete the return inst.
    BB->getTerminator()->eraseFromParent();
    BranchInst::Create(NewRetBlock, BB);
    Updates.push_back({DominatorTree::Insert, BB, NewRetBlock});
  }

  if (RequireAndPreserveDomTree)
    DTU.applyUpdates(Updates);
  Updates.clear();

  for (BasicBlock *BB : ReturningBlocks) {
    // Clean up a possible branch to an unconditional branch to the return.
    simplifyCFG(BB, TTI, RequireAndPreserveDomTree ? &DTU : nullptr,
                SimplifyCFGOptions().bonusInstThreshold(2));
  }

  return NewRetBlock;
}

bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  DominatorTree *DT = nullptr;
  if (RequireAndPreserveDomTree)
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();

  // If there's only one exit, we don't need to do anything, unless this is a
  // pixel shader and that exit is an infinite loop, since we still have to
  // insert an export in that case.
  if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();

  // Loop over the exit blocks of the function (the roots of the post-dominator
  // tree), classifying them as divergently reached returns, uniformly reached
  // returns, or unreachable blocks.
  SmallVector<BasicBlock *, 4> ReturningBlocks;
  SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
  SmallVector<BasicBlock *, 4> UnreachableBlocks;

  // Dummy return block for infinite loop.
  BasicBlock *DummyReturnBB = nullptr;

  bool InsertExport = false;

  bool Changed = false;
  std::vector<DominatorTree::UpdateType> Updates;

  for (BasicBlock *BB : PDT.roots()) {
    if (isa<ReturnInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        ReturningBlocks.push_back(BB);
      else
        UniformlyReachedRetBlocks.push_back(BB);
    } else if (isa<UnreachableInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        UnreachableBlocks.push_back(BB);
    } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {

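      // A branch terminator at a post-dominator tree root means this block
      // sits on an infinite loop with no path to a return or unreachable.
      // Fabricate a return block and give the loop an edge to it. The new
      // branch condition is constant true, so the dummy edge is never taken
      // at run time, but it gives the CFG an exit that can be unified.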
      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
      if (DummyReturnBB == nullptr) {
        DummyReturnBB = BasicBlock::Create(F.getContext(),
                                           "DummyReturnBlock", &F);
        Type *RetTy = F.getReturnType();
        Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);

        // For pixel shaders, the producer guarantees that an export is
        // executed before each return instruction. However, if there is an
        // infinite loop and we insert a return ourselves, we need to uphold
        // that guarantee by inserting a null export. This can happen e.g. in
        // an infinite loop with kill instructions, which is supposed to
        // terminate. However, we don't need to do this if there is a non-void
        // return value, since then there is an epilog afterwards which will
        // still export.
        //
        // Note: In the case where only some threads enter the infinite loop,
        // this can result in the null export happening redundantly after the
        // original exports. However, the last "real" export happens after all
        // the threads that didn't enter an infinite loop converged, which
        // means that the only extra threads to execute the null export are
        // threads that entered the infinite loop, and they could only have
        // exited through being killed, which sets their exec bit to 0.
        // Therefore, unless there's an actual infinite loop, which can have
        // invalid results, or there's a kill after the last export, which we
        // assume the frontend won't do, this export will have the same exec
        // mask as the last "real" export, and therefore the valid mask will be
        // overwritten with the same value and will still be correct. Also,
        // even though this forces an extra unnecessary export wait, we assume
        // that this happens rarely enough in practice that we don't have to
        // worry about performance.
        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
            RetTy->isVoidTy()) {
          InsertExport = true;
        }

        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
        ReturningBlocks.push_back(DummyReturnBB);
      }

      if (BI->isUnconditional()) {
        BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
        BI->eraseFromParent(); // Delete the unconditional branch.
        // Add a new conditional branch with a dummy edge to the return block.
        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
        Updates.push_back({DominatorTree::Insert, BB, DummyReturnBB});
      } else { // Conditional branch.
        SmallVector<BasicBlock *, 2> Successors(successors(BB));

        // Create a new transition block to hold the conditional branch.
        BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");

        Updates.reserve(Updates.size() + 2 * Successors.size() + 2);

        // 'Successors' become successors of TransitionBB instead of BB,
        // and TransitionBB becomes a single successor of BB.
        Updates.push_back({DominatorTree::Insert, BB, TransitionBB});
        for (BasicBlock *Successor : Successors) {
          Updates.push_back({DominatorTree::Insert, TransitionBB, Successor});
          Updates.push_back({DominatorTree::Delete, BB, Successor});
        }

        // Create a branch that always branches to the transition block but
        // also references DummyReturnBB as its (never-taken) false successor.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
        Updates.push_back({DominatorTree::Insert, BB, DummyReturnBB});
      }
      Changed = true;
    }
  }

  if (!UnreachableBlocks.empty()) {
    BasicBlock *UnreachableBlock = nullptr;

    if (UnreachableBlocks.size() == 1) {
      UnreachableBlock = UnreachableBlocks.front();
    } else {
      UnreachableBlock = BasicBlock::Create(F.getContext(),
                                            "UnifiedUnreachableBlock", &F);
      new UnreachableInst(F.getContext(), UnreachableBlock);

      Updates.reserve(Updates.size() + UnreachableBlocks.size());
      for (BasicBlock *BB : UnreachableBlocks) {
        // Remove and delete the unreachable inst.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(UnreachableBlock, BB);
        Updates.push_back({DominatorTree::Insert, BB, UnreachableBlock});
      }
      Changed = true;
    }

    if (!ReturningBlocks.empty()) {
      // Don't create a new unreachable inst if we have a return. The
      // structurizer/annotator can't handle multiple exits.

      Type *RetTy = F.getReturnType();
      Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
      // Remove and delete the unreachable inst.
      UnreachableBlock->getTerminator()->eraseFromParent();

      Function *UnreachableIntrin =
        Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);

      // Insert a call to an intrinsic tracking that this is an unreachable
      // point, in case we want to kill the active lanes or something later.
      CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);

      // Don't create a scalar trap. We would only want to trap if this code
      // was really reached, but a scalar trap would happen even if no lanes
      // actually reached here.
      ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
      ReturningBlocks.push_back(UnreachableBlock);
      Changed = true;
    }
  }

  // FIXME: add PDT here once simplifycfg is ready.
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  if (RequireAndPreserveDomTree)
    DTU.applyUpdates(Updates);
  Updates.clear();

  // Now handle return blocks.
  if (ReturningBlocks.empty())
    return Changed; // No blocks return

  if (ReturningBlocks.size() == 1 && !InsertExport)
    return Changed; // Already has a single return block

  const TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  // Unify returning blocks. If we are going to insert the export it is also
  // necessary to include blocks that are uniformly reached, because in
  // addition to inserting the export the "done" bits on existing exports will
  // be cleared and we do not want to end up with the normal export in a
  // non-unified, uniformly reached block with the "done" bit cleared.
  auto BlocksToUnify = std::move(ReturningBlocks);
  if (InsertExport)
    append_range(BlocksToUnify, UniformlyReachedRetBlocks);

  unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport, TTI,
                      "UnifiedReturnBlock");
  return true;
}