1 //===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring
10 // there is at most one ret and one unreachable instruction, it ensures there is
11 // at most one divergent exiting block.
12 //
// StructurizeCFG can't deal with multi-exit regions formed by branches to
// multiple return nodes. It is not desirable to structurize regions with
// uniform branches, and unifying those with divergent branches into the same
// return block would inhibit the use of scalar branching. StructurizeCFG also
// can't deal with the case where one branch goes to a return and another to an
// unreachable; in that case the unreachable is replaced with a return.
19 //
20 //===----------------------------------------------------------------------===//
21 
22 #include "AMDGPU.h"
23 #include "SIDefines.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/SmallPtrSet.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/StringRef.h"
28 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
29 #include "llvm/Analysis/PostDominators.h"
30 #include "llvm/Analysis/TargetTransformInfo.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/Constants.h"
34 #include "llvm/IR/Function.h"
35 #include "llvm/IR/InstrTypes.h"
36 #include "llvm/IR/Instructions.h"
37 #include "llvm/IR/Intrinsics.h"
38 #include "llvm/IR/IRBuilder.h"
39 #include "llvm/IR/Type.h"
40 #include "llvm/InitializePasses.h"
41 #include "llvm/Pass.h"
42 #include "llvm/Support/Casting.h"
43 #include "llvm/Transforms/Scalar.h"
44 #include "llvm/Transforms/Utils.h"
45 #include "llvm/Transforms/Utils/Local.h"
46 
47 using namespace llvm;
48 
49 #define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"
50 
51 namespace {
52 
53 class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
54 public:
55   static char ID; // Pass identification, replacement for typeid
56 
  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
    initializeAMDGPUUnifyDivergentExitNodesPass(
        *PassRegistry::getPassRegistry());
  }
60 
61   // We can preserve non-critical-edgeness when we unify function exit nodes
62   void getAnalysisUsage(AnalysisUsage &AU) const override;
63   bool runOnFunction(Function &F) override;
64 };
65 
66 } // end anonymous namespace
67 
68 char AMDGPUUnifyDivergentExitNodes::ID = 0;
69 
70 char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
71 
72 INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
73                      "Unify divergent function exit nodes", false, false)
74 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
75 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
76 INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
77                     "Unify divergent function exit nodes", false, false)
78 
void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
80   // TODO: Preserve dominator tree.
81   AU.addRequired<PostDominatorTreeWrapperPass>();
82 
83   AU.addRequired<LegacyDivergenceAnalysis>();
84 
85   // No divergent values are changed, only blocks and branch edges.
86   AU.addPreserved<LegacyDivergenceAnalysis>();
87 
88   // We preserve the non-critical-edgeness property
89   AU.addPreservedID(BreakCriticalEdgesID);
90 
91   // This is a cluster of orthogonal Transforms
92   AU.addPreservedID(LowerSwitchID);
93   FunctionPass::getAnalysisUsage(AU);
94 
95   AU.addRequired<TargetTransformInfoWrapperPass>();
96 }
97 
98 /// \returns true if \p BB is reachable through only uniform branches.
99 /// XXX - Is there a more efficient way to find this?
100 static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
101                                BasicBlock &BB) {
102   SmallVector<BasicBlock *, 8> Stack;
103   SmallPtrSet<BasicBlock *, 8> Visited;
104 
105   for (BasicBlock *Pred : predecessors(&BB))
106     Stack.push_back(Pred);
107 
108   while (!Stack.empty()) {
109     BasicBlock *Top = Stack.pop_back_val();
110     if (!DA.isUniform(Top->getTerminator()))
111       return false;
112 
113     for (BasicBlock *Pred : predecessors(Top)) {
114       if (Visited.insert(Pred).second)
115         Stack.push_back(Pred);
116     }
117   }
118 
119   return true;
120 }
121 
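/// Clear the "done" bit on every llvm.amdgcn.exp / llvm.amdgcn.exp.compr call
/// in \p F. A single "done" export is inserted later in the unified return
/// block; more than one "done" export can lead to undefined behavior.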
122 static void removeDoneExport(Function &F) {
123   ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
124   for (BasicBlock &BB : F) {
125     for (Instruction &I : BB) {
      if (IntrinsicInst *Intrin = dyn_cast<IntrinsicInst>(&I)) {
127         if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
128           Intrin->setArgOperand(6, BoolFalse); // done
129         } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
130           Intrin->setArgOperand(4, BoolFalse); // done
131         }
132       }
133     }
134   }
135 }
136 
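/// Create a single unified return block for \p F, make every block in
/// \p ReturningBlocks branch to it, and, if \p InsertExport is set, emit the
/// final null "done" export there. Returns the newly created block.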
137 static BasicBlock *unifyReturnBlockSet(Function &F,
138                                        ArrayRef<BasicBlock *> ReturningBlocks,
139                                        bool InsertExport,
140                                        const TargetTransformInfo &TTI,
141                                        StringRef Name) {
  // Insert a new basic block into the function, add a PHI node (if the
  // function returns a value), and convert all of the return instructions
  // into unconditional branches.
145   BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
146   IRBuilder<> B(NewRetBlock);
147 
148   if (InsertExport) {
149     // Ensure that there's only one "done" export in the shader by removing the
150     // "done" bit set on the original final export. More than one "done" export
151     // can lead to undefined behavior.
152     removeDoneExport(F);
153 
154     Value *Undef = UndefValue::get(B.getFloatTy());
155     B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
156                       {
157                         B.getInt32(AMDGPU::Exp::ET_NULL),
158                         B.getInt32(0), // enabled channels
159                         Undef, Undef, Undef, Undef, // values
160                         B.getTrue(), // done
161                         B.getTrue(), // valid mask
162                       });
163   }
164 
165   PHINode *PN = nullptr;
166   if (F.getReturnType()->isVoidTy()) {
167     B.CreateRetVoid();
168   } else {
169     // If the function doesn't return void... add a PHI node to the block...
170     PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
171                      "UnifiedRetVal");
172     assert(!InsertExport);
173     B.CreateRet(PN);
174   }
175 
176   // Loop over all of the blocks, replacing the return instruction with an
177   // unconditional branch.
178   for (BasicBlock *BB : ReturningBlocks) {
179     // Add an incoming element to the PHI node for every return instruction that
180     // is merging into this new block...
181     if (PN)
182       PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
183 
184     // Remove and delete the return inst.
185     BB->getTerminator()->eraseFromParent();
186     BranchInst::Create(NewRetBlock, BB);
187   }
188 
189   for (BasicBlock *BB : ReturningBlocks) {
190     // Cleanup possible branch to unconditional branch to the return.
191     simplifyCFG(BB, TTI, SimplifyCFGOptions().bonusInstThreshold(2));
192   }
193 
194   return NewRetBlock;
195 }
196 
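// Unify the divergently reached exits of F: divergent returns are merged into
// one return block, divergent unreachables into one unreachable block, and
// infinite loops get a conditional edge to a dummy return block.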
197 bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
198   auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
199 
200   // If there's only one exit, we don't need to do anything, unless this is a
201   // pixel shader and that exit is an infinite loop, since we still have to
202   // insert an export in that case.
203   if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
204     return false;
205 
206   LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
207 
208   // Loop over all of the blocks in a function, tracking all of the blocks that
209   // return.
210   SmallVector<BasicBlock *, 4> ReturningBlocks;
211   SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
212   SmallVector<BasicBlock *, 4> UnreachableBlocks;
213 
214   // Dummy return block for infinite loop.
215   BasicBlock *DummyReturnBB = nullptr;
216 
217   bool InsertExport = false;
218 
219   bool Changed = false;
220   for (BasicBlock *BB : PDT.roots()) {
221     if (isa<ReturnInst>(BB->getTerminator())) {
222       if (!isUniformlyReached(DA, *BB))
223         ReturningBlocks.push_back(BB);
224       else
225         UniformlyReachedRetBlocks.push_back(BB);
226     } else if (isa<UnreachableInst>(BB->getTerminator())) {
227       if (!isUniformlyReached(DA, *BB))
228         UnreachableBlocks.push_back(BB);
229     } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
230 
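      // A post-dominator tree root that ends in a branch reaches neither a
      // return nor an unreachable, which means it is caught in an infinite
      // loop. Give that loop an edge to a dummy return block (guarded by a
      // constant-true branch) so it can be unified with the other exits below.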
231       ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
232       if (DummyReturnBB == nullptr) {
233         DummyReturnBB = BasicBlock::Create(F.getContext(),
234                                            "DummyReturnBlock", &F);
235         Type *RetTy = F.getReturnType();
236         Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
237 
238         // For pixel shaders, the producer guarantees that an export is
239         // executed before each return instruction. However, if there is an
240         // infinite loop and we insert a return ourselves, we need to uphold
        // that guarantee by inserting a null export. This can happen e.g. with
        // an infinite loop that contains kill instructions and is therefore
        // expected to terminate. However, we don't need to do this if there is
        // a non-void return value, since then there is an epilog afterwards
        // which will still export.
246         //
247         // Note: In the case where only some threads enter the infinite loop,
248         // this can result in the null export happening redundantly after the
        // original exports. However, the last "real" export happens after all
250         // the threads that didn't enter an infinite loop converged, which
251         // means that the only extra threads to execute the null export are
252         // threads that entered the infinite loop, and they only could've
253         // exited through being killed which sets their exec bit to 0.
254         // Therefore, unless there's an actual infinite loop, which can have
255         // invalid results, or there's a kill after the last export, which we
256         // assume the frontend won't do, this export will have the same exec
257         // mask as the last "real" export, and therefore the valid mask will be
258         // overwritten with the same value and will still be correct. Also,
259         // even though this forces an extra unnecessary export wait, we assume
        // that this happens rarely enough in practice that we don't have to
261         // worry about performance.
262         if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
263             RetTy->isVoidTy()) {
264           InsertExport = true;
265         }
266 
267         ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
268         ReturningBlocks.push_back(DummyReturnBB);
269       }
270 
271       if (BI->isUnconditional()) {
272         BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
273         BI->eraseFromParent(); // Delete the unconditional branch.
274         // Add a new conditional branch with a dummy edge to the return block.
275         BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
276       } else { // Conditional branch.
277         // Create a new transition block to hold the conditional branch.
278         BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");
279 
        // Create a branch that always branches to the transition block but
        // also references DummyReturnBB, so the loop gains a dummy exit edge.
282         BB->getTerminator()->eraseFromParent();
283         BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
284       }
285       Changed = true;
286     }
287   }
288 
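  // Unify all divergently reached unreachable blocks. If the function also
  // contains returning blocks, convert the unified unreachable into a return
  // (preceded by a call to llvm.amdgcn.unreachable), since the
  // structurizer/annotator can't handle a mix of return and unreachable exits.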
289   if (!UnreachableBlocks.empty()) {
290     BasicBlock *UnreachableBlock = nullptr;
291 
292     if (UnreachableBlocks.size() == 1) {
293       UnreachableBlock = UnreachableBlocks.front();
294     } else {
295       UnreachableBlock = BasicBlock::Create(F.getContext(),
296                                             "UnifiedUnreachableBlock", &F);
297       new UnreachableInst(F.getContext(), UnreachableBlock);
298 
299       for (BasicBlock *BB : UnreachableBlocks) {
300         // Remove and delete the unreachable inst.
301         BB->getTerminator()->eraseFromParent();
302         BranchInst::Create(UnreachableBlock, BB);
303       }
304       Changed = true;
305     }
306 
307     if (!ReturningBlocks.empty()) {
      // Don't create a new unreachable inst if we have a return. The
      // structurizer/annotator can't handle the multiple exits.
310 
311       Type *RetTy = F.getReturnType();
312       Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
313       // Remove and delete the unreachable inst.
314       UnreachableBlock->getTerminator()->eraseFromParent();
315 
316       Function *UnreachableIntrin =
317         Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);
318 
319       // Insert a call to an intrinsic tracking that this is an unreachable
320       // point, in case we want to kill the active lanes or something later.
321       CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);
322 
323       // Don't create a scalar trap. We would only want to trap if this code was
324       // really reached, but a scalar trap would happen even if no lanes
325       // actually reached here.
326       ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
327       ReturningBlocks.push_back(UnreachableBlock);
328       Changed = true;
329     }
330   }
331 
332   // Now handle return blocks.
333   if (ReturningBlocks.empty())
334     return Changed; // No blocks return
335 
336   if (ReturningBlocks.size() == 1 && !InsertExport)
337     return Changed; // Already has a single return block
338 
339   const TargetTransformInfo &TTI
340     = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
341 
342   // Unify returning blocks. If we are going to insert the export it is also
343   // necessary to include blocks that are uniformly reached, because in addition
344   // to inserting the export the "done" bits on existing exports will be cleared
345   // and we do not want to end up with the normal export in a non-unified,
346   // uniformly reached block with the "done" bit cleared.
347   auto BlocksToUnify = std::move(ReturningBlocks);
348   if (InsertExport) {
349     BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
350                          UniformlyReachedRetBlocks.end());
351   }
352 
353   unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI,
354                       "UnifiedReturnBlock");
355   return true;
356 }
357