//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
// there is at most one ret and one unreachable instruction, it ensures there is
// at most one divergent exiting block.
//
// StructurizeCFG can't deal with multi-exit regions formed by branches to
// multiple return nodes. It is not desirable to structurize regions with
// uniform branches, and unifying uniformly reached returns with divergently
// reached ones would inhibit the use of scalar branching. StructurizeCFG also
// can't deal with the case where one branch goes to a return and another to an
// unreachable; replace the unreachable with a return in that case.
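//
// For example (an illustrative sketch; block names are made up and a later
// simplifyCFG may fold the intermediate branches), two divergently reached
// returns
//
//   ret0:
//     ret void
//   ret1:
//     ret void
//
// are rewritten to exit through a single block:
//
//   ret0:
//     br label %UnifiedReturnBlock
//   ret1:
//     br label %UnifiedReturnBlock
//   UnifiedReturnBlock:
//     ret void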
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "SIDefines.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"

namespace {

class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
  }

  // We can preserve non-critical-edgeness when we unify function exit nodes.
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnFunction(Function &F) override;
};

} // end anonymous namespace

char AMDGPUUnifyDivergentExitNodes::ID = 0;

char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;

INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                      "Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                    "Unify divergent function exit nodes", false, false)

void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
  // TODO: Preserve dominator tree.
  AU.addRequired<PostDominatorTreeWrapperPass>();

  AU.addRequired<LegacyDivergenceAnalysis>();

  // No divergent values are changed, only blocks and branch edges.
  AU.addPreserved<LegacyDivergenceAnalysis>();

  // We preserve the non-critical-edgeness property.
  AU.addPreservedID(BreakCriticalEdgesID);

  // This is a cluster of orthogonal Transforms.
  AU.addPreservedID(LowerSwitchID);
  FunctionPass::getAnalysisUsage(AU);

  AU.addRequired<TargetTransformInfoWrapperPass>();
}

/// \returns true if \p BB is reachable through only uniform branches.
/// XXX - Is there a more efficient way to find this?
static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
                               BasicBlock &BB) {
  SmallVector<BasicBlock *, 8> Stack;
  SmallPtrSet<BasicBlock *, 8> Visited;

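  // Walk the reverse CFG from BB: seed a DFS with BB's direct predecessors and
  // give up as soon as any reachable terminator is divergent.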
  for (BasicBlock *Pred : predecessors(&BB))
    Stack.push_back(Pred);

  while (!Stack.empty()) {
    BasicBlock *Top = Stack.pop_back_val();
    if (!DA.isUniform(Top->getTerminator()))
      return false;

    for (BasicBlock *Pred : predecessors(Top)) {
      if (Visited.insert(Pred).second)
        Stack.push_back(Pred);
    }
  }

  return true;
}

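/// Clear the "done" bit on every export intrinsic in \p F. The "done" flag is
/// operand 6 of llvm.amdgcn.exp (tgt, en, src0..src3, done, vm) and operand 4
/// of llvm.amdgcn.exp.compr (tgt, en, src0, src1, done, vm).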
static void removeDoneExport(Function &F) {
  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (IntrinsicInst *Intrin = dyn_cast<IntrinsicInst>(&I)) {
        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
          Intrin->setArgOperand(6, BoolFalse); // done
        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
          Intrin->setArgOperand(4, BoolFalse); // done
        }
      }
    }
  }
}

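/// Create a unified return block for \p ReturningBlocks and rewrite each of
/// their return instructions into a branch to it. If \p InsertExport is set,
/// also emit a null "done" export in the new block (after clearing the "done"
/// bit on all existing exports).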
static BasicBlock *unifyReturnBlockSet(Function &F,
                                       ArrayRef<BasicBlock *> ReturningBlocks,
                                       bool InsertExport,
                                       const TargetTransformInfo &TTI,
                                       StringRef Name) {
  // We need to insert a new basic block into the function, add a PHI node (if
  // the function returns a value), and convert all of the return instructions
  // into unconditional branches to it.
  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
  IRBuilder<> B(NewRetBlock);

  if (InsertExport) {
    // Ensure that there's only one "done" export in the shader by removing the
    // "done" bit set on the original final export. More than one "done" export
    // can lead to undefined behavior.
    removeDoneExport(F);

    Value *Undef = UndefValue::get(B.getFloatTy());
    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
                      {
                        B.getInt32(AMDGPU::Exp::ET_NULL),
                        B.getInt32(0), // enabled channels
                        Undef, Undef, Undef, Undef, // values
                        B.getTrue(), // done
                        B.getTrue(), // valid mask
                      });
  }

  PHINode *PN = nullptr;
  if (F.getReturnType()->isVoidTy()) {
    B.CreateRetVoid();
  } else {
    // If the function doesn't return void... add a PHI node to the block...
    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                     "UnifiedRetVal");
    assert(!InsertExport);
    B.CreateRet(PN);
  }

  // Loop over all of the blocks, replacing the return instruction with an
  // unconditional branch.
  for (BasicBlock *BB : ReturningBlocks) {
    // Add an incoming element to the PHI node for every return instruction that
    // is merging into this new block...
    if (PN)
      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);

    // Remove and delete the return inst.
    BB->getTerminator()->eraseFromParent();
    BranchInst::Create(NewRetBlock, BB);
  }

  for (BasicBlock *BB : ReturningBlocks) {
    // Clean up: a returning block may now contain nothing but an unconditional
    // branch to the unified return, which simplifyCFG can fold into its
    // predecessors.
    simplifyCFG(BB, TTI, /*DTU=*/nullptr,
                SimplifyCFGOptions().bonusInstThreshold(2));
  }

  return NewRetBlock;
}

bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();

  // If there's only one exit, we don't need to do anything, unless this is a
  // pixel shader and that exit is an infinite loop, since we still have to
  // insert an export in that case.
  if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();

  // Loop over the roots of the post-dominator tree (the function's exits, plus
  // representatives of infinite loops that have no exit), tracking the blocks
  // that return.
  SmallVector<BasicBlock *, 4> ReturningBlocks;
  SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
  SmallVector<BasicBlock *, 4> UnreachableBlocks;

  // Dummy return block for infinite loop.
  BasicBlock *DummyReturnBB = nullptr;

  bool InsertExport = false;

  bool Changed = false;
  for (BasicBlock *BB : PDT.roots()) {
    if (isa<ReturnInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        ReturningBlocks.push_back(BB);
      else
        UniformlyReachedRetBlocks.push_back(BB);
    } else if (isa<UnreachableInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        UnreachableBlocks.push_back(BB);
    } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
      if (DummyReturnBB == nullptr) {
        DummyReturnBB = BasicBlock::Create(F.getContext(),
                                           "DummyReturnBlock", &F);
        Type *RetTy = F.getReturnType();
        Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);

        // For pixel shaders, the producer guarantees that an export is
        // executed before each return instruction. However, if there is an
        // infinite loop and we insert a return ourselves, we need to uphold
        // that guarantee by inserting a null export. This can happen e.g. in
        // an infinite loop with kill instructions, which is supposed to
        // terminate. However, we don't need to do this if there is a non-void
        // return value, since then there is an epilog afterwards which will
        // still export.
        //
        // Note: In the case where only some threads enter the infinite loop,
        // this can result in the null export happening redundantly after the
        // original exports. However, the last "real" export happens after all
        // the threads that didn't enter an infinite loop converged, which
        // means that the only extra threads to execute the null export are
        // threads that entered the infinite loop, and they could only have
        // exited by being killed, which sets their exec bit to 0.
        // Therefore, unless there's an actual infinite loop, which can have
        // invalid results, or there's a kill after the last export, which we
        // assume the frontend won't do, this export will have the same exec
        // mask as the last "real" export, and therefore the valid mask will be
        // overwritten with the same value and will still be correct. Also,
        // even though this forces an extra unnecessary export wait, we assume
        // that this happens rarely enough in practice that we don't have to
        // worry about performance.
        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
            RetTy->isVoidTy()) {
          InsertExport = true;
        }

        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
        ReturningBlocks.push_back(DummyReturnBB);
      }

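      // Give the block's branch a never-taken edge to DummyReturnBB so the
      // infinite loop gets an exit. E.g. (illustrative) a self-loop
      //
      //   loop:
      //     br label %loop
      //
      // becomes
      //
      //   loop:
      //     br i1 true, label %loop, label %DummyReturnBlock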
      if (BI->isUnconditional()) {
        BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
        BI->eraseFromParent(); // Delete the unconditional branch.
        // Add a new conditional branch with a dummy edge to the return block.
        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
      } else { // Conditional branch.
        // Create a new transition block to hold the conditional branch.
        BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");

        // Replace the unconditional branch that splitBasicBlock created with a
        // conditional branch that always takes the edge to TransitionBB but
        // also references DummyReturnBB.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
      }
      Changed = true;
    }
  }

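  // Unify the divergently reached unreachable blocks as well. If the function
  // also has returning blocks, turn the unified unreachable into a return
  // (preceded by a call to llvm.amdgcn.unreachable) so the structurizer and
  // annotator only ever see one kind of exit.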
  if (!UnreachableBlocks.empty()) {
    BasicBlock *UnreachableBlock = nullptr;

    if (UnreachableBlocks.size() == 1) {
      UnreachableBlock = UnreachableBlocks.front();
    } else {
      UnreachableBlock = BasicBlock::Create(F.getContext(),
                                            "UnifiedUnreachableBlock", &F);
      new UnreachableInst(F.getContext(), UnreachableBlock);

      for (BasicBlock *BB : UnreachableBlocks) {
        // Remove and delete the unreachable inst.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(UnreachableBlock, BB);
      }
      Changed = true;
    }

    if (!ReturningBlocks.empty()) {
      // Don't keep a separate unreachable exit if we also have a return. The
      // structurizer/annotator can't handle the multiple exits.

      Type *RetTy = F.getReturnType();
      Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
      // Remove and delete the unreachable inst.
      UnreachableBlock->getTerminator()->eraseFromParent();

      Function *UnreachableIntrin =
        Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);

      // Insert a call to an intrinsic tracking that this is an unreachable
      // point, in case we want to kill the active lanes or something later.
      CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);

      // Don't create a scalar trap. We would only want to trap if this code was
      // really reached, but a scalar trap would happen even if no lanes
      // actually reached here.
      ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
      ReturningBlocks.push_back(UnreachableBlock);
      Changed = true;
    }
  }

  // Now handle return blocks.
  if (ReturningBlocks.empty())
    return Changed; // No blocks return

  if (ReturningBlocks.size() == 1 && !InsertExport)
    return Changed; // Already has a single return block

  const TargetTransformInfo &TTI
    = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  // Unify returning blocks. If we are going to insert the export, we must also
  // include the uniformly reached returning blocks: in addition to inserting
  // the export, the "done" bits on the existing exports will be cleared, and we
  // do not want to end up with the normal export sitting in a non-unified,
  // uniformly reached block with its "done" bit cleared.
  auto BlocksToUnify = std::move(ReturningBlocks);
  if (InsertExport) {
    BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
                         UniformlyReachedRetBlocks.end());
  }

  unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI,
                      "UnifiedReturnBlock");
  return true;
}