//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements a general divergence analysis for loop vectorization
// and GPU programs. It determines which branches and values in a loop or GPU
// program are divergent. It can help branch optimizations such as jump
// threading and loop unswitching to make better decisions.
//
// GPU programs typically use the SIMD execution model, where multiple threads
// in the same execution group have to execute in lock-step. Therefore, if the
// code contains divergent branches (i.e., threads in a group do not agree on
// which path of the branch to take), the group of threads has to execute all
// the paths from that branch with different subsets of threads enabled until
// they re-converge.
//
// Due to this execution model, some optimizations such as jump
// threading and loop unswitching can interfere with thread re-convergence.
// Therefore, an analysis that computes which branches in a GPU program are
// divergent can help the compiler to selectively run these optimizations.
//
// This implementation is derived from the Vectorization Analysis of the
// Region Vectorizer (RV). That implementation in turn is based on the approach
// described in
//
//   Improving Performance of OpenCL on CPUs
//   Ralf Karrenberg and Sebastian Hack
//   CC '12
//
// This DivergenceAnalysis implementation is generic in the sense that it does
// not itself identify original sources of divergence. Instead, specialized
// adapter classes (LoopDivergenceAnalysis for loops and GPUDivergenceAnalysis
// for GPU programs) identify the sources of divergence, e.g., special
// variables that hold the thread ID or the loop iteration variable.
//
// The generic implementation propagates divergence to variables that are data
// or sync dependent on a source of divergence.
//
// While data dependency is a well-known concept, the notion of sync dependency
// is worth more explanation. Sync dependence characterizes the control flow
// aspect of the propagation of branch divergence. For example,
//
//   %cond = icmp slt i32 %tid, 10
//   br i1 %cond, label %then, label %else
// then:
//   br label %merge
// else:
//   br label %merge
// merge:
//   %a = phi i32 [ 0, %then ], [ 1, %else ]
//
// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
// because %tid is not on its use-def chains, %a is sync dependent on %tid
// because the branch "br i1 %cond" depends on %tid and affects which value %a
// is assigned to.
//
// The sync dependence detection (which branch induces divergence in which join
// points) is implemented in the SyncDependenceAnalysis.
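//
// Branch divergence also propagates through loops: when a loop exit depends
// on a divergent condition, threads leave the loop in different iterations,
// so a value that is uniform within the loop may be divergent when observed
// from outside of it. This is handled as temporal divergence below. A minimal
// IR sketch (the names %tid, %i, etc. are hypothetical):
//
//   loop:
//     %i = phi i32 [ 0, %entry ], [ %inc, %loop ] ; uniform inside the loop
//     %inc = add i32 %i, 1
//     %cond = icmp eq i32 %i, %tid                ; divergent exit condition
//     br i1 %cond, label %exit, label %loop
//   exit:
//     %i.lcssa = phi i32 [ %i, %loop ]            ; divergent outside the loop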
//
// The current DivergenceAnalysis implementation has the following limitations:
// 1. intra-procedural. It conservatively considers the arguments of a
//    non-kernel-entry function and the return value of a function call as
//    divergent.
// 2. memory as black box. It conservatively considers values loaded from
//    generic or local address spaces as divergent. This can be improved by
//    leveraging pointer analysis and/or by modelling non-escaping memory
//    objects in SSA as done in RV.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "divergence-analysis"

// class DivergenceAnalysis
DivergenceAnalysis::DivergenceAnalysis(
    const Function &F, const Loop *RegionLoop, const DominatorTree &DT,
    const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
    : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA),
      IsLCSSAForm(IsLCSSAForm) {}

void DivergenceAnalysis::markDivergent(const Value &DivVal) {
  assert(isa<Instruction>(DivVal) || isa<Argument>(DivVal));
  assert(!isAlwaysUniform(DivVal) &&
         "cannot mark a uniform value as divergent");
  DivergentValues.insert(&DivVal);
}

void DivergenceAnalysis::addUniformOverride(const Value &UniVal) {
  UniformOverrides.insert(&UniVal);
}

bool DivergenceAnalysis::updateTerminator(const Instruction &Term) const {
  if (Term.getNumSuccessors() <= 1)
    return false;
  if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) {
    assert(BranchTerm->isConditional());
    return isDivergent(*BranchTerm->getCondition());
  }
  if (auto *SwitchTerm = dyn_cast<SwitchInst>(&Term)) {
    return isDivergent(*SwitchTerm->getCondition());
  }
  if (isa<InvokeInst>(Term)) {
    return false; // ignore abnormal executions through landingpad
  }

  llvm_unreachable("unexpected terminator");
}

bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const {
  // TODO function calls with side effects, etc
  for (const auto &Op : I.operands()) {
    if (isDivergent(*Op))
      return true;
  }
  return false;
}

bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock,
                                             const Value &Val) const {
  const auto *Inst = dyn_cast<const Instruction>(&Val);
  if (!Inst)
    return false;
  // check whether any divergent loop carrying Val terminates before control
  // proceeds to ObservingBlock
  for (const auto *Loop = LI.getLoopFor(Inst->getParent());
       Loop != RegionLoop && !Loop->contains(&ObservingBlock);
       Loop = Loop->getParentLoop()) {
    if (DivergentLoops.find(Loop) != DivergentLoops.end())
      return true;
  }

  return false;
}
bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const {
  // joining divergent disjoint paths in Phi's parent block
  if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) {
    return true;
  }

  // An incoming value could be divergent by itself.
  // Otherwise, an incoming value could be uniform within the loop
  // that carries its definition but it may appear divergent
  // from outside the loop. This happens when divergent loop exits
  // drop definitions of that uniform value in different iterations.
  //
  //   for (int i = 0; i < n; ++i) {     // 'i' is uniform inside the loop
  //     if (i % thread_id == 0) break;  // divergent loop exit
  //   }
  //   int divI = i;                     // divI is divergent
  for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) {
    const auto *InVal = Phi.getIncomingValue(i);
    if (isDivergent(*InVal) ||
        isTemporalDivergent(*Phi.getParent(), *InVal)) {
      return true;
    }
  }
  return false;
}

bool DivergenceAnalysis::inRegion(const Instruction &I) const {
  return I.getParent() && inRegion(*I.getParent());
}

bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const {
  return RegionLoop ? RegionLoop->contains(&BB) : (BB.getParent() == &F);
}

// marks all users of loop-carried values of the loop headed by LoopHeader as
// divergent
void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
  auto *DivLoop = LI.getLoopFor(&LoopHeader);
  assert(DivLoop && "loopHeader is not actually part of a loop");

  SmallVector<BasicBlock *, 8> TaintStack;
  DivLoop->getExitBlocks(TaintStack);

  // Otherwise potential users of loop-carried values could be anywhere in the
  // dominance region of DivLoop (including its fringes for phi nodes)
  DenseSet<const BasicBlock *> Visited;
  for (auto *Block : TaintStack) {
    Visited.insert(Block);
  }
  Visited.insert(&LoopHeader);

  while (!TaintStack.empty()) {
    auto *UserBlock = TaintStack.back();
    TaintStack.pop_back();

    // don't spread divergence beyond the region
    if (!inRegion(*UserBlock))
      continue;

    assert(!DivLoop->contains(UserBlock) &&
           "irreducible control flow detected");

    // phi nodes at the fringes of the dominance region
    if (!DT.dominates(&LoopHeader, UserBlock)) {
      // all PHI nodes of UserBlock become divergent
      for (auto &Phi : UserBlock->phis()) {
        Worklist.push_back(&Phi);
      }
      continue;
    }

    // taint outside users of values carried by DivLoop
    for (auto &I : *UserBlock) {
      if (isAlwaysUniform(I))
        continue;
      if (isDivergent(I))
        continue;

      for (auto &Op : I.operands()) {
        auto *OpInst = dyn_cast<Instruction>(&Op);
        if (!OpInst)
          continue;
        if (DivLoop->contains(OpInst->getParent())) {
          markDivergent(I);
          pushUsers(I);
          break;
        }
      }
    }

    // visit all blocks in the dominance region
    for (auto *SuccBlock : successors(UserBlock)) {
      if (!Visited.insert(SuccBlock).second) {
        continue;
      }
      TaintStack.push_back(SuccBlock);
    }
  }
}
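// A hypothetical CFG sketch for taintLoopLiveOuts above (block and value
// names are made up): a value %x carried by a divergent loop can reach a phi
// node in a block that the loop header does not dominate, i.e., at the fringe
// of the dominance region:
//
//   header:                        ; divergent loop
//     %x = ...
//     br i1 %exit.cond, label %exit, label %header
//   exit:
//     br label %fringe
//   other:
//     br label %fringe
//   fringe:                        ; not dominated by %header
//     %p = phi i32 [ %x, %exit ], [ 0, %other ] ; pushed onto the worklist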
void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) {
  for (const auto &Phi : Block.phis()) {
    if (isDivergent(Phi))
      continue;
    Worklist.push_back(&Phi);
  }
}

void DivergenceAnalysis::pushUsers(const Value &V) {
  for (const auto *User : V.users()) {
    const auto *UserInst = dyn_cast<const Instruction>(User);
    if (!UserInst)
      continue;

    if (isDivergent(*UserInst))
      continue;

    // only compute divergence inside the region
    if (!inRegion(*UserInst))
      continue;
    Worklist.push_back(UserInst);
  }
}

bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
                                                 const Loop *BranchLoop) {
  LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");

  // ignore divergence outside the region
  if (!inRegion(JoinBlock)) {
    return false;
  }

  // push non-divergent phi nodes in JoinBlock to the worklist
  pushPHINodes(JoinBlock);

  // JoinBlock is a divergent loop exit
  if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
    return true;
  }

  // disjoint-paths divergent at JoinBlock
  markBlockJoinDivergent(JoinBlock);
  return false;
}

void DivergenceAnalysis::propagateBranchDivergence(const Instruction &Term) {
  LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");

  markDivergent(Term);

  const auto *BranchLoop = LI.getLoopFor(Term.getParent());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable by disjoint paths from Term within the
  // loop; this also covers loop exits that become divergent due to Term.
  for (const auto *JoinBlock : SDA.join_blocks(Term)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // BranchLoop is a divergent loop due to the divergent branch Term
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}
void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) {
  LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n");

  // don't propagate beyond the region
  if (!inRegion(*ExitingLoop.getHeader()))
    return;

  const auto *BranchLoop = ExitingLoop.getParentLoop();

  // Uses of loop-carried values could occur anywhere
  // within the dominance region of the definition. All loop-carried
  // definitions are dominated by the loop header (reducible control).
  // Thus all users have to be in the dominance region of the loop header,
  // except PHI nodes that can also live at the fringes of the dom region
  // (incoming defining value).
  if (!IsLCSSAForm)
    taintLoopLiveOuts(*ExitingLoop.getHeader());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable by disjoint paths from the exits of
  // ExitingLoop; this also covers loop exits (of BranchLoop) that in turn
  // become divergent.
  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // BranchLoop is divergent due to a divergent loop exit in ExitingLoop
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}

void DivergenceAnalysis::compute() {
  // seed the worklist with the users of the known sources of divergence
  for (auto *DivVal : DivergentValues) {
    pushUsers(*DivVal);
  }

  // propagate divergence
  while (!Worklist.empty()) {
    const Instruction &I = *Worklist.back();
    Worklist.pop_back();

    // maintain uniformity of overrides
    if (isAlwaysUniform(I))
      continue;

    if (isDivergent(I))
      continue;

    // propagate divergence caused by terminator
    if (I.isTerminator()) {
      if (updateTerminator(I)) {
        // propagate control divergence to affected instructions
        propagateBranchDivergence(I);
        continue;
      }
    }

    // update divergence of I due to divergent operands
    bool DivergentUpd = false;
    const auto *Phi = dyn_cast<const PHINode>(&I);
    if (Phi) {
      DivergentUpd = updatePHINode(*Phi);
    } else {
      DivergentUpd = updateNormalInstruction(I);
    }

    // propagate value divergence to users
    if (DivergentUpd) {
      markDivergent(I);
      pushUsers(I);
    }
  }
}

bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const {
  return UniformOverrides.find(&V) != UniformOverrides.end();
}

bool DivergenceAnalysis::isDivergent(const Value &V) const {
  return DivergentValues.find(&V) != DivergentValues.end();
}

void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
  if (DivergentValues.empty())
    return;
  // iterate instructions using instructions() to ensure a deterministic order
  for (auto &I : instructions(F)) {
    if (isDivergent(I))
      OS << "DIVERGENT:" << I << '\n';
  }
}

// class GPUDivergenceAnalysis
GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F,
                                             const DominatorTree &DT,
                                             const PostDominatorTree &PDT,
                                             const LoopInfo &LI,
                                             const TargetTransformInfo &TTI)
    : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) {
  // seed divergence from target-specific sources (e.g., thread ID intrinsics)
  // and record values the target guarantees to be uniform
  for (auto &I : instructions(F)) {
    if (TTI.isSourceOfDivergence(&I)) {
      DA.markDivergent(I);
    } else if (TTI.isAlwaysUniform(&I)) {
      DA.addUniformOverride(I);
    }
  }
  for (auto &Arg : F.args()) {
    if (TTI.isSourceOfDivergence(&Arg)) {
      DA.markDivergent(Arg);
    }
  }

  DA.compute();
}

bool GPUDivergenceAnalysis::isDivergent(const Value &Val) const {
  return DA.isDivergent(Val);
}

void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *Mod) const {
  OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
  DA.print(OS, Mod);
  OS << "}\n";
}
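// Example usage under the new pass manager (a sketch; `runGPUDivergenceTest`
// and its call sites are hypothetical and not part of this file):
//
//   void runGPUDivergenceTest(Function &F, FunctionAnalysisManager &FAM) {
//     const auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
//     const auto &PDT = FAM.getResult<PostDominatorTreeAnalysis>(F);
//     const auto &LI = FAM.getResult<LoopAnalysis>(F);
//     const auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
//     GPUDivergenceAnalysis GDA(F, DT, PDT, LI, TTI);
//     GDA.print(errs(), F.getParent()); // list all divergent values
//   }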