//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a general divergence analysis for loop vectorization
// and GPU programs. It determines which branches and values in a loop or GPU
// program are divergent. It can help branch optimizations such as jump
// threading and loop unswitching to make better decisions.
//
// GPU programs typically use the SIMD execution model, where multiple threads
// in the same execution group have to execute in lock-step. Therefore, if the
// code contains divergent branches (i.e., threads in a group do not agree on
// which path of the branch to take), the group of threads has to execute all
// the paths from that branch with different subsets of threads enabled until
// they re-converge.
//
// Due to this execution model, some optimizations such as jump
// threading and loop unswitching can interfere with thread re-convergence.
// Therefore, an analysis that computes which branches in a GPU program are
// divergent can help the compiler to selectively run these optimizations.
//
// This implementation is derived from the Vectorization Analysis of the
// Region Vectorizer (RV). That implementation in turn is based on the approach
// described in
//
//   Improving Performance of OpenCL on CPUs
//   Ralf Karrenberg and Sebastian Hack
//   CC '12
//
// This DivergenceAnalysis implementation is generic in the sense that it does
// not itself identify original sources of divergence.
// Instead, specialized adapter classes, LoopDivergenceAnalysis for loops and
// GPUDivergenceAnalysis for GPU programs, identify the sources of divergence
// (e.g., special variables that hold the thread ID or the loop iteration
// variable).
//
// The generic implementation propagates divergence to variables that are data
// or sync dependent on a source of divergence.
//
// While data dependency is a well-known concept, the notion of sync dependency
// is worth more explanation. Sync dependence characterizes the control flow
// aspect of the propagation of branch divergence. For example,
//
//   %cond = icmp slt i32 %tid, 10
//   br i1 %cond, label %then, label %else
// then:
//   br label %merge
// else:
//   br label %merge
// merge:
//   %a = phi i32 [ 0, %then ], [ 1, %else ]
//
// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
// because %tid is not on its use-def chains, %a is sync dependent on %tid
// because the branch "br i1 %cond" depends on %tid and affects which value %a
// is assigned to.
//
// The sync dependence detection (which branch induces divergence in which join
// points) is implemented in the SyncDependenceAnalysis.
//
// The current DivergenceAnalysis implementation has the following limitations:
// 1. intra-procedural. It conservatively considers the arguments of a
//    non-kernel-entry function and the return value of a function call as
//    divergent.
// 2. memory as black box. It conservatively considers values loaded from
//    generic or local addresses as divergent. This can be improved by
//    leveraging pointer analysis and/or by modelling non-escaping memory
//    objects in SSA as done in RV.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "divergence-analysis"

// class DivergenceAnalysis
DivergenceAnalysis::DivergenceAnalysis(
    const Function &F, const Loop *RegionLoop, const DominatorTree &DT,
    const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
    : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA),
      IsLCSSAForm(IsLCSSAForm) {}

void DivergenceAnalysis::markDivergent(const Value &DivVal) {
  assert(isa<Instruction>(DivVal) || isa<Argument>(DivVal));
  assert(!isAlwaysUniform(DivVal) && "cannot be divergent");
  DivergentValues.insert(&DivVal);
}

void DivergenceAnalysis::addUniformOverride(const Value &UniVal) {
  UniformOverrides.insert(&UniVal);
}
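
// Returns true if the terminator Term has become divergent, i.e., it is a
// conditional branch or switch whose condition is divergent. Invoke
// terminators are ignored; abnormal execution through the landingpad is not
// modeled.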
bool DivergenceAnalysis::updateTerminator(const Instruction &Term) const {
  if (Term.getNumSuccessors() <= 1)
    return false;
  if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) {
    assert(BranchTerm->isConditional());
    return isDivergent(*BranchTerm->getCondition());
  }
  if (auto *SwitchTerm = dyn_cast<SwitchInst>(&Term)) {
    return isDivergent(*SwitchTerm->getCondition());
  }
  if (isa<InvokeInst>(Term)) {
    return false; // ignore abnormal executions through landingpad
  }

  llvm_unreachable("unexpected terminator");
}

bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const {
  // TODO function calls with side effects, etc
  for (const auto &Op : I.operands()) {
    if (isDivergent(*Op))
      return true;
  }
  return false;
}

bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock,
                                             const Value &Val) const {
  const auto *Inst = dyn_cast<const Instruction>(&Val);
  if (!Inst)
    return false;
  // check whether any divergent loop carrying Val terminates before control
  // proceeds to ObservingBlock
  for (const auto *Loop = LI.getLoopFor(Inst->getParent());
       Loop != RegionLoop && !Loop->contains(&ObservingBlock);
       Loop = Loop->getParentLoop()) {
    if (DivergentLoops.find(Loop) != DivergentLoops.end())
      return true;
  }

  return false;
}
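
// Returns true if the PHI node Phi has become divergent: either its parent
// block is a divergent join point of disjoint paths, or one of its incoming
// values is divergent or temporally divergent as observed from Phi's block.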
bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const {
  // joining divergent disjoint paths in Phi's parent block
  if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) {
    return true;
  }

  // An incoming value could be divergent by itself.
  // Otherwise, an incoming value could be uniform within the loop
  // that carries its definition but it may appear divergent
  // from outside the loop. This happens when divergent loop exits
  // drop definitions of that uniform value in different iterations.
  //
  //   for (int i = 0; i < n; ++i) {    // 'i' is uniform inside the loop
  //     if (i % thread_id == 0) break; // divergent loop exit
  //   }
  //   int divI = i;                    // divI is divergent
  for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) {
    const auto *InVal = Phi.getIncomingValue(i);
    if (isDivergent(*InVal) ||
        isTemporalDivergent(*Phi.getParent(), *InVal)) {
      return true;
    }
  }
  return false;
}

bool DivergenceAnalysis::inRegion(const Instruction &I) const {
  return I.getParent() && inRegion(*I.getParent());
}

bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const {
  return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB);
}

// marks all users of loop-carried values of the loop headed by LoopHeader as
// divergent
void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
  auto *DivLoop = LI.getLoopFor(&LoopHeader);
  assert(DivLoop && "loopHeader is not actually part of a loop");

  SmallVector<BasicBlock *, 8> TaintStack;
  DivLoop->getExitBlocks(TaintStack);

  // Potential users of loop-carried values could be anywhere in the
  // dominance region of DivLoop (including its fringes for phi nodes)
  DenseSet<const BasicBlock *> Visited;
  for (auto *Block : TaintStack) {
    Visited.insert(Block);
  }
  Visited.insert(&LoopHeader);

  while (!TaintStack.empty()) {
    auto *UserBlock = TaintStack.back();
    TaintStack.pop_back();

    // don't spread divergence beyond the region
    if (!inRegion(*UserBlock))
      continue;

    assert(!DivLoop->contains(UserBlock) &&
           "irreducible control flow detected");

    // phi nodes at the fringes of the dominance region
    if (!DT.dominates(&LoopHeader, UserBlock)) {
      // all PHI nodes of UserBlock become divergent
      for (auto &Phi : UserBlock->phis()) {
        Worklist.push_back(&Phi);
      }
      continue;
    }

    // taint outside users of values carried by DivLoop
    for (auto &I : *UserBlock) {
      if (isAlwaysUniform(I))
        continue;
      if (isDivergent(I))
        continue;

      for (auto &Op : I.operands()) {
        auto *OpInst = dyn_cast<Instruction>(&Op);
        if (!OpInst)
          continue;
        if (DivLoop->contains(OpInst->getParent())) {
          markDivergent(I);
          pushUsers(I);
          break;
        }
      }
    }

    // visit all blocks in the dominance region
    for (auto *SuccBlock : successors(UserBlock)) {
      if (!Visited.insert(SuccBlock).second) {
        continue;
      }
      TaintStack.push_back(SuccBlock);
    }
  }
}

void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) {
  for (const auto &Phi : Block.phis()) {
    if (isDivergent(Phi))
      continue;
    Worklist.push_back(&Phi);
  }
}

void DivergenceAnalysis::pushUsers(const Value &V) {
  for (const auto *User : V.users()) {
    const auto *UserInst = dyn_cast<const Instruction>(User);
    if (!UserInst)
      continue;

    if (isDivergent(*UserInst))
      continue;

    // only compute divergence inside the region
    if (!inRegion(*UserInst))
      continue;
    Worklist.push_back(UserInst);
  }
}
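
// Propagates disjoint-paths divergence into JoinBlock by pushing its PHI
// nodes to the worklist. Returns true if JoinBlock lies outside BranchLoop,
// i.e., if JoinBlock is a divergent loop exit of BranchLoop.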
bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
                                                 const Loop *BranchLoop) {
  LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");

  // ignore divergence outside the region
  if (!inRegion(JoinBlock)) {
    return false;
  }

  // push non-divergent phi nodes in JoinBlock to the worklist
  pushPHINodes(JoinBlock);

  // JoinBlock is a divergent loop exit
  if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
    return true;
  }

  // disjoint-paths divergent at JoinBlock
  markBlockJoinDivergent(JoinBlock);
  return false;
}

void DivergenceAnalysis::propagateBranchDivergence(const Instruction &Term) {
  LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");

  markDivergent(Term);

  // Don't propagate divergence from unreachable blocks.
  if (!DT.isReachableFromEntry(Term.getParent()))
    return;

  const auto *BranchLoop = LI.getLoopFor(Term.getParent());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable by disjoint paths from Term within the
  // loop; this also covers loop exits that become divergent due to Term.
  for (const auto *JoinBlock : SDA.join_blocks(Term)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // BranchLoop is a divergent loop due to the divergent branch Term
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}

void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) {
  LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n");

  // don't propagate beyond the region
  if (!inRegion(*ExitingLoop.getHeader()))
    return;

  const auto *BranchLoop = ExitingLoop.getParentLoop();

  // Uses of loop-carried values could occur anywhere
  // within the dominance region of the definition. All loop-carried
  // definitions are dominated by the loop header (reducible control).
  // Thus all users have to be in the dominance region of the loop header,
  // except PHI nodes that can also live at the fringes of the dom region
  // (incoming defining value).
  if (!IsLCSSAForm)
    taintLoopLiveOuts(*ExitingLoop.getHeader());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable by disjoint paths from the exits of
  // ExitingLoop; this also covers loop exits (of BranchLoop) that in turn
  // become divergent.
  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // BranchLoop is divergent due to the divergent loop exit in ExitingLoop
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}
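
// Fixed-point worklist propagation: seeded with the users of all values that
// were marked divergent up front, each instruction taken from the worklist is
// re-evaluated; if it has become divergent, its users (and, for divergent
// terminators, the sync-dependent join points) are pushed in turn. This
// terminates because a value is only ever marked divergent once.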
void DivergenceAnalysis::compute() {
  for (auto *DivVal : DivergentValues) {
    pushUsers(*DivVal);
  }

  // propagate divergence
  while (!Worklist.empty()) {
    const Instruction &I = *Worklist.back();
    Worklist.pop_back();

    // maintain uniformity of overrides
    if (isAlwaysUniform(I))
      continue;

    bool WasDivergent = isDivergent(I);
    if (WasDivergent)
      continue;

    // propagate divergence caused by terminator
    if (I.isTerminator()) {
      if (updateTerminator(I)) {
        // propagate control divergence to affected instructions
        propagateBranchDivergence(I);
        continue;
      }
    }

    // update divergence of I due to divergent operands
    bool DivergentUpd = false;
    const auto *Phi = dyn_cast<const PHINode>(&I);
    if (Phi) {
      DivergentUpd = updatePHINode(*Phi);
    } else {
      DivergentUpd = updateNormalInstruction(I);
    }

    // propagate value divergence to users
    if (DivergentUpd) {
      markDivergent(I);
      pushUsers(I);
    }
  }
}

bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const {
  return UniformOverrides.find(&V) != UniformOverrides.end();
}

bool DivergenceAnalysis::isDivergent(const Value &V) const {
  return DivergentValues.find(&V) != DivergentValues.end();
}

bool DivergenceAnalysis::isDivergentUse(const Use &U) const {
  Value &V = *U.get();
  Instruction &I = *cast<Instruction>(U.getUser());
  return isDivergent(V) || isTemporalDivergent(*I.getParent(), V);
}

void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
  if (DivergentValues.empty())
    return;
  // iterate instructions using instructions() to ensure a deterministic order.
  for (auto &I : instructions(F)) {
    if (isDivergent(I))
      OS << "DIVERGENT:" << I << '\n';
  }
}

// class GPUDivergenceAnalysis
GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F,
                                             const DominatorTree &DT,
                                             const PostDominatorTree &PDT,
                                             const LoopInfo &LI,
                                             const TargetTransformInfo &TTI)
    : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) {
  for (auto &I : instructions(F)) {
    if (TTI.isSourceOfDivergence(&I)) {
      DA.markDivergent(I);
    } else if (TTI.isAlwaysUniform(&I)) {
      DA.addUniformOverride(I);
    }
  }
  for (auto &Arg : F.args()) {
    if (TTI.isSourceOfDivergence(&Arg)) {
      DA.markDivergent(Arg);
    }
  }

  DA.compute();
}

bool GPUDivergenceAnalysis::isDivergent(const Value &val) const {
  return DA.isDivergent(val);
}

bool GPUDivergenceAnalysis::isDivergentUse(const Use &use) const {
  return DA.isDivergentUse(use);
}

void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
  OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";
  DA.print(OS, mod);
  OS << "}\n";
}
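
// Usage sketch (illustrative only, not part of the analysis itself): a
// transform pass that already has DT, PDT, LI, and TTI for a kernel function
// F could construct the analysis and query it like so:
//
//   GPUDivergenceAnalysis GDA(F, DT, PDT, LI, TTI);
//   for (auto &I : instructions(F))
//     if (GDA.isDivergent(I))
//       ; // e.g., refrain from jump threading across this instruction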