1 //===------ PPCLoopPreIncPrep.cpp - Loop Pre-Inc. AM Prep. Pass -----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a pass to prepare loops for pre-increment addressing 11 // modes. Additional PHIs are created for loop induction variables used by 12 // load/store instructions so that the pre-increment forms can be used. 13 // Generically, this means transforming loops like this: 14 // for (int i = 0; i < n; ++i) 15 // array[i] = c; 16 // to look like this: 17 // T *p = array[-1]; 18 // for (int i = 0; i < n; ++i) 19 // *++p = c; 20 //===----------------------------------------------------------------------===// 21 22 #define DEBUG_TYPE "ppc-loop-preinc-prep" 23 24 #include "PPC.h" 25 #include "PPCSubtarget.h" 26 #include "PPCTargetMachine.h" 27 #include "llvm/ADT/DepthFirstIterator.h" 28 #include "llvm/ADT/SmallPtrSet.h" 29 #include "llvm/ADT/SmallSet.h" 30 #include "llvm/ADT/SmallVector.h" 31 #include "llvm/Analysis/LoopInfo.h" 32 #include "llvm/Analysis/ScalarEvolution.h" 33 #include "llvm/Analysis/ScalarEvolutionExpander.h" 34 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 35 #include "llvm/IR/BasicBlock.h" 36 #include "llvm/IR/CFG.h" 37 #include "llvm/IR/Dominators.h" 38 #include "llvm/IR/Instruction.h" 39 #include "llvm/IR/Instructions.h" 40 #include "llvm/IR/IntrinsicInst.h" 41 #include "llvm/IR/Module.h" 42 #include "llvm/IR/Type.h" 43 #include "llvm/IR/Value.h" 44 #include "llvm/Pass.h" 45 #include "llvm/Support/Casting.h" 46 #include "llvm/Support/CommandLine.h" 47 #include "llvm/Support/Debug.h" 48 #include "llvm/Transforms/Scalar.h" 49 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 50 #include "llvm/Transforms/Utils/Local.h" 51 #include "llvm/Transforms/Utils/LoopUtils.h" 52 #include <cassert> 53 #include <iterator> 54 #include <utility> 55 56 using namespace llvm; 57 58 // By default, we limit this to creating 16 PHIs (which is a little over half 59 // of the allocatable register set). 60 static cl::opt<unsigned> MaxVars("ppc-preinc-prep-max-vars", 61 cl::Hidden, cl::init(16), 62 cl::desc("Potential PHI threshold for PPC preinc loop prep")); 63 64 namespace llvm { 65 66 void initializePPCLoopPreIncPrepPass(PassRegistry&); 67 68 } // end namespace llvm 69 70 namespace { 71 72 class PPCLoopPreIncPrep : public FunctionPass { 73 public: 74 static char ID; // Pass ID, replacement for typeid 75 76 PPCLoopPreIncPrep() : FunctionPass(ID) { 77 initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); 78 } 79 80 PPCLoopPreIncPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { 81 initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); 82 } 83 84 void getAnalysisUsage(AnalysisUsage &AU) const override { 85 AU.addPreserved<DominatorTreeWrapperPass>(); 86 AU.addRequired<LoopInfoWrapperPass>(); 87 AU.addPreserved<LoopInfoWrapperPass>(); 88 AU.addRequired<ScalarEvolutionWrapperPass>(); 89 } 90 91 bool runOnFunction(Function &F) override; 92 93 bool runOnLoop(Loop *L); 94 void simplifyLoopLatch(Loop *L); 95 bool rotateLoop(Loop *L); 96 97 private: 98 PPCTargetMachine *TM = nullptr; 99 DominatorTree *DT; 100 LoopInfo *LI; 101 ScalarEvolution *SE; 102 bool PreserveLCSSA; 103 }; 104 105 } // end anonymous namespace 106 107 char PPCLoopPreIncPrep::ID = 0; 108 static const char *name = "Prepare loop for pre-inc. addressing modes"; 109 INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) 110 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 111 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 112 INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) 113 114 FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { 115 return new PPCLoopPreIncPrep(TM); 116 } 117 118 namespace { 119 120 struct BucketElement { 121 BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} 122 BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} 123 124 const SCEVConstant *Offset; 125 Instruction *Instr; 126 }; 127 128 struct Bucket { 129 Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), 130 Elements(1, BucketElement(I)) {} 131 132 const SCEV *BaseSCEV; 133 SmallVector<BucketElement, 16> Elements; 134 }; 135 136 } // end anonymous namespace 137 138 static bool IsPtrInBounds(Value *BasePtr) { 139 Value *StrippedBasePtr = BasePtr; 140 while (BitCastInst *BC = dyn_cast<BitCastInst>(StrippedBasePtr)) 141 StrippedBasePtr = BC->getOperand(0); 142 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(StrippedBasePtr)) 143 return GEP->isInBounds(); 144 145 return false; 146 } 147 148 static Value *GetPointerOperand(Value *MemI) { 149 if (LoadInst *LMemI = dyn_cast<LoadInst>(MemI)) { 150 return LMemI->getPointerOperand(); 151 } else if (StoreInst *SMemI = dyn_cast<StoreInst>(MemI)) { 152 return SMemI->getPointerOperand(); 153 } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) { 154 if (IMemI->getIntrinsicID() == Intrinsic::prefetch) 155 return IMemI->getArgOperand(0); 156 } 157 158 return nullptr; 159 } 160 161 bool PPCLoopPreIncPrep::runOnFunction(Function &F) { 162 if (skipFunction(F)) 163 return false; 164 165 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 166 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 167 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 168 DT = DTWP ? &DTWP->getDomTree() : nullptr; 169 PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); 170 171 bool MadeChange = false; 172 173 for (auto I = LI->begin(), IE = LI->end(); I != IE; ++I) 174 for (auto L = df_begin(*I), LE = df_end(*I); L != LE; ++L) 175 MadeChange |= runOnLoop(*L); 176 177 return MadeChange; 178 } 179 180 bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { 181 bool MadeChange = false; 182 183 // Only prep. the inner-most loop 184 if (!L->empty()) 185 return MadeChange; 186 187 DEBUG(dbgs() << "PIP: Examining: " << *L << "\n"); 188 189 BasicBlock *Header = L->getHeader(); 190 191 const PPCSubtarget *ST = 192 TM ? TM->getSubtargetImpl(*Header->getParent()) : nullptr; 193 194 unsigned HeaderLoopPredCount = 195 std::distance(pred_begin(Header), pred_end(Header)); 196 197 // Collect buckets of comparable addresses used by loads and stores. 198 SmallVector<Bucket, 16> Buckets; 199 for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); 200 I != IE; ++I) { 201 for (BasicBlock::iterator J = (*I)->begin(), JE = (*I)->end(); 202 J != JE; ++J) { 203 Value *PtrValue; 204 Instruction *MemI; 205 206 if (LoadInst *LMemI = dyn_cast<LoadInst>(J)) { 207 MemI = LMemI; 208 PtrValue = LMemI->getPointerOperand(); 209 } else if (StoreInst *SMemI = dyn_cast<StoreInst>(J)) { 210 MemI = SMemI; 211 PtrValue = SMemI->getPointerOperand(); 212 } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(J)) { 213 if (IMemI->getIntrinsicID() == Intrinsic::prefetch) { 214 MemI = IMemI; 215 PtrValue = IMemI->getArgOperand(0); 216 } else continue; 217 } else continue; 218 219 unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); 220 if (PtrAddrSpace) 221 continue; 222 223 // There are no update forms for Altivec vector load/stores. 224 if (ST && ST->hasAltivec() && 225 PtrValue->getType()->getPointerElementType()->isVectorTy()) 226 continue; 227 228 if (L->isLoopInvariant(PtrValue)) 229 continue; 230 231 const SCEV *LSCEV = SE->getSCEVAtScope(PtrValue, L); 232 if (const SCEVAddRecExpr *LARSCEV = dyn_cast<SCEVAddRecExpr>(LSCEV)) { 233 if (LARSCEV->getLoop() != L) 234 continue; 235 } else { 236 continue; 237 } 238 239 bool FoundBucket = false; 240 for (auto &B : Buckets) { 241 const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); 242 if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) { 243 B.Elements.push_back(BucketElement(CDiff, MemI)); 244 FoundBucket = true; 245 break; 246 } 247 } 248 249 if (!FoundBucket) { 250 if (Buckets.size() == MaxVars) 251 return MadeChange; 252 Buckets.push_back(Bucket(LSCEV, MemI)); 253 } 254 } 255 } 256 257 if (Buckets.empty()) 258 return MadeChange; 259 260 BasicBlock *LoopPredecessor = L->getLoopPredecessor(); 261 // If there is no loop predecessor, or the loop predecessor's terminator 262 // returns a value (which might contribute to determining the loop's 263 // iteration space), insert a new preheader for the loop. 264 if (!LoopPredecessor || 265 !LoopPredecessor->getTerminator()->getType()->isVoidTy()) { 266 LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); 267 if (LoopPredecessor) 268 MadeChange = true; 269 } 270 if (!LoopPredecessor) 271 return MadeChange; 272 273 DEBUG(dbgs() << "PIP: Found " << Buckets.size() << " buckets\n"); 274 275 SmallSet<BasicBlock *, 16> BBChanged; 276 for (unsigned i = 0, e = Buckets.size(); i != e; ++i) { 277 // The base address of each bucket is transformed into a phi and the others 278 // are rewritten as offsets of that variable. 279 280 // We have a choice now of which instruction's memory operand we use as the 281 // base for the generated PHI. Always picking the first instruction in each 282 // bucket does not work well, specifically because that instruction might 283 // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, 284 // the choice is somewhat arbitrary, because the backend will happily 285 // generate direct offsets from both the pre-incremented and 286 // post-incremented pointer values. Thus, we'll pick the first non-prefetch 287 // instruction in each bucket, and adjust the recurrence and other offsets 288 // accordingly. 289 for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) { 290 if (auto *II = dyn_cast<IntrinsicInst>(Buckets[i].Elements[j].Instr)) 291 if (II->getIntrinsicID() == Intrinsic::prefetch) 292 continue; 293 294 // If we'd otherwise pick the first element anyway, there's nothing to do. 295 if (j == 0) 296 break; 297 298 // If our chosen element has no offset from the base pointer, there's 299 // nothing to do. 300 if (!Buckets[i].Elements[j].Offset || 301 Buckets[i].Elements[j].Offset->isZero()) 302 break; 303 304 const SCEV *Offset = Buckets[i].Elements[j].Offset; 305 Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset); 306 for (auto &E : Buckets[i].Elements) { 307 if (E.Offset) 308 E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset)); 309 else 310 E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset)); 311 } 312 313 std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]); 314 break; 315 } 316 317 const SCEVAddRecExpr *BasePtrSCEV = 318 cast<SCEVAddRecExpr>(Buckets[i].BaseSCEV); 319 if (!BasePtrSCEV->isAffine()) 320 continue; 321 322 DEBUG(dbgs() << "PIP: Transforming: " << *BasePtrSCEV << "\n"); 323 assert(BasePtrSCEV->getLoop() == L && 324 "AddRec for the wrong loop?"); 325 326 // The instruction corresponding to the Bucket's BaseSCEV must be the first 327 // in the vector of elements. 328 Instruction *MemI = Buckets[i].Elements.begin()->Instr; 329 Value *BasePtr = GetPointerOperand(MemI); 330 assert(BasePtr && "No pointer operand"); 331 332 Type *I8Ty = Type::getInt8Ty(MemI->getParent()->getContext()); 333 Type *I8PtrTy = Type::getInt8PtrTy(MemI->getParent()->getContext(), 334 BasePtr->getType()->getPointerAddressSpace()); 335 336 const SCEV *BasePtrStartSCEV = BasePtrSCEV->getStart(); 337 if (!SE->isLoopInvariant(BasePtrStartSCEV, L)) 338 continue; 339 340 const SCEVConstant *BasePtrIncSCEV = 341 dyn_cast<SCEVConstant>(BasePtrSCEV->getStepRecurrence(*SE)); 342 if (!BasePtrIncSCEV) 343 continue; 344 BasePtrStartSCEV = SE->getMinusSCEV(BasePtrStartSCEV, BasePtrIncSCEV); 345 if (!isSafeToExpand(BasePtrStartSCEV, *SE)) 346 continue; 347 348 DEBUG(dbgs() << "PIP: New start is: " << *BasePtrStartSCEV << "\n"); 349 350 PHINode *NewPHI = PHINode::Create(I8PtrTy, HeaderLoopPredCount, 351 MemI->hasName() ? MemI->getName() + ".phi" : "", 352 Header->getFirstNonPHI()); 353 354 SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(), "pistart"); 355 Value *BasePtrStart = SCEVE.expandCodeFor(BasePtrStartSCEV, I8PtrTy, 356 LoopPredecessor->getTerminator()); 357 358 // Note that LoopPredecessor might occur in the predecessor list multiple 359 // times, and we need to add it the right number of times. 360 for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); 361 PI != PE; ++PI) { 362 if (*PI != LoopPredecessor) 363 continue; 364 365 NewPHI->addIncoming(BasePtrStart, LoopPredecessor); 366 } 367 368 Instruction *InsPoint = &*Header->getFirstInsertionPt(); 369 GetElementPtrInst *PtrInc = GetElementPtrInst::Create( 370 I8Ty, NewPHI, BasePtrIncSCEV->getValue(), 371 MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint); 372 PtrInc->setIsInBounds(IsPtrInBounds(BasePtr)); 373 for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); 374 PI != PE; ++PI) { 375 if (*PI == LoopPredecessor) 376 continue; 377 378 NewPHI->addIncoming(PtrInc, *PI); 379 } 380 381 Instruction *NewBasePtr; 382 if (PtrInc->getType() != BasePtr->getType()) 383 NewBasePtr = new BitCastInst(PtrInc, BasePtr->getType(), 384 PtrInc->hasName() ? PtrInc->getName() + ".cast" : "", InsPoint); 385 else 386 NewBasePtr = PtrInc; 387 388 if (Instruction *IDel = dyn_cast<Instruction>(BasePtr)) 389 BBChanged.insert(IDel->getParent()); 390 BasePtr->replaceAllUsesWith(NewBasePtr); 391 RecursivelyDeleteTriviallyDeadInstructions(BasePtr); 392 393 // Keep track of the replacement pointer values we've inserted so that we 394 // don't generate more pointer values than necessary. 395 SmallPtrSet<Value *, 16> NewPtrs; 396 NewPtrs.insert( NewBasePtr); 397 398 for (auto I = std::next(Buckets[i].Elements.begin()), 399 IE = Buckets[i].Elements.end(); I != IE; ++I) { 400 Value *Ptr = GetPointerOperand(I->Instr); 401 assert(Ptr && "No pointer operand"); 402 if (NewPtrs.count(Ptr)) 403 continue; 404 405 Instruction *RealNewPtr; 406 if (!I->Offset || I->Offset->getValue()->isZero()) { 407 RealNewPtr = NewBasePtr; 408 } else { 409 Instruction *PtrIP = dyn_cast<Instruction>(Ptr); 410 if (PtrIP && isa<Instruction>(NewBasePtr) && 411 cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent()) 412 PtrIP = nullptr; 413 else if (isa<PHINode>(PtrIP)) 414 PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); 415 else if (!PtrIP) 416 PtrIP = I->Instr; 417 418 GetElementPtrInst *NewPtr = GetElementPtrInst::Create( 419 I8Ty, PtrInc, I->Offset->getValue(), 420 I->Instr->hasName() ? I->Instr->getName() + ".off" : "", PtrIP); 421 if (!PtrIP) 422 NewPtr->insertAfter(cast<Instruction>(PtrInc)); 423 NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); 424 RealNewPtr = NewPtr; 425 } 426 427 if (Instruction *IDel = dyn_cast<Instruction>(Ptr)) 428 BBChanged.insert(IDel->getParent()); 429 430 Instruction *ReplNewPtr; 431 if (Ptr->getType() != RealNewPtr->getType()) { 432 ReplNewPtr = new BitCastInst(RealNewPtr, Ptr->getType(), 433 Ptr->hasName() ? Ptr->getName() + ".cast" : ""); 434 ReplNewPtr->insertAfter(RealNewPtr); 435 } else 436 ReplNewPtr = RealNewPtr; 437 438 Ptr->replaceAllUsesWith(ReplNewPtr); 439 RecursivelyDeleteTriviallyDeadInstructions(Ptr); 440 441 NewPtrs.insert(RealNewPtr); 442 } 443 444 MadeChange = true; 445 } 446 447 for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); 448 I != IE; ++I) { 449 if (BBChanged.count(*I)) 450 DeleteDeadPHIs(*I); 451 } 452 453 return MadeChange; 454 } 455