//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation ------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The implementation of the loop memory dependence analysis that was
// originally developed for the loop vectorizer.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/VectorUtils.h"
using namespace llvm;

#define DEBUG_TYPE "loop-accesses"

static cl::opt<unsigned, true>
VectorizationFactor("force-vector-width", cl::Hidden,
                    cl::desc("Sets the SIMD width. Zero is autoselect."),
                    cl::location(VectorizerParams::VectorizationFactor));
unsigned VectorizerParams::VectorizationFactor = 0;

static cl::opt<unsigned, true>
VectorizationInterleave("force-vector-interleave", cl::Hidden,
                        cl::desc("Sets the vectorization interleave count. "
                                 "Zero is autoselect."),
                        cl::location(
                            VectorizerParams::VectorizationInterleave));
unsigned VectorizerParams::VectorizationInterleave = 0;

/// When performing memory disambiguation checks at runtime do not make more
/// than this number of comparisons.
const unsigned VectorizerParams::RuntimeMemoryCheckThreshold = 8;

/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;

bool VectorizerParams::isInterleaveForced() {
  return ::VectorizationInterleave.getNumOccurrences() > 0;
}

void LoopAccessReport::emitAnalysis(const LoopAccessReport &Message,
                                    const Function *TheFunction,
                                    const Loop *TheLoop,
                                    const char *PassName) {
  DebugLoc DL = TheLoop->getStartLoc();
  if (const Instruction *I = Message.getInstr())
    DL = I->getDebugLoc();
  emitOptimizationRemarkAnalysis(TheFunction->getContext(), PassName,
                                 *TheFunction, DL, Message.str());
}

Value *llvm::stripIntegerCast(Value *V) {
  if (CastInst *CI = dyn_cast<CastInst>(V))
    if (CI->getOperand(0)->getType()->isIntegerTy())
      return CI->getOperand(0);
  return V;
}

const SCEV *llvm::replaceSymbolicStrideSCEV(ScalarEvolution *SE,
                                            const ValueToValueMap &PtrToStride,
                                            Value *Ptr, Value *OrigPtr) {

  const SCEV *OrigSCEV = SE->getSCEV(Ptr);

  // If there is an entry in the map return the SCEV of the pointer with the
  // symbolic stride replaced by one.
  ValueToValueMap::const_iterator SI =
      PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
  if (SI != PtrToStride.end()) {
    Value *StrideVal = SI->second;

    // Strip casts.
    StrideVal = stripIntegerCast(StrideVal);

    // Replace the symbolic stride by one.
    Value *One = ConstantInt::get(StrideVal->getType(), 1);
    ValueToValueMap RewriteMap;
    RewriteMap[StrideVal] = One;

    const SCEV *ByOne =
        SCEVParameterRewriter::rewrite(OrigSCEV, *SE, RewriteMap, true);
    DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV << " by: " << *ByOne
                 << "\n");
    return ByOne;
  }

  // Otherwise, just return the SCEV of the original pointer.
  return SE->getSCEV(Ptr);
}

void LoopAccessInfo::RuntimePointerCheck::insert(
    ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr, unsigned DepSetId,
    unsigned ASId, const ValueToValueMap &Strides) {
  // Get the stride replaced scev.
  const SCEV *Sc = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
  assert(AR && "Invalid addrec expression");
  const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
  Pointers.push_back(Ptr);
  Starts.push_back(AR->getStart());
  Ends.push_back(ScEnd);
  IsWritePtr.push_back(WritePtr);
  DependencySetId.push_back(DepSetId);
  AliasSetId.push_back(ASId);
}

bool LoopAccessInfo::RuntimePointerCheck::needsChecking(unsigned I,
                                                        unsigned J) const {
  // No need to check if two read-only pointers intersect.
  if (!IsWritePtr[I] && !IsWritePtr[J])
    return false;

  // Only need to check pointers between two different dependency sets.
  if (DependencySetId[I] == DependencySetId[J])
    return false;

  // Only need to check pointers in the same alias set.
  if (AliasSetId[I] != AliasSetId[J])
    return false;

  return true;
}

void LoopAccessInfo::RuntimePointerCheck::print(raw_ostream &OS,
                                                unsigned Depth) const {
  unsigned NumPointers = Pointers.size();
  if (NumPointers == 0)
    return;

  OS.indent(Depth) << "Run-time memory checks:\n";
  unsigned N = 0;
  for (unsigned I = 0; I < NumPointers; ++I)
    for (unsigned J = I + 1; J < NumPointers; ++J)
      if (needsChecking(I, J)) {
        OS.indent(Depth) << N++ << ":\n";
        OS.indent(Depth + 2) << *Pointers[I] << "\n";
        OS.indent(Depth + 2) << *Pointers[J] << "\n";
      }
}

namespace {
/// \brief Analyses memory accesses in a loop.
///
/// Checks whether run time pointer checks are needed and builds sets for data
/// dependence checking.
class AccessAnalysis {
public:
  /// \brief Read or write access location.
  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;

  /// \brief Set of potential dependent memory accesses.
  typedef EquivalenceClasses<MemAccessInfo> DepCandidates;

  AccessAnalysis(const DataLayout *Dl, AliasAnalysis *AA, DepCandidates &DA) :
    DL(Dl), AST(*AA), DepCands(DA), IsRTCheckNeeded(false) {}

  /// \brief Register a load and whether it is only read from.
  void addLoad(AliasAnalysis::Location &Loc, bool IsReadOnly) {
    Value *Ptr = const_cast<Value*>(Loc.Ptr);
    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
    Accesses.insert(MemAccessInfo(Ptr, false));
    if (IsReadOnly)
      ReadOnlyPtr.insert(Ptr);
  }

  /// \brief Register a store.
  void addStore(AliasAnalysis::Location &Loc) {
    Value *Ptr = const_cast<Value*>(Loc.Ptr);
    AST.add(Ptr, AliasAnalysis::UnknownSize, Loc.AATags);
    Accesses.insert(MemAccessInfo(Ptr, true));
  }

  /// \brief Check whether we can check the pointers at runtime for
  /// non-intersection.
  bool canCheckPtrAtRT(LoopAccessInfo::RuntimePointerCheck &RtCheck,
                       unsigned &NumComparisons, ScalarEvolution *SE,
                       Loop *TheLoop, const ValueToValueMap &Strides,
                       bool ShouldCheckStride = false);
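
  // For example, given a loop over i with two incoming pointers A and B:
  //
  //   for (i = 0; i < n; i++)
  //     A[i] = B[i] * 2;
  //
  // we cannot prove statically that A and B do not overlap, but we can test
  // at run time that the ranges [A, A+n) and [B, B+n) are disjoint and enter
  // the vectorized loop only if they are.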

  /// \brief Goes over all memory accesses, checks whether a RT check is
  /// needed and builds sets of dependent accesses.
  void buildDependenceSets() {
    processMemAccesses();
  }

  bool isRTCheckNeeded() { return IsRTCheckNeeded; }

  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
  void resetDepChecks() { CheckDeps.clear(); }

  MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }

private:
  typedef SetVector<MemAccessInfo> PtrAccessSet;

  /// \brief Go over all memory accesses and check whether runtime pointer
  /// checks are needed and build sets of dependency check candidates.
  void processMemAccesses();

  /// Set of all accesses.
  PtrAccessSet Accesses;

  /// Set of accesses that need a further dependence check.
  MemAccessInfoSet CheckDeps;

  /// Set of pointers that are read only.
  SmallPtrSet<Value*, 16> ReadOnlyPtr;

  const DataLayout *DL;

  /// An alias set tracker to partition the access set by underlying object and
  /// intrinsic property (such as TBAA metadata).
  AliasSetTracker AST;

  /// Sets of potentially dependent accesses - members of one set share an
  /// underlying pointer. The set "CheckDeps" identifies which sets really need
  /// a dependence check.
  DepCandidates &DepCands;

  bool IsRTCheckNeeded;
};

} // end anonymous namespace

/// \brief Check whether a pointer can participate in a runtime bounds check.
static bool hasComputableBounds(ScalarEvolution *SE,
                                const ValueToValueMap &Strides, Value *Ptr) {
  const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, Strides, Ptr);
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  if (!AR)
    return false;

  return AR->isAffine();
}

/// \brief Check the stride of the pointer and ensure that it does not wrap in
/// the address space.
static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
                        const Loop *Lp, const ValueToValueMap &StridesMap);

bool AccessAnalysis::canCheckPtrAtRT(
    LoopAccessInfo::RuntimePointerCheck &RtCheck, unsigned &NumComparisons,
    ScalarEvolution *SE, Loop *TheLoop, const ValueToValueMap &StridesMap,
    bool ShouldCheckStride) {
  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool CanDoRT = true;

  bool IsDepCheckNeeded = isDependencyCheckNeeded();
  NumComparisons = 0;

  // We assign a consecutive id to accesses from different alias sets.
  // Accesses between different groups don't need to be checked.
  unsigned ASId = 1;
  for (auto &AS : AST) {
    unsigned NumReadPtrChecks = 0;
    unsigned NumWritePtrChecks = 0;

    // We assign consecutive ids to accesses from different dependence sets.
    // Accesses within the same set don't need a runtime check.
    unsigned RunningDepId = 1;
    DenseMap<Value *, unsigned> DepSetId;

    for (auto A : AS) {
      Value *Ptr = A.getValue();
      bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
      MemAccessInfo Access(Ptr, IsWrite);

      if (IsWrite)
        ++NumWritePtrChecks;
      else
        ++NumReadPtrChecks;

      if (hasComputableBounds(SE, StridesMap, Ptr) &&
          // When we run after a failing dependency check we have to make sure
          // we don't have wrapping pointers.
          (!ShouldCheckStride ||
           isStridedPtr(SE, DL, Ptr, TheLoop, StridesMap) == 1)) {
        // The id of the dependence set.
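        // Pointers that may alias one another end up in the same dependence
        // set and get the same id; needsChecking() later compares only
        // pointers with different dependence-set ids within one alias set.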
        unsigned DepId;

        if (IsDepCheckNeeded) {
          Value *Leader = DepCands.getLeaderValue(Access).getPointer();
          unsigned &LeaderId = DepSetId[Leader];
          if (!LeaderId)
            LeaderId = RunningDepId++;
          DepId = LeaderId;
        } else
          // Each access has its own dependence set.
          DepId = RunningDepId++;

        RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap);

        DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
      } else {
        CanDoRT = false;
      }
    }

    if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
      NumComparisons += 0; // Only one dependence set.
    else {
      NumComparisons += (NumWritePtrChecks * (NumReadPtrChecks +
                                              NumWritePtrChecks - 1));
    }

    ++ASId;
  }

  // If the pointers that we would use for the bounds comparison have different
  // address spaces, assume the values aren't directly comparable, so we can't
  // use them for the runtime check. We also have to assume they could
  // overlap. In the future there should be metadata for whether address spaces
  // are disjoint.
  unsigned NumPointers = RtCheck.Pointers.size();
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i + 1; j < NumPointers; ++j) {
      // Only need to check pointers between two different dependency sets.
      if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
        continue;
      // Only need to check pointers in the same alias set.
      if (RtCheck.AliasSetId[i] != RtCheck.AliasSetId[j])
        continue;

      Value *PtrI = RtCheck.Pointers[i];
      Value *PtrJ = RtCheck.Pointers[j];

      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
      if (ASi != ASj) {
        DEBUG(dbgs() << "LAA: Runtime check would require comparison between"
                        " different address spaces\n");
        return false;
      }
    }
  }

  return CanDoRT;
}

void AccessAnalysis::processMemAccesses() {
  // We process the set twice: first we process read-write pointers, last we
  // process read-only pointers. This allows us to skip dependence tests for
  // read-only pointers.

  DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
  DEBUG(dbgs() << "  AST: "; AST.dump());
  DEBUG(dbgs() << "LAA:   Accesses:\n");
  DEBUG({
    for (auto A : Accesses)
      dbgs() << "\t" << *A.getPointer() << " ("
             << (A.getInt() ? "write"
                            : (ReadOnlyPtr.count(A.getPointer()) ? "read-only"
                                                                 : "read"))
             << ")\n";
  });

  // The AliasSetTracker has nicely partitioned our pointers by metadata
  // compatibility and potential for underlying-object overlap. As a result, we
  // only need to check for potential pointer dependencies within each alias
  // set.
  for (auto &AS : AST) {
    // Note that both the alias-set tracker and the alias sets themselves use
    // linked lists internally and so the iteration order here is deterministic
    // (matching the original instruction order within each set).

    bool SetHasWrite = false;

    // Map of pointers to last access encountered.
    typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
    UnderlyingObjToAccessMap ObjToLastAccess;

    // Set of accesses to check after all writes have been processed.
    PtrAccessSet DeferredAccesses;

    // Iterate over each alias set twice, once to process read/write pointers,
    // and then to process read-only pointers.
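    // Deferring the read-only pointers guarantees that every write in the
    // alias set has been seen (and SetHasWrite is final) by the time a
    // read-only pointer is tested for conflicts.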
    for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
      bool UseDeferred = SetIteration > 0;
      PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;

      for (auto AV : AS) {
        Value *Ptr = AV.getValue();

        // For a single memory access in AliasSetTracker, Accesses may contain
        // both read and write, and they both need to be handled for CheckDeps.
        for (auto AC : S) {
          if (AC.getPointer() != Ptr)
            continue;

          bool IsWrite = AC.getInt();

          // If we're using the deferred access set, then it contains only
          // reads.
          bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
          if (UseDeferred && !IsReadOnlyPtr)
            continue;
          // Otherwise, the pointer must be in the PtrAccessSet, either as a
          // read or a write.
          assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
                  S.count(MemAccessInfo(Ptr, false))) &&
                 "Alias-set pointer not in the access set?");

          MemAccessInfo Access(Ptr, IsWrite);
          DepCands.insert(Access);

          // Memorize read-only pointers for later processing and skip them in
          // the first round (they need to be checked after we have seen all
          // write pointers). Note: we also mark pointers that are not
          // consecutive as "read-only" pointers (so that we check
          // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
          if (!UseDeferred && IsReadOnlyPtr) {
            DeferredAccesses.insert(Access);
            continue;
          }

          // If this is a write - check other reads and writes for conflicts.
          // If this is a read only check other writes for conflicts (but only
          // if there is no other write to the ptr - this is an optimization to
          // catch "a[i] = a[i] + " without having to do a dependence check).
          if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
            CheckDeps.insert(Access);
            IsRTCheckNeeded = true;
          }

          if (IsWrite)
            SetHasWrite = true;

          // Create sets of pointers connected by a shared alias set and
          // underlying object.
          typedef SmallVector<Value *, 16> ValueVector;
          ValueVector TempObjects;
          GetUnderlyingObjects(Ptr, TempObjects, DL);
          for (Value *UnderlyingObj : TempObjects) {
            UnderlyingObjToAccessMap::iterator Prev =
                ObjToLastAccess.find(UnderlyingObj);
            if (Prev != ObjToLastAccess.end())
              DepCands.unionSets(Access, Prev->second);

            ObjToLastAccess[UnderlyingObj] = Access;
          }
        }
      }
    }
  }
}

namespace {
/// \brief Checks memory dependences among accesses to the same underlying
/// object to determine whether vectorization is legal or not (and at
/// which vectorization factor).
///
/// This class works under the assumption that we already checked that memory
/// locations with different underlying pointers are "must-not alias".
/// We use the ScalarEvolution framework to symbolically evaluate pairs of
/// access functions. Since we currently don't restructure the loop we can rely
/// on the program order of memory accesses to determine their safety.
/// At the moment we will only deem accesses as safe for:
///  * A negative constant distance assuming program order.
///
///      Safe: tmp = a[i + 1];      OR     a[i + 1] = x;
///            a[i] = tmp;                 y = a[i];
///
/// The latter case is safe because later checks guarantee that there can't
/// be a cycle through a phi node (that is, we check that "x" and "y" are not
/// the same variable: a header phi can only be an induction or a reduction, a
/// reduction can't have a memory sink, an induction can't have a memory
/// source). This is important and must not be violated (or we have to
/// resort to checking for cycles through memory).
///
///  * A positive constant distance assuming program order that is bigger
///    than the biggest memory access.
///
///      tmp = a[i]                 OR     b[i] = x
///      a[i+2] = tmp                      y = b[i+2];
///
///      Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
///
///  * Zero distances and all accesses have the same size.
///
class MemoryDepChecker {
public:
  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;

  MemoryDepChecker(ScalarEvolution *Se, const DataLayout *Dl, const Loop *L)
      : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
        ShouldRetryWithRuntimeCheck(false) {}

  /// \brief Register the location (instructions are given increasing numbers)
  /// of a write access.
  void addAccess(StoreInst *SI) {
    Value *Ptr = SI->getPointerOperand();
    Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
    InstMap.push_back(SI);
    ++AccessIdx;
  }

  /// \brief Register the location (instructions are given increasing numbers)
  /// of a read access.
  void addAccess(LoadInst *LI) {
    Value *Ptr = LI->getPointerOperand();
    Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
    InstMap.push_back(LI);
    ++AccessIdx;
  }

  /// \brief Check whether the dependencies between the accesses are safe.
  ///
  /// Only checks sets with elements in \p CheckDeps.
  bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
                   MemAccessInfoSet &CheckDeps, const ValueToValueMap &Strides);

  /// \brief The maximum number of bytes of a vector register we can vectorize
  /// the accesses safely with.
  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }

  /// \brief In some cases when the dependency check fails we can still
  /// vectorize the loop with a dynamic array access check.
  bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }

private:
  ScalarEvolution *SE;
  const DataLayout *DL;
  const Loop *InnermostLoop;

  /// \brief Maps access locations (ptr, read/write) to program order.
  DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;

  /// \brief Memory access instructions in program order.
  SmallVector<Instruction *, 16> InstMap;

  /// \brief The program order index to be used for the next instruction.
  unsigned AccessIdx;

  // We can access this many bytes in parallel safely.
  unsigned MaxSafeDepDistBytes;

  /// \brief If we see a non-constant dependence distance we can still try to
  /// vectorize this loop with runtime checks.
  bool ShouldRetryWithRuntimeCheck;

  /// \brief Check whether there is a plausible dependence between the two
  /// accesses.
  ///
  /// Access \p A must happen before \p B in program order. The two indices
  /// identify the index into the program order map.
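  /// (The indices are the positions recorded by addAccess as the memory
  /// instructions were registered.)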
  ///
  /// This function checks whether there is a plausible dependence (or the
  /// absence of such can't be proved) between the two accesses. If there is a
  /// plausible dependence but the dependence distance is bigger than one
  /// element access it records this distance in \p MaxSafeDepDistBytes (if
  /// this distance is smaller than any other distance encountered so far).
  /// Otherwise, this function returns true signaling a possible dependence.
  bool isDependent(const MemAccessInfo &A, unsigned AIdx,
                   const MemAccessInfo &B, unsigned BIdx,
                   const ValueToValueMap &Strides);

  /// \brief Check whether the data dependence could prevent store-load
  /// forwarding.
  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
};

} // end anonymous namespace

static bool isInBoundsGep(Value *Ptr) {
  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
    return GEP->isInBounds();
  return false;
}

/// \brief Check whether the access through \p Ptr has a constant stride.
static int isStridedPtr(ScalarEvolution *SE, const DataLayout *DL, Value *Ptr,
                        const Loop *Lp, const ValueToValueMap &StridesMap) {
  const Type *Ty = Ptr->getType();
  assert(Ty->isPointerTy() && "Unexpected non-ptr");

  // Make sure that the pointer does not point to aggregate types.
  const PointerType *PtrTy = cast<PointerType>(Ty);
  if (PtrTy->getElementType()->isAggregateType()) {
    DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type "
                 << *Ptr << "\n");
    return 0;
  }

  const SCEV *PtrScev = replaceSymbolicStrideSCEV(SE, StridesMap, Ptr);

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  if (!AR) {
    DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer "
                 << *Ptr << " SCEV: " << *PtrScev << "\n");
    return 0;
  }

  // The access function must stride over the innermost loop.
  if (Lp != AR->getLoop()) {
    DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " <<
          *Ptr << " SCEV: " << *PtrScev << "\n");
    return 0;
  }

  // The address calculation must not wrap. Otherwise, a dependence could be
  // inverted.
  // An inbounds getelementptr that is an AddRec with a unit stride
  // cannot wrap by definition. The unit stride requirement is checked later.
  // A getelementptr without an inbounds attribute and unit stride would have
  // to access the pointer value "0" which is undefined behavior in address
  // space 0, therefore we can also vectorize this case.
  bool IsInBoundsGEP = isInBoundsGep(Ptr);
  bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
  bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
  if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
    DEBUG(dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
                 << *Ptr << " SCEV: " << *PtrScev << "\n");
    return 0;
  }

  // Check that the step is constant.
  const SCEV *Step = AR->getStepRecurrence(*SE);

  // Calculate the pointer stride and check if it is consecutive.
  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  if (!C) {
    DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr <<
          " SCEV: " << *PtrScev << "\n");
    return 0;
  }

  int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
  const APInt &APStepVal = C->getValue()->getValue();

  // Huge step value - give up.
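  // The step is sign-extended into an int64_t below, so a step wider than
  // 64 bits cannot be represented.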
  if (APStepVal.getBitWidth() > 64)
    return 0;

  int64_t StepVal = APStepVal.getSExtValue();

  // Strided access.
  int64_t Stride = StepVal / Size;
  int64_t Rem = StepVal % Size;
  if (Rem)
    return 0;

  // If the SCEV could wrap but we have an inbounds gep with a unit stride we
  // know we can't "wrap around the address space". In case of address space
  // zero we know that this won't happen without triggering undefined behavior.
  if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) &&
      Stride != 1 && Stride != -1)
    return 0;

  return Stride;
}

bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
                                                    unsigned TypeByteSize) {
  // If loads occur at a distance that is not a multiple of a feasible vector
  // factor store-load forwarding does not take place.
  // Positive dependences might cause troubles because vectorizing them might
  // prevent store-load forwarding making vectorized code run a lot slower.
  //   a[i] = a[i-3] ^ a[i-8];
  //   The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
  //   hence on your typical architecture store-load forwarding does not take
  //   place. Vectorizing in such cases does not make sense.
  // Store-load forwarding distance.
  const unsigned NumCyclesForStoreLoadThroughMemory = 8 * TypeByteSize;
  // Maximum vector factor.
  unsigned MaxVFWithoutSLForwardIssues =
      VectorizerParams::MaxVectorWidth * TypeByteSize;
  if (MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
    MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;

  for (unsigned vf = 2 * TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
       vf *= 2) {
    if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
      MaxVFWithoutSLForwardIssues = (vf >>= 1);
      break;
    }
  }

  if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
    DEBUG(dbgs() << "LAA: Distance " << Distance <<
          " that could cause a store-load forwarding conflict\n");
    return true;
  }

  if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
      MaxVFWithoutSLForwardIssues !=
          VectorizerParams::MaxVectorWidth * TypeByteSize)
    MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
  return false;
}

bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
                                   const MemAccessInfo &B, unsigned BIdx,
                                   const ValueToValueMap &Strides) {
  assert(AIdx < BIdx && "Must pass arguments in program order");

  Value *APtr = A.getPointer();
  Value *BPtr = B.getPointer();
  bool AIsWrite = A.getInt();
  bool BIsWrite = B.getInt();

  // Two reads are independent.
  if (!AIsWrite && !BIsWrite)
    return false;

  // We cannot check pointers in different address spaces.
  if (APtr->getType()->getPointerAddressSpace() !=
      BPtr->getType()->getPointerAddressSpace())
    return true;

  const SCEV *AScev = replaceSymbolicStrideSCEV(SE, Strides, APtr);
  const SCEV *BScev = replaceSymbolicStrideSCEV(SE, Strides, BPtr);

  int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop, Strides);
  int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop, Strides);

  const SCEV *Src = AScev;
  const SCEV *Sink = BScev;

  // If the induction step is negative we have to invert source and sink of the
  // dependence.
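  // For example, in a loop counting down, for a[i] = a[i+1] each iteration
  // reads the value stored by the previous iteration: the store is the
  // dependence source even though it appears later in program order, so we
  // swap the operands below to restore the source-before-sink form.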
  if (StrideAPtr < 0) {
    std::swap(APtr, BPtr);
    std::swap(Src, Sink);
    std::swap(AIsWrite, BIsWrite);
    std::swap(AIdx, BIdx);
    std::swap(StrideAPtr, StrideBPtr);
  }

  const SCEV *Dist = SE->getMinusSCEV(Sink, Src);

  DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink
               << "(Induction step: " << StrideAPtr << ")\n");
  DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
               << *InstMap[BIdx] << ": " << *Dist << "\n");

  // Need consecutive accesses. We don't want to vectorize
  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
  // the address space.
  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
    DEBUG(dbgs() << "Non-consecutive pointer access\n");
    return true;
  }

  const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
  if (!C) {
    DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
    ShouldRetryWithRuntimeCheck = true;
    return true;
  }

  Type *ATy = APtr->getType()->getPointerElementType();
  Type *BTy = BPtr->getType()->getPointerElementType();
  unsigned TypeByteSize = DL->getTypeAllocSize(ATy);

  // Negative distances are not plausible dependencies.
  const APInt &Val = C->getValue()->getValue();
  if (Val.isNegative()) {
    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
    if (IsTrueDataDependence &&
        (couldPreventStoreLoadForward(Val.abs().getZExtValue(),
                                      TypeByteSize) ||
         ATy != BTy))
      return true;

    DEBUG(dbgs() << "LAA: Dependence is negative: NoDep\n");
    return false;
  }

  // Write to the same location with the same size.
  // Could be improved to assert type sizes are the same (i32 == float, etc).
  if (Val == 0) {
    if (ATy == BTy)
      return false;
    DEBUG(dbgs() << "LAA: Zero dependence difference but different types\n");
    return true;
  }

  assert(Val.isStrictlyPositive() && "Expect a positive value");

  // Positive distance bigger than max vectorization factor.
  if (ATy != BTy) {
    DEBUG(dbgs() <<
          "LAA: ReadWrite-Write positive dependency with different types\n");
    return false;
  }

  unsigned Distance = (unsigned) Val.getZExtValue();

  // Bail out early if passed-in parameters make vectorization not feasible.
  unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
                           VectorizerParams::VectorizationFactor : 1);
  unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
                           VectorizerParams::VectorizationInterleave : 1);

  // The distance must be bigger than the size needed for a vectorized version
  // of the operation and the size of the vectorized operation must not be
  // bigger than the current maximum size.
  if (Distance < 2 * TypeByteSize ||
      2 * TypeByteSize > MaxSafeDepDistBytes ||
      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
    DEBUG(dbgs() << "LAA: Failure because of positive distance "
                 << Val.getSExtValue() << '\n');
    return true;
  }
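  // Remember the smallest safe distance seen so far; it caps the number of
  // bytes a vectorized iteration may safely span (getMaxSafeDepDistBytes).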

  MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
                        Distance : MaxSafeDepDistBytes;

  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
  if (IsTrueDataDependence &&
      couldPreventStoreLoadForward(Distance, TypeByteSize))
    return true;

  DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue() <<
        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');

  return false;
}

bool MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
                                   MemAccessInfoSet &CheckDeps,
                                   const ValueToValueMap &Strides) {

  MaxSafeDepDistBytes = -1U;
  while (!CheckDeps.empty()) {
    MemAccessInfo CurAccess = *CheckDeps.begin();

    // Get the relevant memory access set.
    EquivalenceClasses<MemAccessInfo>::iterator I =
        AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));

    // Check accesses within this set.
    EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
    AI = AccessSets.member_begin(I), AE = AccessSets.member_end();

    // Check every access pair.
    while (AI != AE) {
      CheckDeps.erase(*AI);
      EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
      while (OI != AE) {
        // Check every accessing instruction pair in program order.
        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2, Strides))
              return false;
            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1, Strides))
              return false;
          }
        ++OI;
      }
      AI++;
    }
  }
  return true;
}

bool LoopAccessInfo::canAnalyzeLoop() {
  // We can only analyze innermost loops.
  if (!TheLoop->empty()) {
    emitAnalysis(LoopAccessReport() << "loop is not the innermost loop");
    return false;
  }

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1) {
    emitAnalysis(
        LoopAccessReport() <<
        "loop control flow is not understood by analyzer");
    return false;
  }

  // We must have a single exiting block.
  if (!TheLoop->getExitingBlock()) {
    emitAnalysis(
        LoopAccessReport() <<
        "loop control flow is not understood by analyzer");
    return false;
  }

  // We only handle bottom-tested loops, i.e. loops in which the condition is
  // checked at the end of each iteration. With that we can assume that all
  // instructions in the loop are executed the same number of times.
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    emitAnalysis(
        LoopAccessReport() <<
        "loop control flow is not understood by analyzer");
    return false;
  }

  // We need to have a loop header.
  DEBUG(dbgs() << "LAA: Found a loop: " <<
        TheLoop->getHeader()->getName() << '\n');

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
  if (ExitCount == SE->getCouldNotCompute()) {
    emitAnalysis(LoopAccessReport() <<
                 "could not determine number of loop iterations");
    DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
    return false;
  }

  return true;
}

void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) {

  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;
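  // The analysis proceeds in phases: collect the loads and stores (most other
  // instructions that touch memory defeat the analysis), register their
  // pointers with AccessAnalysis, and finally run the dependence checks,
  // retrying with runtime checks where that is possible.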

  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;

  // Holds all the different accesses in the loop.
  unsigned NumReads = 0;
  unsigned NumReadWrites = 0;

  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
  MemoryDepChecker DepChecker(SE, DL, TheLoop);

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {

    // Scan the BB and collect legal loads and stores.
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {

      // If this is a load, save it. If this instruction can read from memory
      // but is not a load, then we quit. Notice that we don't handle function
      // calls that read or write.
      if (it->mayReadFromMemory()) {
        // Many math library functions read the rounding mode. We will only
        // vectorize a loop if it contains known function calls that don't set
        // the flag. Therefore, it is safe to ignore this read from memory.
        CallInst *Call = dyn_cast<CallInst>(it);
        if (Call && getIntrinsicIDForCall(Call, TLI))
          continue;

        LoadInst *Ld = dyn_cast<LoadInst>(it);
        if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
          emitAnalysis(LoopAccessReport(Ld)
                       << "read with atomic ordering or volatile read");
          DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
          CanVecMem = false;
          return;
        }
        NumLoads++;
        Loads.push_back(Ld);
        DepChecker.addAccess(Ld);
        continue;
      }

      // Save 'store' instructions. Abort if other instructions write to
      // memory.
      if (it->mayWriteToMemory()) {
        StoreInst *St = dyn_cast<StoreInst>(it);
        if (!St) {
          emitAnalysis(LoopAccessReport(it) <<
                       "instruction cannot be vectorized");
          CanVecMem = false;
          return;
        }
        if (!St->isSimple() && !IsAnnotatedParallel) {
          emitAnalysis(LoopAccessReport(St)
                       << "write with atomic ordering or volatile write");
          DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
          CanVecMem = false;
          return;
        }
        NumStores++;
        Stores.push_back(St);
        DepChecker.addAccess(St);
      }
    } // Next instr.
  } // Next block.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
    CanVecMem = true;
    return;
  }

  AccessAnalysis::DepCandidates DependentAccesses;
  AccessAnalysis Accesses(DL, AA, DependentAccesses);
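
  // Stores are registered first so that, when the loads are visited below, a
  // pointer that is both written and read is already in the Seen set and is
  // classified as read-write rather than read-only.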

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = cast<StoreInst>(*I);
    Value* Ptr = ST->getPointerOperand();

    if (isUniform(Ptr)) {
      emitAnalysis(
          LoopAccessReport(ST)
          << "write to a loop invariant address could not be vectorized");
      DEBUG(dbgs() << "LAA: We don't allow storing to uniform addresses\n");
      CanVecMem = false;
      return;
    }

    // If we did *not* see this pointer before, insert it to the read-write
    // list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr).second) {
      ++NumReadWrites;

      AliasAnalysis::Location Loc = AA->getLocation(ST);
      // The TBAA metadata could have a control dependency on the predication
      // condition, so we cannot rely on it when determining whether or not we
      // need runtime pointer checks.
      if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
        Loc.AATags.TBAA = nullptr;

      Accesses.addStore(Loc);
    }
  }

  if (IsAnnotatedParallel) {
    DEBUG(dbgs()
          << "LAA: A loop annotated parallel, ignore memory dependency "
          << "checks.\n");
    CanVecMem = true;
    return;
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = cast<LoadInst>(*I);
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; Because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    bool IsReadOnlyPtr = false;
    if (Seen.insert(Ptr).second ||
        !isStridedPtr(SE, DL, Ptr, TheLoop, Strides)) {
      ++NumReads;
      IsReadOnlyPtr = true;
    }

    AliasAnalysis::Location Loc = AA->getLocation(LD);
    // The TBAA metadata could have a control dependency on the predication
    // condition, so we cannot rely on it when determining whether or not we
    // need runtime pointer checks.
    if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
      Loc.AATags.TBAA = nullptr;

    Accesses.addLoad(Loc, IsReadOnlyPtr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then it is safe to vectorize.
  if (NumReadWrites == 1 && NumReads == 0) {
    DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
    CanVecMem = true;
    return;
  }

  // Build dependence sets and check whether we need a runtime pointer bounds
  // check.
  Accesses.buildDependenceSets();
  bool NeedRTCheck = Accesses.isRTCheckNeeded();

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  unsigned NumComparisons = 0;
  bool CanDoRT = false;
  if (NeedRTCheck)
    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop,
                                       Strides);

  DEBUG(dbgs() << "LAA: We need to do " << NumComparisons <<
        " pointer comparisons.\n");

  // If we only have one set of dependences among which to check pointers, we
  // don't need a runtime check.
  if (NumComparisons == 0 && NeedRTCheck)
    NeedRTCheck = false;

  // Check that we did not collect too many pointers or found an unsizeable
  // pointer.
  if (!CanDoRT ||
      NumComparisons > VectorizerParams::RuntimeMemoryCheckThreshold) {
    PtrRtCheck.reset();
    CanDoRT = false;
  }

  if (CanDoRT) {
    DEBUG(dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
  }

  if (NeedRTCheck && !CanDoRT) {
    emitAnalysis(LoopAccessReport() << "cannot identify array bounds");
    DEBUG(dbgs() << "LAA: We can't vectorize because we can't find " <<
          "the array bounds.\n");
    PtrRtCheck.reset();
    CanVecMem = false;
    return;
  }

  PtrRtCheck.Need = NeedRTCheck;

  CanVecMem = true;
  if (Accesses.isDependencyCheckNeeded()) {
    DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
    CanVecMem = DepChecker.areDepsSafe(
        DependentAccesses, Accesses.getDependenciesToCheck(), Strides);
    MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();

    if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
      DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
      NeedRTCheck = true;

      // Clear the dependency checks. We assume they are not needed.
      Accesses.resetDepChecks();

      PtrRtCheck.reset();
      PtrRtCheck.Need = true;

      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
                                         TheLoop, Strides, true);
      // Check that we did not collect too many pointers or found an
      // unsizeable pointer.
      if (!CanDoRT ||
          NumComparisons > VectorizerParams::RuntimeMemoryCheckThreshold) {
        if (!CanDoRT && NumComparisons > 0)
          emitAnalysis(LoopAccessReport()
                       << "cannot check memory dependencies at runtime");
        else
          emitAnalysis(LoopAccessReport()
                       << NumComparisons << " exceeds limit of "
                       << VectorizerParams::RuntimeMemoryCheckThreshold
                       << " dependent memory operations checked at runtime");
        DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
        PtrRtCheck.reset();
        CanVecMem = false;
        return;
      }

      CanVecMem = true;
    }
  }

  if (!CanVecMem)
    emitAnalysis(LoopAccessReport() <<
                 "unsafe dependent memory operations in loop");

  DEBUG(dbgs() << "LAA: We" << (NeedRTCheck ? "" : " don't") <<
        " need a runtime memory check.\n");
}

bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
                                           DominatorTree *DT) {
  assert(TheLoop->contains(BB) && "Unknown block used");

  // Blocks that do not dominate the latch need predication.
  BasicBlock* Latch = TheLoop->getLoopLatch();
  return !DT->dominates(BB, Latch);
}

void LoopAccessInfo::emitAnalysis(LoopAccessReport &Message) {
  assert(!Report && "Multiple reports generated");
  Report = Message;
}

bool LoopAccessInfo::isUniform(Value *V) const {
  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}

// FIXME: this function is currently a duplicate of the one in
// LoopVectorize.cpp.
static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
                                 Instruction *Loc) {
  if (FirstInst)
    return FirstInst;
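  // Only an instruction materialized in the same block as the insertion
  // point can serve as the first instruction of the check sequence.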
  if (Instruction *I = dyn_cast<Instruction>(V))
    return I->getParent() == Loc->getParent() ? I : nullptr;
  return nullptr;
}

std::pair<Instruction *, Instruction *>
LoopAccessInfo::addRuntimeCheck(Instruction *Loc) const {
  Instruction *tnullptr = nullptr;
  if (!PtrRtCheck.Need)
    return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);

  unsigned NumPointers = PtrRtCheck.Pointers.size();
  SmallVector<TrackingVH<Value> , 2> Starts;
  SmallVector<TrackingVH<Value> , 2> Ends;

  LLVMContext &Ctx = Loc->getContext();
  SCEVExpander Exp(*SE, "induction");
  Instruction *FirstInst = nullptr;

  for (unsigned i = 0; i < NumPointers; ++i) {
    Value *Ptr = PtrRtCheck.Pointers[i];
    const SCEV *Sc = SE->getSCEV(Ptr);

    if (SE->isLoopInvariant(Sc, TheLoop)) {
      DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:" <<
            *Ptr << "\n");
      Starts.push_back(Ptr);
      Ends.push_back(Ptr);
    } else {
      DEBUG(dbgs() << "LAA: Adding RT check for range:" << *Ptr << '\n');
      unsigned AS = Ptr->getType()->getPointerAddressSpace();

      // Use this type for pointer arithmetic.
      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);

      Value *Start = Exp.expandCodeFor(PtrRtCheck.Starts[i], PtrArithTy, Loc);
      Value *End = Exp.expandCodeFor(PtrRtCheck.Ends[i], PtrArithTy, Loc);
      Starts.push_back(Start);
      Ends.push_back(End);
    }
  }

  IRBuilder<> ChkBuilder(Loc);
  // Our instructions might fold to a constant.
  Value *MemoryRuntimeCheck = nullptr;
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i + 1; j < NumPointers; ++j) {
      if (!PtrRtCheck.needsChecking(i, j))
        continue;

      unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
      unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();

      assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
             (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
             "Trying to bounds check pointers with different address spaces");

      Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
      Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);

      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
      Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc");
      Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc");

      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
      FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
      FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
      FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
      if (MemoryRuntimeCheck) {
        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
                                         "conflict.rdx");
        FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
      }
      MemoryRuntimeCheck = IsConflict;
    }
  }

  // We have to do this trickery because the IRBuilder might fold the check to
  // a constant expression in which case there is no Instruction anchored in
  // the block.
  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
                                                 ConstantInt::getTrue(Ctx));
  ChkBuilder.Insert(Check, "memcheck.conflict");
  FirstInst = getFirstInst(FirstInst, Check, Loc);
  return std::make_pair(FirstInst, Check);
}

LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
                               const DataLayout *DL,
                               const TargetLibraryInfo *TLI, AliasAnalysis *AA,
                               DominatorTree *DT,
                               const ValueToValueMap &Strides)
    : TheLoop(L), SE(SE), DL(DL), TLI(TLI), AA(AA), DT(DT), NumLoads(0),
      NumStores(0), MaxSafeDepDistBytes(-1U), CanVecMem(false) {
  if (canAnalyzeLoop())
    analyzeLoop(Strides);
}

void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
  if (CanVecMem) {
    if (PtrRtCheck.empty())
      OS.indent(Depth) << "Memory dependences are safe\n";
    else
      OS.indent(Depth) << "Memory dependences are safe with run-time checks\n";
  }

  if (Report)
    OS.indent(Depth) << "Report: " << Report->str() << "\n";

  // FIXME: Print unsafe dependences

  // List the pairs of accesses that need run-time checks to prove
  // independence.
  PtrRtCheck.print(OS, Depth);
  OS << "\n";
}

const LoopAccessInfo &
LoopAccessAnalysis::getInfo(Loop *L, const ValueToValueMap &Strides) {
  auto &LAI = LoopAccessInfoMap[L];

#ifndef NDEBUG
  assert((!LAI || LAI->NumSymbolicStrides == Strides.size()) &&
         "Symbolic strides changed for loop");
#endif

  if (!LAI) {
    LAI = llvm::make_unique<LoopAccessInfo>(L, SE, DL, TLI, AA, DT, Strides);
#ifndef NDEBUG
    LAI->NumSymbolicStrides = Strides.size();
#endif
  }
  return *LAI.get();
}

void LoopAccessAnalysis::print(raw_ostream &OS, const Module *M) const {
  LoopAccessAnalysis &LAA = *const_cast<LoopAccessAnalysis *>(this);

  LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  ValueToValueMap NoSymbolicStrides;

  for (Loop *TopLevelLoop : *LI)
    for (Loop *L : depth_first(TopLevelLoop)) {
      OS.indent(2) << L->getHeader()->getName() << ":\n";
      auto &LAI = LAA.getInfo(L, NoSymbolicStrides);
      LAI.print(OS, 4);
    }
}

bool LoopAccessAnalysis::runOnFunction(Function &F) {
  SE = &getAnalysis<ScalarEvolution>();
  DL = F.getParent()->getDataLayout();
  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
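  // TargetLibraryInfo is optional: without it, library calls that could
  // otherwise be recognized as vectorizable intrinsics are treated as opaque
  // memory accesses and defeat the analysis.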
  TLI = TLIP ? &TLIP->getTLI() : nullptr;
  AA = &getAnalysis<AliasAnalysis>();
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

  return false;
}

void LoopAccessAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<ScalarEvolution>();
  AU.addRequired<AliasAnalysis>();
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addRequired<LoopInfoWrapperPass>();

  AU.setPreservesAll();
}

char LoopAccessAnalysis::ID = 0;
static const char laa_name[] = "Loop Access Analysis";
#define LAA_NAME "loop-accesses"

INITIALIZE_PASS_BEGIN(LoopAccessAnalysis, LAA_NAME, laa_name, false, true)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(LoopAccessAnalysis, LAA_NAME, laa_name, false, true)

namespace llvm {
  Pass *createLAAPass() {
    return new LoopAccessAnalysis();
  }
}