//===----- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

namespace {

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;
typedef SmallVector<Instruction *, 8> InstrList;
typedef MapVector<Value *, InstrList> InstrListMap;

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
             ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  Value *getPointerOperand(Value *I) const;

  GetElementPtrInst *getSourceGEP(Value *Src) const;

  unsigned getPointerAddressSpace(Value *I);

  unsigned getAlignment(LoadInst *LI) const {
    unsigned Align = LI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(LI->getType());
  }

  unsigned getAlignment(StoreInst *SI) const {
    unsigned Align = SI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(SI->getValueOperand()->getType());
  }

  bool isConsecutiveAccess(Value *A, Value *B);

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain.
  /// Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds the loads/stores to consecutive memory addresses and vectorizes
  /// them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Checks if this load/store access is misaligned.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          unsigned Alignment);
};

class LoadStoreVectorizer : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizer() : FunctionPass(ID) {
    initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};
}

INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

char LoadStoreVectorizer::ID = 0;

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizer();
}

// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}

bool LoadStoreVectorizer::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  Vectorizer V(F, AA, DT, SE, TTI);
  return V.run();
}

// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}

Value *Vectorizer::getPointerOperand(Value *I) const {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->getPointerOperand();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();
  return nullptr;
}

unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}

GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
  // First strip pointer bitcasts. Make sure pointee size is the same with
  // and without casts.
  // TODO: a stride set by the add instruction below can match the difference
  // in pointee type size here. Currently it will not be vectorized.
  Value *SrcPtr = getPointerOperand(Src);
  Value *SrcBase = SrcPtr->stripPointerCasts();
  if (DL.getTypeStoreSize(SrcPtr->getType()->getPointerElementType()) ==
      DL.getTypeStoreSize(SrcBase->getType()->getPointerElementType()))
    SrcPtr = SrcBase;
  return dyn_cast<GetElementPtrInst>(SrcPtr);
}

// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getPointerOperand(A);
  Value *PtrB = getPointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  Type *PtrATy = PtrA->getType()->getPointerElementType();
  Type *PtrBTy = PtrB->getType()->getPointerElementType();
  if (PtrA == PtrB ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == Size;

  // Compute the necessary base pointer delta to have the necessary final delta
  // equal to the size.
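  // For example, with 4-byte elements, if PtrA is BaseA + 4 and PtrB is BaseB,
  // then OffsetDelta is -4 and BaseDelta is 4 - (-4) = 8: B immediately
  // follows A exactly when BaseB == BaseA + 8.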
  APInt BaseDelta = Size - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.

  // Look through GEPs after checking they're the same except for the last
  // index.
  GetElementPtrInst *GEPA = getSourceGEP(A);
  GetElementPtrInst *GEPB = getSourceGEP(B);
  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
    return false;
  unsigned FinalIndex = GEPA->getNumOperands() - 1;
  for (unsigned i = 0; i < FinalIndex; i++)
    if (GEPA->getOperand(i) != GEPB->getOperand(i))
      return false;

  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpA || !OpB || OpA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding 1 to OpA won't overflow.
  bool Safe = false;
  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
  // we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
    if (Signed)
      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
    else
      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
  }

  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();

  // Second attempt:
  // If any bits are known to be zero other than the sign bit in OpA, we can
  // add 1 to it while guaranteeing no overflow of any sort.
  if (!Safe) {
    KnownBits Known(BitWidth);
    computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
    if (Known.countMaxTrailingOnes() < (BitWidth - 1))
      Safe = true;
  }

  if (!Safe)
    return false;

  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
  return X2 == OffsetSCEVB;
}

void Vectorizer::reorder(Instruction *I) {
  OrderedBasicBlock OBB(I->getParent());
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!OBB.dominates(IM, I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}

std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1) {
      FirstInstr = I.getIterator();
    }
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
  return std::make_pair(FirstInstr, ++LastInstr);
}

void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getPointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}

std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
  unsigned ElementSizeBytes = ElementSizeBits / 8;
  unsigned SizeBytes = ElementSizeBytes * Chain.size();
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == Chain.size()) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2; // Split even in half
    else
      --NumLeft;    // Split off last element
  } else if (NumLeft == 0)
    NumLeft = 1;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}

ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
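  // MemoryInstrs collects the loads/stores in the scanned region that are not
  // members of Chain; ChainInstrs collects the members of Chain as they appear
  // in the basic block.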
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (!is_contained(Chain, &I))
        MemoryInstrs.push_back(&I);
      else
        ChainInstrs.push_back(&I);
    } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
      DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
      break;
    } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
      DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                   << '\n');
      break;
    }
  }

  OrderedBasicBlock OBB(Chain[0]->getParent());

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that
    // follow will not be added to the valid prefix.
    if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
        break;

      if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
        continue;

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
      if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
          OBB.dominates(ChainInstr, MemInstr))
        continue;

      // Same case, but in reverse.
      if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
          OBB.dominates(MemInstr, ChainInstr))
        continue;

      if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
                        MemoryLocation::get(ChainInstr))) {
        DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    "  Aliasing instruction and pointer:\n"
                 << "  " << *MemInstr << '\n'
                 << "  " << *getPointerOperand(MemInstr) << '\n'
                 << "  Aliased instruction and pointer:\n"
                 << "  " << *ChainInstr << '\n'
                 << "  " << *getPointerOperand(ChainInstr) << '\n';
        });
        // Save this aliasing memory instruction as a barrier, but allow other
        // instructions that precede the barrier to be vectorized with this
        // one.
        BarrierMemoryInstr = MemInstr;
        break;
      }
    }
    // Continue the search only for store chains, since vectorizing stores that
    // precede an aliasing load is valid. Conversely, vectorizing loads is
    // valid up to an aliasing store, but should not pull loads from further
    // down in the basic block.
    if (IsLoadChain && BarrierMemoryInstr) {
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
      assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
      break;
    }
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}

std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      if (!LI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeLoad(LI))
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if (TySize < 8)
        continue;

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2)
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !all_of(LI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save the load locations.
      Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
      LoadRefs[ObjPtr].push_back(LI);

    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeStore(SI))
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if (TySize < 8)
        continue;

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
      if (TySize > VecRegSize / 2)
        continue;

      if (isa<VectorType>(Ty) && !all_of(SI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save store location.
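      // Keying on the underlying object means the later pairwise
      // consecutiveness checks only compare accesses based on the same object.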
      Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
      StoreRefs[ObjPtr].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}

bool Vectorizer::vectorizeChains(InstrListMap &Map) {
  bool Changed = false;

  for (const std::pair<Value *, InstrList> &Chain : Map) {
    unsigned Size = Chain.second.size();
    if (Size < 2)
      continue;

    DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the instructions in chunks of 64.
    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
      unsigned Len = std::min<unsigned>(CE - CI, 64);
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
      Changed |= vectorizeInstructions(Chunk);
    }
  }

  return Changed;
}

bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
  SmallVector<int, 16> Heads, Tails;
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given accesses and find all of the
  // pairs of accesses that follow each other in memory.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.push_back(j);
        Heads.push_back(i);
        ConsecutiveChain[i] = j;
      }
    }
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
    bool LongerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        LongerChainExists = true;
        break;
      }
    if (LongerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Instruction *, 16> Operands;
    int I = Head;
    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}

bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
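  // Pointer-typed elements are likewise handled as integers of the same bit
  // width, so their values can be inserted into the combined vector below.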
  Type *StoreTy;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(S0);

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller
    // chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  VectorType *VecTy;
  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = VectorType::get(StoreTy->getScalarType(),
                            Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = VectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                    " Creating two separate arrays.\n");
    return vectorizeStoreChain(Chain.slice(0, TargetVF),
                               InstructionsProcessed) |
           vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the store is going to be misaligned, don't vectorize it.
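  // (For the default address space we first try to raise the alignment of the
  // underlying object, e.g. an alloca, to StackAdjustedAlignment.)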
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (S0->getPointerAddressSpace() != 0)
      return false;

    unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                   StackAdjustedAlignment,
                                                   DL, S0, nullptr, &DT);
    if (NewAlign < StackAdjustedAlignment)
      return false;
  }

  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*Last);

  Value *Vec = UndefValue::get(VecTy);

  if (VecStoreTy) {
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Value *Insert =
            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
        Vec = Insert;
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      if (Extract->getType() != StoreTy->getScalarType())
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Value *Insert =
          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
      Vec = Insert;
    }
  }

  // This cast is safe because Builder.CreateStore() always creates a bona fide
  // StoreInst.
  StoreInst *SI = cast<StoreInst>(
      Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
                                                     VecTy->getPointerTo(AS))));
  propagateMetadata(SI, Chain);
  SI->setAlignment(Alignment);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::vectorizeLoadChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  LoadInst *L0 = cast<LoadInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole load.
  Type *LoadTy;
  for (const auto &V : Chain) {
    LoadTy = cast<LoadInst>(V)->getType();
    if (LoadTy->isIntOrIntVectorTy())
      break;

    if (LoadTy->isPtrOrPtrVectorTy()) {
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
                               DL.getTypeSizeInBits(LoadTy));
      break;
    }
  }

  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
  unsigned AS = L0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(L0);

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller
    // chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
           vectorizeLoadChain(Chains.second, InstructionsProcessed);
  }

  VectorType *VecTy;
  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
  if (VecLoadTy)
    VecTy = VectorType::get(LoadTy->getScalarType(),
                            Chain.size() * VecLoadTy->getNumElements());
  else
    VecTy = VectorType::get(LoadTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                    " Creating two separate arrays.\n");
    return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
           vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the load is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (L0->getPointerAddressSpace() != 0)
      return false;

    unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
                                                   StackAdjustedAlignment,
                                                   DL, L0, nullptr, &DT);
    if (NewAlign < StackAdjustedAlignment)
      return false;

    Alignment = NewAlign;
  }

  DEBUG({
    dbgs() << "LSV: Loads to vectorize:\n";
    for (Instruction *I : Chain)
      I->dump();
  });

  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
  // Last may have changed since then, but the value of First won't have. If it
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*First);

  Value *Bitcast =
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
  // This cast is safe because Builder.CreateLoad always creates a bona fide
  // LoadInst.
  LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
  propagateMetadata(LI, Chain);
  LI->setAlignment(Alignment);

  if (VecLoadTy) {
    SmallVector<Instruction *, 16> InstrsToErase;

    unsigned VecWidth = VecLoadTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      for (auto Use : Chain[I]->users()) {
        // All users of vector loads are ExtractElement instructions with
        // constant indices, otherwise we would have bailed before now.
        Instruction *UI = cast<Instruction>(Use);
        unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
        unsigned NewIdx = Idx + I * VecWidth;
        Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
                                                UI->getName());
        if (V->getType() != UI->getType())
          V = Builder.CreateBitCast(V, UI->getType());

        // Replace the old instruction.
        UI->replaceAllUsesWith(V);
        InstrsToErase.push_back(UI);
      }
    }

    // Bitcast might not be an Instruction, if the pointer we load from is a
    // constant expression. In that case, no need to reorder anything.
    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);

    for (auto I : InstrsToErase)
      I->eraseFromParent();
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      Value *CV = Chain[I];
      Value *V =
          Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
      if (V->getType() != CV->getType()) {
        V = Builder.CreateBitOrPointerCast(V, CV->getType());
      }

      // Replace the old instruction.
      CV->replaceAllUsesWith(V);
    }

    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);
  }

  eraseInstructions(Chain);

  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                    unsigned Alignment) {
  if (Alignment % SzInBytes == 0)
    return false;

  bool Fast = false;
  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
                                                   SzInBytes * 8, AddressSpace,
                                                   Alignment, &Fast);
  DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
               << " and fast? " << Fast << "\n";);
  return !Allows || !Fast;
}