//===----- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

namespace {

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;
typedef SmallVector<Instruction *, 8> InstrList;
typedef MapVector<Value *, InstrList> InstrListMap;

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
             ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  Value *getPointerOperand(Value *I);

  unsigned getPointerAddressSpace(Value *I);

  unsigned getAlignment(LoadInst *LI) const {
    unsigned Align = LI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(LI->getType());
  }

  unsigned getAlignment(StoreInst *SI) const {
    unsigned Align = SI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(SI->getValueOperand()->getType());
  }

  bool isConsecutiveAccess(Value *A, Value *B);

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds the loads/stores to consecutive memory addresses and vectorizes
  /// them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Checks if this load/store access is misaligned.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          unsigned Alignment);
};

class LoadStoreVectorizer : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizer() : FunctionPass(ID) {
    initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};
}

INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

char LoadStoreVectorizer::ID = 0;

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizer();
}

// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}

bool LoadStoreVectorizer::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
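  // (NoImplicitFloat forbids introducing implicit floating-point/vector
  // operations, which the vector loads and stores created here would be.)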
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  Vectorizer V(F, AA, DT, SE, TTI);
  return V.run();
}

// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}

Value *Vectorizer::getPointerOperand(Value *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->getPointerOperand();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();
  return nullptr;
}

unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}

// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getPointerOperand(A);
  Value *PtrB = getPointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  Type *PtrATy = PtrA->getType()->getPointerElementType();
  Type *PtrBTy = PtrB->getType()->getPointerElementType();
  if (PtrA == PtrB ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == Size;

  // Compute the necessary base pointer delta to have the necessary final delta
  // equal to the size.
  APInt BaseDelta = Size - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.

  // Look through GEPs after checking they're the same except for the last
  // index.
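  // If that last index is a sign- or zero-extended value that provably
  // differs by exactly one, the two accesses are consecutive.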
  GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(getPointerOperand(A));
  GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(getPointerOperand(B));
  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
    return false;
  unsigned FinalIndex = GEPA->getNumOperands() - 1;
  for (unsigned i = 0; i < FinalIndex; i++)
    if (GEPA->getOperand(i) != GEPB->getOperand(i))
      return false;

  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpA || !OpB || OpA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding 1 to OpA won't overflow.
  bool Safe = false;
  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
  // we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
    if (Signed)
      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
    else
      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
  }

  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();

  // Second attempt:
  // If any bits are known to be zero other than the sign bit in OpA, we can
  // add 1 to it while guaranteeing no overflow of any sort.
  if (!Safe) {
    APInt KnownZero(BitWidth, 0);
    APInt KnownOne(BitWidth, 0);
    computeKnownBits(OpA, KnownZero, KnownOne, DL, 0, nullptr, OpA, &DT);
    KnownZero &= ~APInt::getHighBitsSet(BitWidth, 1);
    if (KnownZero != 0)
      Safe = true;
  }

  if (!Safe)
    return false;

  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
  return X2 == OffsetSCEVB;
}

void Vectorizer::reorder(Instruction *I) {
  OrderedBasicBlock OBB(I->getParent());
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!OBB.dominates(IM, I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}

std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1) {
      FirstInstr = I.getIterator();
    }
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
  return std::make_pair(FirstInstr, ++LastInstr);
}

void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getPointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}

std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
  unsigned ElemSizeInBytes = ElementSizeBits / 8;
  unsigned SizeInBytes = ElemSizeInBytes * Chain.size();
  unsigned NumRight = (SizeInBytes % 4) / ElemSizeInBytes;
  unsigned NumLeft = Chain.size() - NumRight;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}

ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (!is_contained(Chain, &I))
        MemoryInstrs.push_back(&I);
      else
        ChainInstrs.push_back(&I);
    } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
      DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
      break;
    } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
      DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                   << '\n');
      break;
    }
  }

  OrderedBasicBlock OBB(Chain[0]->getParent());

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
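  // That is, walk the chain elements in BB order and stop at the first one
  // that may alias another memory operation in the region.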
  unsigned ChainInstrIdx = 0;
  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
    bool AliasFound = false;
    for (Instruction *MemInstr : MemoryInstrs) {
      if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
        continue;

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
      if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
          OBB.dominates(ChainInstr, MemInstr))
        continue;

      // Same case, but in reverse.
      if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
          OBB.dominates(MemInstr, ChainInstr))
        continue;

      if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
                        MemoryLocation::get(ChainInstr))) {
        DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    "  Aliasing instruction and pointer:\n"
                 << "  " << *MemInstr << '\n'
                 << "  " << *getPointerOperand(MemInstr) << '\n'
                 << "  Aliased instruction and pointer:\n"
                 << "  " << *ChainInstr << '\n'
                 << "  " << *getPointerOperand(ChainInstr) << '\n';
        });
        AliasFound = true;
        break;
      }
    }
    if (AliasFound)
      break;
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}

std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      if (!LI->isSimple())
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if (TySize < 8)
        continue;

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2)
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !all_of(LI->users(), [LI](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // TODO: Target hook to filter types.

      // Save the load locations.
      Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
      LoadRefs[ObjPtr].push_back(LI);

    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if (TySize < 8)
        continue;

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
      if (TySize > VecRegSize / 2)
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !all_of(SI->users(), [SI](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save store location.
      Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
      StoreRefs[ObjPtr].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}

bool Vectorizer::vectorizeChains(InstrListMap &Map) {
  bool Changed = false;

  for (const std::pair<Value *, InstrList> &Chain : Map) {
    unsigned Size = Chain.second.size();
    if (Size < 2)
      continue;

    DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the instructions in chunks of 64.
    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
      unsigned Len = std::min<unsigned>(CE - CI, 64);
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
      Changed |= vectorizeInstructions(Chunk);
    }
  }

  return Changed;
}

bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
  SmallVector<int, 16> Heads, Tails;
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given loads/stores and find all of
  // the pairs of accesses that follow each other.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.push_back(j);
        Heads.push_back(i);
        ConsecutiveChain[i] = j;
      }
    }
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
    bool LongerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        LongerChainExists = true;
        break;
      }
    if (LongerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Instruction *, 16> Operands;
    int I = Head;
    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}

bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  Type *StoreTy;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller
    // chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Store size should be 1B, 2B or multiple of 4B.
  // TODO: Target hook for size constraint?
  unsigned SzInBytes = (Sz / 8) * ChainSize;
  if (SzInBytes > 2 && SzInBytes % 4 != 0) {
    DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
                    "or multiple of 4B. Splitting.\n");
    if (SzInBytes == 3)
      return vectorizeStoreChain(Chain.slice(0, ChainSize - 1),
                                 InstructionsProcessed);

    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  VectorType *VecTy;
  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = VectorType::get(StoreTy->getScalarType(),
                            Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = VectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size, break it into two pieces.
  // TODO: Target hook to control types to split to.
  if (ChainSize > VF) {
    DEBUG(dbgs() << "LSV: Vector factor is too big."
770 " Creating two separate arrays.\n"); 771 return vectorizeStoreChain(Chain.slice(0, VF), InstructionsProcessed) | 772 vectorizeStoreChain(Chain.slice(VF), InstructionsProcessed); 773 } 774 775 DEBUG({ 776 dbgs() << "LSV: Stores to vectorize:\n"; 777 for (Instruction *I : Chain) 778 dbgs() << " " << *I << "\n"; 779 }); 780 781 // We won't try again to vectorize the elements of the chain, regardless of 782 // whether we succeed below. 783 InstructionsProcessed->insert(Chain.begin(), Chain.end()); 784 785 // Check alignment restrictions. 786 unsigned Alignment = getAlignment(S0); 787 788 // If the store is going to be misaligned, don't vectorize it. 789 if (accessIsMisaligned(SzInBytes, AS, Alignment)) { 790 if (S0->getPointerAddressSpace() != 0) 791 return false; 792 793 // If we're storing to an object on the stack, we control its alignment, 794 // so we can cheat and change it! 795 Value *V = GetUnderlyingObject(S0->getPointerOperand(), DL); 796 if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) { 797 AI->setAlignment(StackAdjustedAlignment); 798 Alignment = StackAdjustedAlignment; 799 } else { 800 return false; 801 } 802 } 803 804 BasicBlock::iterator First, Last; 805 std::tie(First, Last) = getBoundaryInstrs(Chain); 806 Builder.SetInsertPoint(&*Last); 807 808 Value *Vec = UndefValue::get(VecTy); 809 810 if (VecStoreTy) { 811 unsigned VecWidth = VecStoreTy->getNumElements(); 812 for (unsigned I = 0, E = Chain.size(); I != E; ++I) { 813 StoreInst *Store = cast<StoreInst>(Chain[I]); 814 for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) { 815 unsigned NewIdx = J + I * VecWidth; 816 Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(), 817 Builder.getInt32(J)); 818 if (Extract->getType() != StoreTy->getScalarType()) 819 Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType()); 820 821 Value *Insert = 822 Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx)); 823 Vec = Insert; 824 } 825 } 826 } else { 827 for (unsigned I = 0, E = Chain.size(); I != E; ++I) { 828 StoreInst *Store = cast<StoreInst>(Chain[I]); 829 Value *Extract = Store->getValueOperand(); 830 if (Extract->getType() != StoreTy->getScalarType()) 831 Extract = 832 Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType()); 833 834 Value *Insert = 835 Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I)); 836 Vec = Insert; 837 } 838 } 839 840 // This cast is safe because Builder.CreateStore() always creates a bona fide 841 // StoreInst. 842 StoreInst *SI = cast<StoreInst>( 843 Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(), 844 VecTy->getPointerTo(AS)))); 845 propagateMetadata(SI, Chain); 846 SI->setAlignment(Alignment); 847 848 eraseInstructions(Chain); 849 ++NumVectorInstructions; 850 NumScalarsVectorized += Chain.size(); 851 return true; 852 } 853 854 bool Vectorizer::vectorizeLoadChain( 855 ArrayRef<Instruction *> Chain, 856 SmallPtrSet<Instruction *, 16> *InstructionsProcessed) { 857 LoadInst *L0 = cast<LoadInst>(Chain[0]); 858 859 // If the vector has an int element, default to int for the whole load. 
  Type *LoadTy;
  for (const auto &V : Chain) {
    LoadTy = cast<LoadInst>(V)->getType();
    if (LoadTy->isIntOrIntVectorTy())
      break;

    if (LoadTy->isPtrOrPtrVectorTy()) {
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
                               DL.getTypeSizeInBits(LoadTy));
      break;
    }
  }

  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
  unsigned AS = L0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller
    // chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Load size should be 1B, 2B or multiple of 4B.
  // TODO: Should size constraint be a target hook?
  unsigned SzInBytes = (Sz / 8) * ChainSize;
  if (SzInBytes > 2 && SzInBytes % 4 != 0) {
    DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
                    "or multiple of 4B. Splitting.\n");
    if (SzInBytes == 3)
      return vectorizeLoadChain(Chain.slice(0, ChainSize - 1),
                                InstructionsProcessed);
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
           vectorizeLoadChain(Chains.second, InstructionsProcessed);
  }

  VectorType *VecTy;
  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
  if (VecLoadTy)
    VecTy = VectorType::get(LoadTy->getScalarType(),
                            Chain.size() * VecLoadTy->getNumElements());
  else
    VecTy = VectorType::get(LoadTy, Chain.size());

  // If it's more than the max vector size, break it into two pieces.
  // TODO: Target hook to control types to split to.
  if (ChainSize > VF) {
    DEBUG(dbgs() << "LSV: Vector factor is too big. "
                    "Creating two separate arrays.\n");
    return vectorizeLoadChain(Chain.slice(0, VF), InstructionsProcessed) |
           vectorizeLoadChain(Chain.slice(VF), InstructionsProcessed);
  }

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // Check alignment restrictions.
  unsigned Alignment = getAlignment(L0);

  // If the load is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (L0->getPointerAddressSpace() != 0)
      return false;

    // If we're loading from an object on the stack, we control its alignment,
    // so we can cheat and change it!
    Value *V = GetUnderlyingObject(L0->getPointerOperand(), DL);
    if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
      AI->setAlignment(StackAdjustedAlignment);
      Alignment = StackAdjustedAlignment;
    } else {
      return false;
    }
  }

  DEBUG({
    dbgs() << "LSV: Loads to vectorize:\n";
    for (Instruction *I : Chain)
      I->dump();
  });

  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
  // Last may have changed since then, but the value of First won't have. If it
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*First);

  Value *Bitcast =
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
  // This cast is safe because Builder.CreateLoad always creates a bona fide
  // LoadInst.
  LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
  propagateMetadata(LI, Chain);
  LI->setAlignment(Alignment);

  if (VecLoadTy) {
    SmallVector<Instruction *, 16> InstrsToErase;

    unsigned VecWidth = VecLoadTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      for (auto Use : Chain[I]->users()) {
        // All users of vector loads are ExtractElement instructions with
        // constant indices, otherwise we would have bailed before now.
        Instruction *UI = cast<Instruction>(Use);
        unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
        unsigned NewIdx = Idx + I * VecWidth;
        Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
                                                UI->getName());
        if (V->getType() != UI->getType())
          V = Builder.CreateBitCast(V, UI->getType());

        // Replace the old instruction.
        UI->replaceAllUsesWith(V);
        InstrsToErase.push_back(UI);
      }
    }

    // Bitcast might not be an Instruction, if the value being loaded is a
    // constant. In that case, no need to reorder anything.
    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);

    for (auto I : InstrsToErase)
      I->eraseFromParent();
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      Value *CV = Chain[I];
      Value *V =
          Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
      if (V->getType() != CV->getType()) {
        V = Builder.CreateBitOrPointerCast(V, CV->getType());
      }

      // Replace the old instruction.
      CV->replaceAllUsesWith(V);
    }

    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);
  }

  eraseInstructions(Chain);

  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                    unsigned Alignment) {
  if (Alignment % SzInBytes == 0)
    return false;
  bool Fast = false;
  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
                                                   SzInBytes * 8, AddressSpace,
                                                   Alignment, &Fast);
  DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
               << " and fast? " << Fast << "\n";);
  return !Allows || !Fast;
}