//===----- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer ----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass merges chains of consecutive, adjacent scalar loads (or stores)
// within a basic block into wider vector loads (or stores). This benefits
// GPU-like targets where one wide memory access is cheaper than several
// narrow ones.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

namespace {

// Alignment (in bytes) assumed to be achievable for stack objects; also used
// as a fallback modulus in the misalignment check below.
// TODO: Remove this
static const unsigned TargetBaseAlign = 4;

typedef SmallVector<Instruction *, 8> InstrList;
typedef MapVector<Value *, InstrList> InstrListMap;

/// Does the actual vectorization work. One instance is created per function;
/// the pass wrapper below gathers the analyses and calls run().
class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
             ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  /// Visits every basic block of F and vectorizes eligible load/store chains.
  /// Returns true if the IR was changed.
  bool run();

private:
  /// Returns the pointer operand of \p I if it is a load or store,
  /// otherwise null.
  Value *getPointerOperand(Value *I);

  /// Returns the pointer address space of \p I if it is a load or store,
  /// otherwise (unsigned)-1.
  unsigned getPointerAddressSpace(Value *I);

  /// Explicit alignment of \p LI, or the ABI alignment of the loaded type
  /// when the instruction carries none.
  unsigned getAlignment(LoadInst *LI) const {
    unsigned Align = LI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(LI->getType());
  }

  /// Explicit alignment of \p SI, or the ABI alignment of the stored value's
  /// type when the instruction carries none.
  unsigned getAlignment(StoreInst *SI) const {
    unsigned Align = SI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(SI->getValueOperand()->getType());
  }

  /// Returns true if B's access address is exactly one element (the store
  /// size of A's accessed type) past A's access address.
  bool isConsecutiveAccess(Value *A, Value *B);

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Check if this load/store access is misaligned accesses
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          unsigned Alignment);
};

/// Legacy-pass-manager wrapper: declares the required analyses and forwards
/// to Vectorizer::run().
class LoadStoreVectorizer : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizer() : FunctionPass(ID) {
    initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    // The pass only merges/removes memory instructions; it never alters the
    // control-flow graph.
    AU.setPreservesCFG();
  }
};
}

INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
                      "Vectorize load and Store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

char LoadStoreVectorizer::ID = 0;

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizer();
}

// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}

bool LoadStoreVectorizer::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  Vectorizer V(F, AA, DT, SE, TTI);
  return V.run();
}

// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    // Group candidate loads/stores by underlying object, then try to
    // vectorize each group independently.
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}

Value *Vectorizer::getPointerOperand(Value *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->getPointerOperand();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();
  return nullptr;
}

unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  // Not a load or store: yields (unsigned)-1, which never compares equal to
  // a real address space in isConsecutiveAccess.
  return -1;
}

// Returns true if B accesses the memory location exactly one element past A,
// i.e. addr(B) == addr(A) + storeSize(type(A)).
// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getPointerOperand(A);
  Value *PtrB = getPointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  Type *PtrATy = PtrA->getType()->getPointerElementType();
  Type *PtrBTy = PtrB->getType()->getPointerElementType();
  if (PtrA == PtrB ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  // Strip constant in-bounds GEP offsets from both sides, accumulating them
  // into OffsetA/OffsetB, so the comparison below is between base pointers.
  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == Size;

  // Compute the necessary base pointer delta to have the necessary final delta
  // equal to the size.
  APInt BaseDelta = Size - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.

  // Look through GEPs after checking they're the same except for the last
  // index.
  GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(getPointerOperand(A));
  GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(getPointerOperand(B));
  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
    return false;
  unsigned FinalIndex = GEPA->getNumOperands() - 1;
  for (unsigned i = 0; i < FinalIndex; i++)
    if (GEPA->getOperand(i) != GEPB->getOperand(i))
      return false;

  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpA || !OpB || OpA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding 1 to OpA won't overflow.
  // (If it could, the final indices might not actually be adjacent after the
  // extension.)
  bool Safe = false;
  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
  // we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
    if (Signed)
      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
    else
      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
  }

  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();

  // Second attempt:
  // If any bits are known to be zero other than the sign bit in OpA, we can
  // add 1 to it while guaranteeing no overflow of any sort.
  if (!Safe) {
    APInt KnownZero(BitWidth, 0);
    APInt KnownOne(BitWidth, 0);
    computeKnownBits(OpA, KnownZero, KnownOne, DL, 0, nullptr, OpA, &DT);
    // Mask off the sign bit; any remaining known-zero bit proves +1 can't
    // wrap.
    KnownZero &= ~APInt::getHighBitsSet(BitWidth, 1);
    if (KnownZero != 0)
      Safe = true;
  }

  if (!Safe)
    return false;

  // Finally check, via SCEV, that the narrow indices differ by exactly one.
  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
  return X2 == OffsetSCEVB;
}

void Vectorizer::reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  // Collect (transitively) every operand-defining instruction that does not
  // already dominate I; those must be hoisted above I.
  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      if (!DT.dominates(IM, I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
        assert(IM->getParent() == IW->getParent() &&
               "Instructions to move should be in the same basic block");
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    // Step back before unlinking IM so the loop's ++BBI lands on IM's
    // successor rather than on an invalidated iterator.
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}

// Single scan of the block to locate the first and last members of Chain in
// program order (Chain itself is in address order, which can differ).
std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1) {
      FirstInstr = I.getIterator();
    }
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
  return std::make_pair(FirstInstr, ++LastInstr);
}

void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getPointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    // Also collect the feeding GEP so it can be cleaned up once dead.
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}

std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
  unsigned ElemSizeInBytes = ElementSizeBits / 8;
  unsigned SizeInBytes = ElemSizeInBytes * Chain.size();
  // NumRight = number of trailing elements making up the odd remainder
  // (the part of the total size that is not a multiple of 4 bytes).
  unsigned NumRight = (SizeInBytes % 4) / ElemSizeInBytes;
  unsigned NumLeft = Chain.size() - NumRight;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}

ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<std::pair<Instruction *, unsigned>, 16> MemoryInstrs;
  SmallVector<std::pair<Instruction *, unsigned>, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  // Walk the instructions between the first and last element of Chain,
  // recording chain members and other memory instructions (with their
  // positions), and stop early at anything that could fault or write
  // (for loads) / touch memory at all (for stores).
  unsigned InstrIdx = 0;
  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    ++InstrIdx;
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (!is_contained(Chain, &I))
        MemoryInstrs.push_back({&I, InstrIdx});
      else
        ChainInstrs.push_back({&I, InstrIdx});
    } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
      DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
      break;
    } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
      DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                   << '\n');
      break;
    }
  }

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx, ChainInstrsLen;
  for (ChainInstrIdx = 0, ChainInstrsLen = ChainInstrs.size();
       ChainInstrIdx < ChainInstrsLen; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx].first;
    unsigned ChainInstrLoc = ChainInstrs[ChainInstrIdx].second;
    bool AliasFound = false;
    for (auto EntryMem : MemoryInstrs) {
      Instruction *MemInstr = EntryMem.first;
      unsigned MemInstrLoc = EntryMem.second;
      // Two loads never conflict with each other.
      if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
        continue;

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
      if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
          ChainInstrLoc < MemInstrLoc)
        continue;

      // Same case, but in reverse.
      if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
          ChainInstrLoc > MemInstrLoc)
        continue;

      if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
                        MemoryLocation::get(ChainInstr))) {
        DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    " Aliasing instruction and pointer:\n"
                 << " " << *MemInstr << '\n'
                 << " " << *getPointerOperand(MemInstr) << '\n'
                 << " Aliased instruction and pointer:\n"
                 << " " << *ChainInstr << '\n'
                 << " " << *getPointerOperand(ChainInstr) << '\n';
        });
        AliasFound = true;
        break;
      }
    }
    if (AliasFound)
      break;
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  auto VectorizableChainInstrs =
      makeArrayRef(ChainInstrs.data(), ChainInstrIdx);
  unsigned ChainIdx, ChainLen;
  for (ChainIdx = 0, ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    Instruction *I = Chain[ChainIdx];
    if (!any_of(VectorizableChainInstrs,
                [I](std::pair<Instruction *, unsigned> CI) {
                  return I == CI.first;
                }))
      break;
  }
  return Chain.slice(0, ChainIdx);
}

std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      // Only simple (non-atomic, non-volatile) loads are candidates.
      if (!LI->isSimple())
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if (TySize < 8)
        continue;

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2)
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !all_of(LI->users(), [LI](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // TODO: Target hook to filter types.

      // Save the load locations.
      Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
      LoadRefs[ObjPtr].push_back(LI);

    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      // Same set of filters as loads, applied to the stored value's type.
      if (!SI->isSimple())
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if (TySize < 8)
        continue;

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
      if (TySize > VecRegSize / 2)
        continue;

      if (isa<VectorType>(Ty) && !all_of(SI->users(), [SI](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save store location.
      Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
      StoreRefs[ObjPtr].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}

bool Vectorizer::vectorizeChains(InstrListMap &Map) {
  bool Changed = false;

  for (const std::pair<Value *, InstrList> &Chain : Map) {
    unsigned Size = Chain.second.size();
    if (Size < 2)
      continue;

    DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the stores in chunks of 64.
    // (vectorizeInstructions' ConsecutiveChain array is sized to match.)
    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
      unsigned Len = std::min<unsigned>(CE - CI, 64);
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
      Changed |= vectorizeInstructions(Chunk);
    }
  }

  return Changed;
}

bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
  SmallSetVector<int, 16> Heads, Tails;
  // ConsecutiveChain[i] is the index of the access that immediately follows
  // Instrs[i] in memory, or -1. Fixed at 64 entries because vectorizeChains
  // never passes a chunk larger than 64.
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given stores and find all of the pairs
  // of stores that follow each other.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.insert(j);
        Heads.insert(i);
        ConsecutiveChain[i] = j;
      }
    }
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
    // Skip this head if it is also the tail of some unprocessed chain; the
    // longer chain starting earlier will cover it.
    bool longerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        longerChainExists = true;
        break;
      }
    if (longerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Instruction *, 16> Operands;
    int I = Head;
    while (I != -1 && (Tails.count(I) || Heads.count(I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}

bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  // Pointer-typed values are rewritten to an equally-sized integer type.
  Type *StoreTy;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Store size should be 1B, 2B or multiple of 4B.
  // TODO: Target hook for size constraint?
  unsigned SzInBytes = (Sz / 8) * ChainSize;
  if (SzInBytes > 2 && SzInBytes % 4 != 0) {
    DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
                    "or multiple of 4B. Splitting.\n");
    if (SzInBytes == 3)
      return vectorizeStoreChain(Chain.slice(0, ChainSize - 1),
                                 InstructionsProcessed);

    auto Chains = splitOddVectorElts(Chain, Sz);
    // Note: non-short-circuiting | so both halves are always attempted.
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  VectorType *VecTy;
  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = VectorType::get(StoreTy->getScalarType(),
                            Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = VectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size, break it into two pieces.
  // TODO: Target hook to control types to split to.
  if (ChainSize > VF) {
    DEBUG(dbgs() << "LSV: Vector factor is too big."
                    " Creating two separate arrays.\n");
    return vectorizeStoreChain(Chain.slice(0, VF), InstructionsProcessed) |
           vectorizeStoreChain(Chain.slice(VF), InstructionsProcessed);
  }

  DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << " " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // Check alignment restrictions.
  unsigned Alignment = getAlignment(S0);

  // If the store is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (S0->getPointerAddressSpace() != 0)
      return false;

    // If we're storing to an object on the stack, we control its alignment,
    // so we can cheat and change it!
    Value *V = GetUnderlyingObject(S0->getPointerOperand(), DL);
    if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
      AI->setAlignment(TargetBaseAlign);
      Alignment = TargetBaseAlign;
    } else {
      return false;
    }
  }

  // Insert the vector store at the position of the last scalar store, so all
  // the stored values are available.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*Last);

  // Build the combined value by inserting each scalar (or each lane of each
  // small vector) into one wide vector.
  Value *Vec = UndefValue::get(VecTy);

  if (VecStoreTy) {
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Value *Insert =
            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
        Vec = Insert;
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      if (Extract->getType() != StoreTy->getScalarType())
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Value *Insert =
          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
      Vec = Insert;
    }
  }

  // This cast is safe because Builder.CreateStore() always creates a bona fide
  // StoreInst.
  StoreInst *SI = cast<StoreInst>(
      Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
                                                     VecTy->getPointerTo(AS))));
  propagateMetadata(SI, Chain);
  SI->setAlignment(Alignment);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::vectorizeLoadChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  LoadInst *L0 = cast<LoadInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole load.
  // Pointer-typed results are loaded as an equally-sized integer type.
  Type *LoadTy;
  for (const auto &V : Chain) {
    LoadTy = cast<LoadInst>(V)->getType();
    if (LoadTy->isIntOrIntVectorTy())
      break;

    if (LoadTy->isPtrOrPtrVectorTy()) {
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
                               DL.getTypeSizeInBits(LoadTy));
      break;
    }
  }

  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
  unsigned AS = L0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Load size should be 1B, 2B or multiple of 4B.
  // TODO: Should size constraint be a target hook?
  unsigned SzInBytes = (Sz / 8) * ChainSize;
  if (SzInBytes > 2 && SzInBytes % 4 != 0) {
    DEBUG(dbgs() << "LSV: Size should be 1B, 2B "
                    "or multiple of 4B. Splitting.\n");
    if (SzInBytes == 3)
      return vectorizeLoadChain(Chain.slice(0, ChainSize - 1),
                                InstructionsProcessed);
    auto Chains = splitOddVectorElts(Chain, Sz);
    // Note: non-short-circuiting | so both halves are always attempted.
    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
           vectorizeLoadChain(Chains.second, InstructionsProcessed);
  }

  VectorType *VecTy;
  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
  if (VecLoadTy)
    VecTy = VectorType::get(LoadTy->getScalarType(),
                            Chain.size() * VecLoadTy->getNumElements());
  else
    VecTy = VectorType::get(LoadTy, Chain.size());

  // If it's more than the max vector size, break it into two pieces.
  // TODO: Target hook to control types to split to.
  if (ChainSize > VF) {
    DEBUG(dbgs() << "LSV: Vector factor is too big. "
                    "Creating two separate arrays.\n");
    return vectorizeLoadChain(Chain.slice(0, VF), InstructionsProcessed) |
           vectorizeLoadChain(Chain.slice(VF), InstructionsProcessed);
  }

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // Check alignment restrictions.
  unsigned Alignment = getAlignment(L0);

  // If the load is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (L0->getPointerAddressSpace() != 0)
      return false;

    // If we're loading from an object on the stack, we control its alignment,
    // so we can cheat and change it!
    Value *V = GetUnderlyingObject(L0->getPointerOperand(), DL);
    if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V)) {
      AI->setAlignment(TargetBaseAlign);
      Alignment = TargetBaseAlign;
    } else {
      return false;
    }
  }

  DEBUG({
    dbgs() << "LSV: Loads to vectorize:\n";
    for (Instruction *I : Chain)
      I->dump();
  });

  // getVectorizablePrefix already computed getBoundaryInstrs. The value of
  // Last may have changed since then, but the value of First won't have. If it
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  // The wide load goes at the position of the first scalar load.
  Builder.SetInsertPoint(&*First);

  Value *Bitcast =
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
  // This cast is safe because Builder.CreateLoad always creates a bona fide
  // LoadInst.
  LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
  propagateMetadata(LI, Chain);
  LI->setAlignment(Alignment);

  if (VecLoadTy) {
    // Original loads were small vectors: rewrite each user extractelement to
    // pull the corresponding lane out of the wide load.
    SmallVector<Instruction *, 16> InstrsToErase;

    unsigned VecWidth = VecLoadTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      for (auto Use : Chain[I]->users()) {
        // All users of vector loads are ExtractElement instructions with
        // constant indices, otherwise we would have bailed before now.
        Instruction *UI = cast<Instruction>(Use);
        unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
        unsigned NewIdx = Idx + I * VecWidth;
        Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx));
        if (V->getType() != UI->getType())
          V = Builder.CreateBitCast(V, UI->getType());

        // Replace the old instruction.
        UI->replaceAllUsesWith(V);
        InstrsToErase.push_back(UI);
      }
    }

    // Bitcast might not be an Instruction, if the value being loaded is a
    // constant. In that case, no need to reorder anything.
    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);

    for (auto I : InstrsToErase)
      I->eraseFromParent();
  } else {
    // Scalar loads: each original load becomes an extract of one lane.
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(I));
      Value *CV = Chain[I];
      if (V->getType() != CV->getType()) {
        V = Builder.CreateBitOrPointerCast(V, CV->getType());
      }

      // Replace the old instruction.
      CV->replaceAllUsesWith(V);
    }

    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);
  }

  eraseInstructions(Chain);

  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                    unsigned Alignment) {
  bool Fast = false;
  bool Allows = TTI.allowsMisalignedMemoryAccesses(SzInBytes * 8, AddressSpace,
                                                   Alignment, &Fast);
  // TODO: Remove TargetBaseAlign
  // The access counts as misaligned only if the target does not support it
  // fast at this alignment AND the alignment is a multiple of neither the
  // access size nor TargetBaseAlign.
  return !(Allows && Fast) && (Alignment % SzInBytes) != 0 &&
         (Alignment % TargetBaseAlign) != 0;
}