1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass merges loads/stores to/from sequential memory addresses into vector
11 // loads/stores.  Although there's nothing GPU-specific in here, this pass is
12 // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
13 //
14 // (For simplicity below we talk about loads only, but everything also applies
15 // to stores.)
16 //
17 // This pass is intended to be run late in the pipeline, after other
18 // vectorization opportunities have been exploited.  So the assumption here is
19 // that immediately following our new vector load we'll need to extract out the
20 // individual elements of the load, so we can operate on them individually.
21 //
22 // On CPUs this transformation is usually not beneficial, because extracting the
23 // elements of a vector register is expensive on most architectures.  It's
24 // usually better just to load each element individually into its own scalar
25 // register.
26 //
27 // However, nVidia and AMD GPUs don't have proper vector registers.  Instead, a
28 // "vector load" loads directly into a series of scalar registers.  In effect,
29 // extracting the elements of the vector is free.  It's therefore always
30 // beneficial to vectorize a sequence of loads on these architectures.
31 //
32 // Vectorizing (perhaps a better name might be "coalescing") loads can have
33 // large performance impacts on GPU kernels, and opportunities for vectorizing
34 // are common in GPU code.  This pass tries very hard to find such
35 // opportunities; its runtime is quadratic in the number of loads in a BB.
36 //
37 // Some CPU architectures, such as ARM, have instructions that load into
38 // multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
39 // could use this pass (with some modifications), but currently it implements
40 // its own pass to do something similar to what we do here.
41 
42 #include "llvm/ADT/APInt.h"
43 #include "llvm/ADT/ArrayRef.h"
44 #include "llvm/ADT/MapVector.h"
45 #include "llvm/ADT/PostOrderIterator.h"
46 #include "llvm/ADT/STLExtras.h"
47 #include "llvm/ADT/SmallPtrSet.h"
48 #include "llvm/ADT/SmallVector.h"
49 #include "llvm/ADT/Statistic.h"
50 #include "llvm/ADT/iterator_range.h"
51 #include "llvm/Analysis/AliasAnalysis.h"
52 #include "llvm/Analysis/MemoryLocation.h"
53 #include "llvm/Analysis/OrderedBasicBlock.h"
54 #include "llvm/Analysis/ScalarEvolution.h"
55 #include "llvm/Analysis/TargetTransformInfo.h"
56 #include "llvm/Transforms/Utils/Local.h"
57 #include "llvm/Analysis/ValueTracking.h"
58 #include "llvm/Analysis/VectorUtils.h"
59 #include "llvm/IR/Attributes.h"
60 #include "llvm/IR/BasicBlock.h"
61 #include "llvm/IR/Constants.h"
62 #include "llvm/IR/DataLayout.h"
63 #include "llvm/IR/DerivedTypes.h"
64 #include "llvm/IR/Dominators.h"
65 #include "llvm/IR/Function.h"
66 #include "llvm/IR/IRBuilder.h"
67 #include "llvm/IR/InstrTypes.h"
68 #include "llvm/IR/Instruction.h"
69 #include "llvm/IR/Instructions.h"
70 #include "llvm/IR/IntrinsicInst.h"
71 #include "llvm/IR/Module.h"
72 #include "llvm/IR/Type.h"
73 #include "llvm/IR/User.h"
74 #include "llvm/IR/Value.h"
75 #include "llvm/Pass.h"
76 #include "llvm/Support/Casting.h"
77 #include "llvm/Support/Debug.h"
78 #include "llvm/Support/KnownBits.h"
79 #include "llvm/Support/MathExtras.h"
80 #include "llvm/Support/raw_ostream.h"
81 #include "llvm/Transforms/Vectorize.h"
82 #include <algorithm>
83 #include <cassert>
84 #include <cstdlib>
85 #include <tuple>
86 #include <utility>
87 
88 using namespace llvm;
89 
90 #define DEBUG_TYPE "load-store-vectorizer"
91 
92 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
93 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
94 
95 // FIXME: Assuming stack alignment of 4 is always good enough
96 static const unsigned StackAdjustedAlignment = 4;
97 
98 namespace {
99 
100 /// ChainID is an arbitrary token that is allowed to be different only for the
101 /// accesses that are guaranteed to be considered non-consecutive by
102 /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
103 /// together and reducing the number of instructions the main search operates on
104 /// at a time, i.e. this is to reduce compile time and nothing else as the main
105 /// search has O(n^2) time complexity. The underlying type of ChainID should not
106 /// be relied upon.
107 using ChainID = const Value *;
108 using InstrList = SmallVector<Instruction *, 8>;
109 using InstrListMap = MapVector<ChainID, InstrList>;
110 
111 class Vectorizer {
112   Function &F;
113   AliasAnalysis &AA;
114   DominatorTree &DT;
115   ScalarEvolution &SE;
116   TargetTransformInfo &TTI;
117   const DataLayout &DL;
118   IRBuilder<> Builder;
119 
120 public:
121   Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
122              ScalarEvolution &SE, TargetTransformInfo &TTI)
123       : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
124         DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
125 
126   bool run();
127 
128 private:
129   unsigned getPointerAddressSpace(Value *I);
130 
131   unsigned getAlignment(LoadInst *LI) const {
132     unsigned Align = LI->getAlignment();
133     if (Align != 0)
134       return Align;
135 
136     return DL.getABITypeAlignment(LI->getType());
137   }
138 
139   unsigned getAlignment(StoreInst *SI) const {
140     unsigned Align = SI->getAlignment();
141     if (Align != 0)
142       return Align;
143 
144     return DL.getABITypeAlignment(SI->getValueOperand()->getType());
145   }
146 
147   static const unsigned MaxDepth = 3;
148 
149   bool isConsecutiveAccess(Value *A, Value *B);
150   bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
151                               unsigned Depth = 0) const;
152   bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
153                                    unsigned Depth) const;
154   bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
155                           unsigned Depth) const;
156 
157   /// After vectorization, reorder the instructions that I depends on
158   /// (the instructions defining its operands), to ensure they dominate I.
159   void reorder(Instruction *I);
160 
161   /// Returns the first and the last instructions in Chain.
162   std::pair<BasicBlock::iterator, BasicBlock::iterator>
163   getBoundaryInstrs(ArrayRef<Instruction *> Chain);
164 
165   /// Erases the original instructions after vectorizing.
166   void eraseInstructions(ArrayRef<Instruction *> Chain);
167 
168   /// "Legalize" the vector type that would be produced by combining \p
169   /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
170   /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
171   /// expected to have more than 4 elements.
172   std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
173   splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
174 
175   /// Finds the largest prefix of Chain that's vectorizable, checking for
176   /// intervening instructions which may affect the memory accessed by the
177   /// instructions within Chain.
178   ///
179   /// The elements of \p Chain must be all loads or all stores and must be in
180   /// address order.
181   ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
182 
183   /// Collects load and store instructions to vectorize.
184   std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
185 
186   /// Processes the collected instructions, the \p Map. The values of \p Map
187   /// should be all loads or all stores.
188   bool vectorizeChains(InstrListMap &Map);
189 
190   /// Finds the load/stores to consecutive memory addresses and vectorizes them.
191   bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
192 
193   /// Vectorizes the load instructions in Chain.
194   bool
195   vectorizeLoadChain(ArrayRef<Instruction *> Chain,
196                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
197 
198   /// Vectorizes the store instructions in Chain.
199   bool
200   vectorizeStoreChain(ArrayRef<Instruction *> Chain,
201                       SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
202 
203   /// Check if this load/store access is misaligned accesses.
204   bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
205                           unsigned Alignment);
206 };
207 
208 class LoadStoreVectorizer : public FunctionPass {
209 public:
210   static char ID;
211 
212   LoadStoreVectorizer() : FunctionPass(ID) {
213     initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
214   }
215 
216   bool runOnFunction(Function &F) override;
217 
218   StringRef getPassName() const override {
219     return "GPU Load and Store Vectorizer";
220   }
221 
222   void getAnalysisUsage(AnalysisUsage &AU) const override {
223     AU.addRequired<AAResultsWrapperPass>();
224     AU.addRequired<ScalarEvolutionWrapperPass>();
225     AU.addRequired<DominatorTreeWrapperPass>();
226     AU.addRequired<TargetTransformInfoWrapperPass>();
227     AU.setPreservesCFG();
228   }
229 };
230 
231 } // end anonymous namespace
232 
233 char LoadStoreVectorizer::ID = 0;
234 
235 INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
236                       "Vectorize load and Store instructions", false, false)
237 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
238 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
239 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
240 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
241 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
242 INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
243                     "Vectorize load and store instructions", false, false)
244 
245 Pass *llvm::createLoadStoreVectorizerPass() {
246   return new LoadStoreVectorizer();
247 }
248 
249 // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
250 // vectors of Instructions.
251 static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
252   SmallVector<Value *, 8> VL(IL.begin(), IL.end());
253   propagateMetadata(I, VL);
254 }
255 
256 bool LoadStoreVectorizer::runOnFunction(Function &F) {
257   // Don't vectorize when the attribute NoImplicitFloat is used.
258   if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
259     return false;
260 
261   AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
262   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
263   ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
264   TargetTransformInfo &TTI =
265       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
266 
267   Vectorizer V(F, AA, DT, SE, TTI);
268   return V.run();
269 }
270 
271 // Vectorizer Implementation
272 bool Vectorizer::run() {
273   bool Changed = false;
274 
275   // Scan the blocks in the function in post order.
276   for (BasicBlock *BB : post_order(&F)) {
277     InstrListMap LoadRefs, StoreRefs;
278     std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
279     Changed |= vectorizeChains(LoadRefs);
280     Changed |= vectorizeChains(StoreRefs);
281   }
282 
283   return Changed;
284 }
285 
286 unsigned Vectorizer::getPointerAddressSpace(Value *I) {
287   if (LoadInst *L = dyn_cast<LoadInst>(I))
288     return L->getPointerAddressSpace();
289   if (StoreInst *S = dyn_cast<StoreInst>(I))
290     return S->getPointerAddressSpace();
291   return -1;
292 }
293 
294 // FIXME: Merge with llvm::isConsecutiveAccess
295 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
296   Value *PtrA = getLoadStorePointerOperand(A);
297   Value *PtrB = getLoadStorePointerOperand(B);
298   unsigned ASA = getPointerAddressSpace(A);
299   unsigned ASB = getPointerAddressSpace(B);
300 
301   // Check that the address spaces match and that the pointers are valid.
302   if (!PtrA || !PtrB || (ASA != ASB))
303     return false;
304 
305   // Make sure that A and B are different pointers of the same size type.
306   Type *PtrATy = PtrA->getType()->getPointerElementType();
307   Type *PtrBTy = PtrB->getType()->getPointerElementType();
308   if (PtrA == PtrB ||
309       PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
310       DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
311       DL.getTypeStoreSize(PtrATy->getScalarType()) !=
312           DL.getTypeStoreSize(PtrBTy->getScalarType()))
313     return false;
314 
315   unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
316   APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
317 
318   return areConsecutivePointers(PtrA, PtrB, Size);
319 }
320 
321 bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
322                                         const APInt &PtrDelta,
323                                         unsigned Depth) const {
324   unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
325   APInt OffsetA(PtrBitWidth, 0);
326   APInt OffsetB(PtrBitWidth, 0);
327   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
328   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
329 
330   APInt OffsetDelta = OffsetB - OffsetA;
331 
332   // Check if they are based on the same pointer. That makes the offsets
333   // sufficient.
334   if (PtrA == PtrB)
335     return OffsetDelta == PtrDelta;
336 
337   // Compute the necessary base pointer delta to have the necessary final delta
338   // equal to the pointer delta requested.
339   APInt BaseDelta = PtrDelta - OffsetDelta;
340 
341   // Compute the distance with SCEV between the base pointers.
342   const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
343   const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
344   const SCEV *C = SE.getConstant(BaseDelta);
345   const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
346   if (X == PtrSCEVB)
347     return true;
348 
349   // The above check will not catch the cases where one of the pointers is
350   // factorized but the other one is not, such as (C + (S * (A + B))) vs
351   // (AS + BS). Get the minus scev. That will allow re-combining the expresions
352   // and getting the simplified difference.
353   const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
354   if (C == Dist)
355     return true;
356 
357   // Sometimes even this doesn't work, because SCEV can't always see through
358   // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
359   // things the hard way.
360   return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
361 }
362 
363 bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
364                                              APInt PtrDelta,
365                                              unsigned Depth) const {
366   auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
367   auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
368   if (!GEPA || !GEPB)
369     return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
370 
371   // Look through GEPs after checking they're the same except for the last
372   // index.
373   if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
374       GEPA->getPointerOperand() != GEPB->getPointerOperand())
375     return false;
376   gep_type_iterator GTIA = gep_type_begin(GEPA);
377   gep_type_iterator GTIB = gep_type_begin(GEPB);
378   for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
379     if (GTIA.getOperand() != GTIB.getOperand())
380       return false;
381     ++GTIA;
382     ++GTIB;
383   }
384 
385   Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
386   Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
387   if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
388       OpA->getType() != OpB->getType())
389     return false;
390 
391   if (PtrDelta.isNegative()) {
392     if (PtrDelta.isMinSignedValue())
393       return false;
394     PtrDelta.negate();
395     std::swap(OpA, OpB);
396   }
397   uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
398   if (PtrDelta.urem(Stride) != 0)
399     return false;
400   unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
401   APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
402 
403   // Only look through a ZExt/SExt.
404   if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
405     return false;
406 
407   bool Signed = isa<SExtInst>(OpA);
408 
409   // At this point A could be a function parameter, i.e. not an instruction
410   Value *ValA = OpA->getOperand(0);
411   OpB = dyn_cast<Instruction>(OpB->getOperand(0));
412   if (!OpB || ValA->getType() != OpB->getType())
413     return false;
414 
415   // Now we need to prove that adding IdxDiff to ValA won't overflow.
416   bool Safe = false;
417   // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
418   // ValA, we're okay.
419   if (OpB->getOpcode() == Instruction::Add &&
420       isa<ConstantInt>(OpB->getOperand(1)) &&
421       IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
422     if (Signed)
423       Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
424     else
425       Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
426   }
427 
428   unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
429 
430   // Second attempt:
431   // If all set bits of IdxDiff or any higher order bit other than the sign bit
432   // are known to be zero in ValA, we can add Diff to it while guaranteeing no
433   // overflow of any sort.
434   if (!Safe) {
435     OpA = dyn_cast<Instruction>(ValA);
436     if (!OpA)
437       return false;
438     KnownBits Known(BitWidth);
439     computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
440     APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
441     if (Signed)
442       BitsAllowedToBeSet.clearBit(BitWidth - 1);
443     if (BitsAllowedToBeSet.ult(IdxDiff))
444       return false;
445   }
446 
447   const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
448   const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
449   const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
450   const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
451   return X == OffsetSCEVB;
452 }
453 
454 bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
455                                     const APInt &PtrDelta,
456                                     unsigned Depth) const {
457   if (Depth++ == MaxDepth)
458     return false;
459 
460   if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
461     if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
462       return SelectA->getCondition() == SelectB->getCondition() &&
463              areConsecutivePointers(SelectA->getTrueValue(),
464                                     SelectB->getTrueValue(), PtrDelta, Depth) &&
465              areConsecutivePointers(SelectA->getFalseValue(),
466                                     SelectB->getFalseValue(), PtrDelta, Depth);
467     }
468   }
469   return false;
470 }
471 
472 void Vectorizer::reorder(Instruction *I) {
473   OrderedBasicBlock OBB(I->getParent());
474   SmallPtrSet<Instruction *, 16> InstructionsToMove;
475   SmallVector<Instruction *, 16> Worklist;
476 
477   Worklist.push_back(I);
478   while (!Worklist.empty()) {
479     Instruction *IW = Worklist.pop_back_val();
480     int NumOperands = IW->getNumOperands();
481     for (int i = 0; i < NumOperands; i++) {
482       Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
483       if (!IM || IM->getOpcode() == Instruction::PHI)
484         continue;
485 
486       // If IM is in another BB, no need to move it, because this pass only
487       // vectorizes instructions within one BB.
488       if (IM->getParent() != I->getParent())
489         continue;
490 
491       if (!OBB.dominates(IM, I)) {
492         InstructionsToMove.insert(IM);
493         Worklist.push_back(IM);
494       }
495     }
496   }
497 
498   // All instructions to move should follow I. Start from I, not from begin().
499   for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
500        ++BBI) {
501     if (!InstructionsToMove.count(&*BBI))
502       continue;
503     Instruction *IM = &*BBI;
504     --BBI;
505     IM->removeFromParent();
506     IM->insertBefore(I);
507   }
508 }
509 
510 std::pair<BasicBlock::iterator, BasicBlock::iterator>
511 Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
512   Instruction *C0 = Chain[0];
513   BasicBlock::iterator FirstInstr = C0->getIterator();
514   BasicBlock::iterator LastInstr = C0->getIterator();
515 
516   BasicBlock *BB = C0->getParent();
517   unsigned NumFound = 0;
518   for (Instruction &I : *BB) {
519     if (!is_contained(Chain, &I))
520       continue;
521 
522     ++NumFound;
523     if (NumFound == 1) {
524       FirstInstr = I.getIterator();
525     }
526     if (NumFound == Chain.size()) {
527       LastInstr = I.getIterator();
528       break;
529     }
530   }
531 
532   // Range is [first, last).
533   return std::make_pair(FirstInstr, ++LastInstr);
534 }
535 
536 void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
537   SmallVector<Instruction *, 16> Instrs;
538   for (Instruction *I : Chain) {
539     Value *PtrOperand = getLoadStorePointerOperand(I);
540     assert(PtrOperand && "Instruction must have a pointer operand.");
541     Instrs.push_back(I);
542     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
543       Instrs.push_back(GEP);
544   }
545 
546   // Erase instructions.
547   for (Instruction *I : Instrs)
548     if (I->use_empty())
549       I->eraseFromParent();
550 }
551 
552 std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
553 Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
554                                unsigned ElementSizeBits) {
555   unsigned ElementSizeBytes = ElementSizeBits / 8;
556   unsigned SizeBytes = ElementSizeBytes * Chain.size();
557   unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
558   if (NumLeft == Chain.size()) {
559     if ((NumLeft & 1) == 0)
560       NumLeft /= 2; // Split even in half
561     else
562       --NumLeft;    // Split off last element
563   } else if (NumLeft == 0)
564     NumLeft = 1;
565   return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
566 }
567 
568 ArrayRef<Instruction *>
569 Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
570   // These are in BB order, unlike Chain, which is in address order.
571   SmallVector<Instruction *, 16> MemoryInstrs;
572   SmallVector<Instruction *, 16> ChainInstrs;
573 
574   bool IsLoadChain = isa<LoadInst>(Chain[0]);
575   LLVM_DEBUG({
576     for (Instruction *I : Chain) {
577       if (IsLoadChain)
578         assert(isa<LoadInst>(I) &&
579                "All elements of Chain must be loads, or all must be stores.");
580       else
581         assert(isa<StoreInst>(I) &&
582                "All elements of Chain must be loads, or all must be stores.");
583     }
584   });
585 
586   for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
587     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
588       if (!is_contained(Chain, &I))
589         MemoryInstrs.push_back(&I);
590       else
591         ChainInstrs.push_back(&I);
592     } else if (isa<IntrinsicInst>(&I) &&
593                cast<IntrinsicInst>(&I)->getIntrinsicID() ==
594                    Intrinsic::sideeffect) {
595       // Ignore llvm.sideeffect calls.
596     } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
597       LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
598                         << '\n');
599       break;
600     } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
601       LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
602                         << '\n');
603       break;
604     }
605   }
606 
607   OrderedBasicBlock OBB(Chain[0]->getParent());
608 
609   // Loop until we find an instruction in ChainInstrs that we can't vectorize.
610   unsigned ChainInstrIdx = 0;
611   Instruction *BarrierMemoryInstr = nullptr;
612 
613   for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
614     Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
615 
616     // If a barrier memory instruction was found, chain instructions that follow
617     // will not be added to the valid prefix.
618     if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
619       break;
620 
621     // Check (in BB order) if any instruction prevents ChainInstr from being
622     // vectorized. Find and store the first such "conflicting" instruction.
623     for (Instruction *MemInstr : MemoryInstrs) {
624       // If a barrier memory instruction was found, do not check past it.
625       if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
626         break;
627 
628       auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
629       auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
630       if (MemLoad && ChainLoad)
631         continue;
632 
633       // We can ignore the alias if the we have a load store pair and the load
634       // is known to be invariant. The load cannot be clobbered by the store.
635       auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
636         return LI->getMetadata(LLVMContext::MD_invariant_load);
637       };
638 
639       // We can ignore the alias as long as the load comes before the store,
640       // because that means we won't be moving the load past the store to
641       // vectorize it (the vectorized load is inserted at the location of the
642       // first load in the chain).
643       if (isa<StoreInst>(MemInstr) && ChainLoad &&
644           (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
645         continue;
646 
647       // Same case, but in reverse.
648       if (MemLoad && isa<StoreInst>(ChainInstr) &&
649           (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
650         continue;
651 
652       if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
653                         MemoryLocation::get(ChainInstr))) {
654         LLVM_DEBUG({
655           dbgs() << "LSV: Found alias:\n"
656                     "  Aliasing instruction and pointer:\n"
657                  << "  " << *MemInstr << '\n'
658                  << "  " << *getLoadStorePointerOperand(MemInstr) << '\n'
659                  << "  Aliased instruction and pointer:\n"
660                  << "  " << *ChainInstr << '\n'
661                  << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
662         });
663         // Save this aliasing memory instruction as a barrier, but allow other
664         // instructions that precede the barrier to be vectorized with this one.
665         BarrierMemoryInstr = MemInstr;
666         break;
667       }
668     }
669     // Continue the search only for store chains, since vectorizing stores that
670     // precede an aliasing load is valid. Conversely, vectorizing loads is valid
671     // up to an aliasing store, but should not pull loads from further down in
672     // the basic block.
673     if (IsLoadChain && BarrierMemoryInstr) {
674       // The BarrierMemoryInstr is a store that precedes ChainInstr.
675       assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
676       break;
677     }
678   }
679 
680   // Find the largest prefix of Chain whose elements are all in
681   // ChainInstrs[0, ChainInstrIdx).  This is the largest vectorizable prefix of
682   // Chain.  (Recall that Chain is in address order, but ChainInstrs is in BB
683   // order.)
684   SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
685       ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
686   unsigned ChainIdx = 0;
687   for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
688     if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
689       break;
690   }
691   return Chain.slice(0, ChainIdx);
692 }
693 
694 static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
695   const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
696   if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
697     // The select's themselves are distinct instructions even if they share the
698     // same condition and evaluate to consecutive pointers for true and false
699     // values of the condition. Therefore using the select's themselves for
700     // grouping instructions would put consecutive accesses into different lists
701     // and they won't be even checked for being consecutive, and won't be
702     // vectorized.
703     return Sel->getCondition();
704   }
705   return ObjPtr;
706 }
707 
708 std::pair<InstrListMap, InstrListMap>
709 Vectorizer::collectInstructions(BasicBlock *BB) {
710   InstrListMap LoadRefs;
711   InstrListMap StoreRefs;
712 
713   for (Instruction &I : *BB) {
714     if (!I.mayReadOrWriteMemory())
715       continue;
716 
717     if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
718       if (!LI->isSimple())
719         continue;
720 
721       // Skip if it's not legal.
722       if (!TTI.isLegalToVectorizeLoad(LI))
723         continue;
724 
725       Type *Ty = LI->getType();
726       if (!VectorType::isValidElementType(Ty->getScalarType()))
727         continue;
728 
729       // Skip weird non-byte sizes. They probably aren't worth the effort of
730       // handling correctly.
731       unsigned TySize = DL.getTypeSizeInBits(Ty);
732       if ((TySize % 8) != 0)
733         continue;
734 
735       // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
736       // functions are currently using an integer type for the vectorized
737       // load/store, and does not support casting between the integer type and a
738       // vector of pointers (e.g. i64 to <2 x i16*>)
739       if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
740         continue;
741 
742       Value *Ptr = LI->getPointerOperand();
743       unsigned AS = Ptr->getType()->getPointerAddressSpace();
744       unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
745 
746       unsigned VF = VecRegSize / TySize;
747       VectorType *VecTy = dyn_cast<VectorType>(Ty);
748 
749       // No point in looking at these if they're too big to vectorize.
750       if (TySize > VecRegSize / 2 ||
751           (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
752         continue;
753 
754       // Make sure all the users of a vector are constant-index extracts.
755       if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
756             const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
757             return EEI && isa<ConstantInt>(EEI->getOperand(1));
758           }))
759         continue;
760 
761       // Save the load locations.
762       const ChainID ID = getChainID(Ptr, DL);
763       LoadRefs[ID].push_back(LI);
764     } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
765       if (!SI->isSimple())
766         continue;
767 
768       // Skip if it's not legal.
769       if (!TTI.isLegalToVectorizeStore(SI))
770         continue;
771 
772       Type *Ty = SI->getValueOperand()->getType();
773       if (!VectorType::isValidElementType(Ty->getScalarType()))
774         continue;
775 
776       // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
777       // functions are currently using an integer type for the vectorized
778       // load/store, and does not support casting between the integer type and a
779       // vector of pointers (e.g. i64 to <2 x i16*>)
780       if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
781         continue;
782 
783       // Skip weird non-byte sizes. They probably aren't worth the effort of
784       // handling correctly.
785       unsigned TySize = DL.getTypeSizeInBits(Ty);
786       if ((TySize % 8) != 0)
787         continue;
788 
789       Value *Ptr = SI->getPointerOperand();
790       unsigned AS = Ptr->getType()->getPointerAddressSpace();
791       unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
792 
793       unsigned VF = VecRegSize / TySize;
794       VectorType *VecTy = dyn_cast<VectorType>(Ty);
795 
796       // No point in looking at these if they're too big to vectorize.
797       if (TySize > VecRegSize / 2 ||
798           (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
799         continue;
800 
801       if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
802             const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
803             return EEI && isa<ConstantInt>(EEI->getOperand(1));
804           }))
805         continue;
806 
807       // Save store location.
808       const ChainID ID = getChainID(Ptr, DL);
809       StoreRefs[ID].push_back(SI);
810     }
811   }
812 
813   return {LoadRefs, StoreRefs};
814 }
815 
816 bool Vectorizer::vectorizeChains(InstrListMap &Map) {
817   bool Changed = false;
818 
819   for (const std::pair<ChainID, InstrList> &Chain : Map) {
820     unsigned Size = Chain.second.size();
821     if (Size < 2)
822       continue;
823 
824     LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
825 
826     // Process the stores in chunks of 64.
827     for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
828       unsigned Len = std::min<unsigned>(CE - CI, 64);
829       ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
830       Changed |= vectorizeInstructions(Chunk);
831     }
832   }
833 
834   return Changed;
835 }
836 
837 bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
838   LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
839                     << " instructions.\n");
840   SmallVector<int, 16> Heads, Tails;
841   int ConsecutiveChain[64];
842 
843   // Do a quadratic search on all of the given loads/stores and find all of the
844   // pairs of loads/stores that follow each other.
845   for (int i = 0, e = Instrs.size(); i < e; ++i) {
846     ConsecutiveChain[i] = -1;
847     for (int j = e - 1; j >= 0; --j) {
848       if (i == j)
849         continue;
850 
851       if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
852         if (ConsecutiveChain[i] != -1) {
853           int CurDistance = std::abs(ConsecutiveChain[i] - i);
854           int NewDistance = std::abs(ConsecutiveChain[i] - j);
855           if (j < i || NewDistance > CurDistance)
856             continue; // Should not insert.
857         }
858 
859         Tails.push_back(j);
860         Heads.push_back(i);
861         ConsecutiveChain[i] = j;
862       }
863     }
864   }
865 
866   bool Changed = false;
867   SmallPtrSet<Instruction *, 16> InstructionsProcessed;
868 
869   for (int Head : Heads) {
870     if (InstructionsProcessed.count(Instrs[Head]))
871       continue;
872     bool LongerChainExists = false;
873     for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
874       if (Head == Tails[TIt] &&
875           !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
876         LongerChainExists = true;
877         break;
878       }
879     if (LongerChainExists)
880       continue;
881 
882     // We found an instr that starts a chain. Now follow the chain and try to
883     // vectorize it.
884     SmallVector<Instruction *, 16> Operands;
885     int I = Head;
886     while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
887       if (InstructionsProcessed.count(Instrs[I]))
888         break;
889 
890       Operands.push_back(Instrs[I]);
891       I = ConsecutiveChain[I];
892     }
893 
894     bool Vectorized = false;
895     if (isa<LoadInst>(*Operands.begin()))
896       Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
897     else
898       Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
899 
900     Changed |= Vectorized;
901   }
902 
903   return Changed;
904 }
905 
906 bool Vectorizer::vectorizeStoreChain(
907     ArrayRef<Instruction *> Chain,
908     SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
909   StoreInst *S0 = cast<StoreInst>(Chain[0]);
910 
911   // If the vector has an int element, default to int for the whole store.
912   Type *StoreTy;
913   for (Instruction *I : Chain) {
914     StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
915     if (StoreTy->isIntOrIntVectorTy())
916       break;
917 
918     if (StoreTy->isPtrOrPtrVectorTy()) {
919       StoreTy = Type::getIntNTy(F.getParent()->getContext(),
920                                 DL.getTypeSizeInBits(StoreTy));
921       break;
922     }
923   }
924 
925   unsigned Sz = DL.getTypeSizeInBits(StoreTy);
926   unsigned AS = S0->getPointerAddressSpace();
927   unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
928   unsigned VF = VecRegSize / Sz;
929   unsigned ChainSize = Chain.size();
930   unsigned Alignment = getAlignment(S0);
931 
932   if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
933     InstructionsProcessed->insert(Chain.begin(), Chain.end());
934     return false;
935   }
936 
937   ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
938   if (NewChain.empty()) {
939     // No vectorization possible.
940     InstructionsProcessed->insert(Chain.begin(), Chain.end());
941     return false;
942   }
943   if (NewChain.size() == 1) {
944     // Failed after the first instruction. Discard it and try the smaller chain.
945     InstructionsProcessed->insert(NewChain.front());
946     return false;
947   }
948 
949   // Update Chain to the valid vectorizable subchain.
950   Chain = NewChain;
951   ChainSize = Chain.size();
952 
953   // Check if it's legal to vectorize this chain. If not, split the chain and
954   // try again.
955   unsigned EltSzInBytes = Sz / 8;
956   unsigned SzInBytes = EltSzInBytes * ChainSize;
957 
958   VectorType *VecTy;
959   VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
960   if (VecStoreTy)
961     VecTy = VectorType::get(StoreTy->getScalarType(),
962                             Chain.size() * VecStoreTy->getNumElements());
963   else
964     VecTy = VectorType::get(StoreTy, Chain.size());
965 
966   // If it's more than the max vector size or the target has a better
967   // vector factor, break it into two pieces.
968   unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
969   if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
970     LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
971                          " Creating two separate arrays.\n");
972     return vectorizeStoreChain(Chain.slice(0, TargetVF),
973                                InstructionsProcessed) |
974            vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
975   }
976 
977   LLVM_DEBUG({
978     dbgs() << "LSV: Stores to vectorize:\n";
979     for (Instruction *I : Chain)
980       dbgs() << "  " << *I << "\n";
981   });
982 
983   // We won't try again to vectorize the elements of the chain, regardless of
984   // whether we succeed below.
985   InstructionsProcessed->insert(Chain.begin(), Chain.end());
986 
987   // If the store is going to be misaligned, don't vectorize it.
988   if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
989     if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
990       auto Chains = splitOddVectorElts(Chain, Sz);
991       return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
992              vectorizeStoreChain(Chains.second, InstructionsProcessed);
993     }
994 
995     unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
996                                                    StackAdjustedAlignment,
997                                                    DL, S0, nullptr, &DT);
998     if (NewAlign != 0)
999       Alignment = NewAlign;
1000   }
1001 
1002   if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
1003     auto Chains = splitOddVectorElts(Chain, Sz);
1004     return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
1005            vectorizeStoreChain(Chains.second, InstructionsProcessed);
1006   }
1007 
1008   BasicBlock::iterator First, Last;
1009   std::tie(First, Last) = getBoundaryInstrs(Chain);
1010   Builder.SetInsertPoint(&*Last);
1011 
1012   Value *Vec = UndefValue::get(VecTy);
1013 
1014   if (VecStoreTy) {
1015     unsigned VecWidth = VecStoreTy->getNumElements();
1016     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1017       StoreInst *Store = cast<StoreInst>(Chain[I]);
1018       for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
1019         unsigned NewIdx = J + I * VecWidth;
1020         Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
1021                                                       Builder.getInt32(J));
1022         if (Extract->getType() != StoreTy->getScalarType())
1023           Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
1024 
1025         Value *Insert =
1026             Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
1027         Vec = Insert;
1028       }
1029     }
1030   } else {
1031     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1032       StoreInst *Store = cast<StoreInst>(Chain[I]);
1033       Value *Extract = Store->getValueOperand();
1034       if (Extract->getType() != StoreTy->getScalarType())
1035         Extract =
1036             Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
1037 
1038       Value *Insert =
1039           Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
1040       Vec = Insert;
1041     }
1042   }
1043 
1044   StoreInst *SI = Builder.CreateAlignedStore(
1045     Vec,
1046     Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
1047     Alignment);
1048   propagateMetadata(SI, Chain);
1049 
1050   eraseInstructions(Chain);
1051   ++NumVectorInstructions;
1052   NumScalarsVectorized += Chain.size();
1053   return true;
1054 }
1055 
1056 bool Vectorizer::vectorizeLoadChain(
1057     ArrayRef<Instruction *> Chain,
1058     SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
1059   LoadInst *L0 = cast<LoadInst>(Chain[0]);
1060 
1061   // If the vector has an int element, default to int for the whole load.
1062   Type *LoadTy;
1063   for (const auto &V : Chain) {
1064     LoadTy = cast<LoadInst>(V)->getType();
1065     if (LoadTy->isIntOrIntVectorTy())
1066       break;
1067 
1068     if (LoadTy->isPtrOrPtrVectorTy()) {
1069       LoadTy = Type::getIntNTy(F.getParent()->getContext(),
1070                                DL.getTypeSizeInBits(LoadTy));
1071       break;
1072     }
1073   }
1074 
1075   unsigned Sz = DL.getTypeSizeInBits(LoadTy);
1076   unsigned AS = L0->getPointerAddressSpace();
1077   unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1078   unsigned VF = VecRegSize / Sz;
1079   unsigned ChainSize = Chain.size();
1080   unsigned Alignment = getAlignment(L0);
1081 
1082   if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1083     InstructionsProcessed->insert(Chain.begin(), Chain.end());
1084     return false;
1085   }
1086 
1087   ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1088   if (NewChain.empty()) {
1089     // No vectorization possible.
1090     InstructionsProcessed->insert(Chain.begin(), Chain.end());
1091     return false;
1092   }
1093   if (NewChain.size() == 1) {
1094     // Failed after the first instruction. Discard it and try the smaller chain.
1095     InstructionsProcessed->insert(NewChain.front());
1096     return false;
1097   }
1098 
1099   // Update Chain to the valid vectorizable subchain.
1100   Chain = NewChain;
1101   ChainSize = Chain.size();
1102 
1103   // Check if it's legal to vectorize this chain. If not, split the chain and
1104   // try again.
1105   unsigned EltSzInBytes = Sz / 8;
1106   unsigned SzInBytes = EltSzInBytes * ChainSize;
1107   VectorType *VecTy;
1108   VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
1109   if (VecLoadTy)
1110     VecTy = VectorType::get(LoadTy->getScalarType(),
1111                             Chain.size() * VecLoadTy->getNumElements());
1112   else
1113     VecTy = VectorType::get(LoadTy, Chain.size());
1114 
1115   // If it's more than the max vector size or the target has a better
1116   // vector factor, break it into two pieces.
1117   unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1118   if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1119     LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1120                          " Creating two separate arrays.\n");
1121     return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
1122            vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
1123   }
1124 
1125   // We won't try again to vectorize the elements of the chain, regardless of
1126   // whether we succeed below.
1127   InstructionsProcessed->insert(Chain.begin(), Chain.end());
1128 
1129   // If the load is going to be misaligned, don't vectorize it.
1130   if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1131     if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1132       auto Chains = splitOddVectorElts(Chain, Sz);
1133       return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1134              vectorizeLoadChain(Chains.second, InstructionsProcessed);
1135     }
1136 
1137     unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
1138                                                    StackAdjustedAlignment,
1139                                                    DL, L0, nullptr, &DT);
1140     if (NewAlign != 0)
1141       Alignment = NewAlign;
1142 
1143     Alignment = NewAlign;
1144   }
1145 
1146   if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1147     auto Chains = splitOddVectorElts(Chain, Sz);
1148     return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1149            vectorizeLoadChain(Chains.second, InstructionsProcessed);
1150   }
1151 
1152   LLVM_DEBUG({
1153     dbgs() << "LSV: Loads to vectorize:\n";
1154     for (Instruction *I : Chain)
1155       I->dump();
1156   });
1157 
1158   // getVectorizablePrefix already computed getBoundaryInstrs.  The value of
1159   // Last may have changed since then, but the value of First won't have.  If it
1160   // matters, we could compute getBoundaryInstrs only once and reuse it here.
1161   BasicBlock::iterator First, Last;
1162   std::tie(First, Last) = getBoundaryInstrs(Chain);
1163   Builder.SetInsertPoint(&*First);
1164 
1165   Value *Bitcast =
1166       Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1167   LoadInst *LI = Builder.CreateAlignedLoad(Bitcast, Alignment);
1168   propagateMetadata(LI, Chain);
1169 
1170   if (VecLoadTy) {
1171     SmallVector<Instruction *, 16> InstrsToErase;
1172 
1173     unsigned VecWidth = VecLoadTy->getNumElements();
1174     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1175       for (auto Use : Chain[I]->users()) {
1176         // All users of vector loads are ExtractElement instructions with
1177         // constant indices, otherwise we would have bailed before now.
1178         Instruction *UI = cast<Instruction>(Use);
1179         unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
1180         unsigned NewIdx = Idx + I * VecWidth;
1181         Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
1182                                                 UI->getName());
1183         if (V->getType() != UI->getType())
1184           V = Builder.CreateBitCast(V, UI->getType());
1185 
1186         // Replace the old instruction.
1187         UI->replaceAllUsesWith(V);
1188         InstrsToErase.push_back(UI);
1189       }
1190     }
1191 
1192     // Bitcast might not be an Instruction, if the value being loaded is a
1193     // constant.  In that case, no need to reorder anything.
1194     if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1195       reorder(BitcastInst);
1196 
1197     for (auto I : InstrsToErase)
1198       I->eraseFromParent();
1199   } else {
1200     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1201       Value *CV = Chain[I];
1202       Value *V =
1203           Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1204       if (V->getType() != CV->getType()) {
1205         V = Builder.CreateBitOrPointerCast(V, CV->getType());
1206       }
1207 
1208       // Replace the old instruction.
1209       CV->replaceAllUsesWith(V);
1210     }
1211 
1212     if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1213       reorder(BitcastInst);
1214   }
1215 
1216   eraseInstructions(Chain);
1217 
1218   ++NumVectorInstructions;
1219   NumScalarsVectorized += Chain.size();
1220   return true;
1221 }
1222 
1223 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
1224                                     unsigned Alignment) {
1225   if (Alignment % SzInBytes == 0)
1226     return false;
1227 
1228   bool Fast = false;
1229   bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1230                                                    SzInBytes * 8, AddressSpace,
1231                                                    Alignment, &Fast);
1232   LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1233                     << " and fast? " << Fast << "\n";);
1234   return !Allows || !Fast;
1235 }
1236