1 //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass merges loads/stores to/from sequential memory addresses into vector
11 // loads/stores.  Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of NVIDIA and AMD GPUs.
13 //
14 // (For simplicity below we talk about loads only, but everything also applies
15 // to stores.)
16 //
17 // This pass is intended to be run late in the pipeline, after other
18 // vectorization opportunities have been exploited.  So the assumption here is
19 // that immediately following our new vector load we'll need to extract out the
20 // individual elements of the load, so we can operate on them individually.
21 //
22 // On CPUs this transformation is usually not beneficial, because extracting the
23 // elements of a vector register is expensive on most architectures.  It's
24 // usually better just to load each element individually into its own scalar
25 // register.
26 //
// However, NVIDIA and AMD GPUs don't have proper vector registers.  Instead, a
28 // "vector load" loads directly into a series of scalar registers.  In effect,
29 // extracting the elements of the vector is free.  It's therefore always
30 // beneficial to vectorize a sequence of loads on these architectures.
31 //
32 // Vectorizing (perhaps a better name might be "coalescing") loads can have
33 // large performance impacts on GPU kernels, and opportunities for vectorizing
34 // are common in GPU code.  This pass tries very hard to find such
35 // opportunities; its runtime is quadratic in the number of loads in a BB.
36 //
37 // Some CPU architectures, such as ARM, have instructions that load into
38 // multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
39 // could use this pass (with some modifications), but currently it implements
40 // its own pass to do something similar to what we do here.
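//
// As a purely illustrative sketch (hypothetical IR), the pass turns
//
//   %p1 = getelementptr float, float* %p, i64 1
//   %x0 = load float, float* %p, align 8
//   %x1 = load float, float* %p1, align 4
//
// into roughly
//
//   %vp = bitcast float* %p to <2 x float>*
//   %v  = load <2 x float>, <2 x float>* %vp, align 8
//   %x0 = extractelement <2 x float> %v, i32 0
//   %x1 = extractelement <2 x float> %v, i32 1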
41 
42 #include "llvm/ADT/APInt.h"
43 #include "llvm/ADT/ArrayRef.h"
44 #include "llvm/ADT/MapVector.h"
45 #include "llvm/ADT/PostOrderIterator.h"
46 #include "llvm/ADT/STLExtras.h"
47 #include "llvm/ADT/SmallPtrSet.h"
48 #include "llvm/ADT/SmallVector.h"
49 #include "llvm/ADT/Statistic.h"
50 #include "llvm/ADT/iterator_range.h"
51 #include "llvm/Analysis/AliasAnalysis.h"
52 #include "llvm/Analysis/MemoryLocation.h"
53 #include "llvm/Analysis/OrderedBasicBlock.h"
54 #include "llvm/Analysis/ScalarEvolution.h"
55 #include "llvm/Analysis/TargetTransformInfo.h"
56 #include "llvm/Analysis/Utils/Local.h"
57 #include "llvm/Analysis/ValueTracking.h"
58 #include "llvm/Analysis/VectorUtils.h"
59 #include "llvm/IR/Attributes.h"
60 #include "llvm/IR/BasicBlock.h"
61 #include "llvm/IR/Constants.h"
62 #include "llvm/IR/DataLayout.h"
63 #include "llvm/IR/DerivedTypes.h"
64 #include "llvm/IR/Dominators.h"
65 #include "llvm/IR/Function.h"
66 #include "llvm/IR/IRBuilder.h"
67 #include "llvm/IR/InstrTypes.h"
68 #include "llvm/IR/Instruction.h"
69 #include "llvm/IR/Instructions.h"
70 #include "llvm/IR/IntrinsicInst.h"
71 #include "llvm/IR/Module.h"
72 #include "llvm/IR/Type.h"
73 #include "llvm/IR/User.h"
74 #include "llvm/IR/Value.h"
75 #include "llvm/Pass.h"
76 #include "llvm/Support/Casting.h"
77 #include "llvm/Support/Debug.h"
78 #include "llvm/Support/KnownBits.h"
79 #include "llvm/Support/MathExtras.h"
80 #include "llvm/Support/raw_ostream.h"
81 #include "llvm/Transforms/Vectorize.h"
82 #include <algorithm>
83 #include <cassert>
84 #include <cstdlib>
85 #include <tuple>
86 #include <utility>
87 
88 using namespace llvm;
89 
90 #define DEBUG_TYPE "load-store-vectorizer"
91 
92 STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
93 STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
94 
95 // FIXME: Assuming stack alignment of 4 is always good enough
96 static const unsigned StackAdjustedAlignment = 4;
97 
98 namespace {
99 
100 using InstrList = SmallVector<Instruction *, 8>;
101 using InstrListMap = MapVector<Value *, InstrList>;
102 
103 class Vectorizer {
104   Function &F;
105   AliasAnalysis &AA;
106   DominatorTree &DT;
107   ScalarEvolution &SE;
108   TargetTransformInfo &TTI;
109   const DataLayout &DL;
110   IRBuilder<> Builder;
111 
112 public:
113   Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
114              ScalarEvolution &SE, TargetTransformInfo &TTI)
115       : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
116         DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
117 
118   bool run();
119 
120 private:
121   GetElementPtrInst *getSourceGEP(Value *Src) const;
122 
123   unsigned getPointerAddressSpace(Value *I);
124 
125   unsigned getAlignment(LoadInst *LI) const {
126     unsigned Align = LI->getAlignment();
127     if (Align != 0)
128       return Align;
129 
130     return DL.getABITypeAlignment(LI->getType());
131   }
132 
133   unsigned getAlignment(StoreInst *SI) const {
134     unsigned Align = SI->getAlignment();
135     if (Align != 0)
136       return Align;
137 
138     return DL.getABITypeAlignment(SI->getValueOperand()->getType());
139   }
140 
141   bool isConsecutiveAccess(Value *A, Value *B);
142 
143   /// After vectorization, reorder the instructions that I depends on
144   /// (the instructions defining its operands), to ensure they dominate I.
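  /// This is needed because the vectorized instruction is created at the
  /// position of the first chain element in program order, while e.g. the
  /// address computation it uses may originally appear further down the
  /// block.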
145   void reorder(Instruction *I);
146 
147   /// Returns the first and the last instructions in Chain.
148   std::pair<BasicBlock::iterator, BasicBlock::iterator>
149   getBoundaryInstrs(ArrayRef<Instruction *> Chain);
150 
151   /// Erases the original instructions after vectorizing.
152   void eraseInstructions(ArrayRef<Instruction *> Chain);
153 
154   /// "Legalize" the vector type that would be produced by combining \p
155   /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
156   /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
157   /// expected to have more than 4 elements.
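  ///
  /// For example, a chain of 7 x i32 (28 bytes) is split into pieces of 6 and
  /// 1 elements, and a chain of 3 x i16 (6 bytes) into pieces of 2 and 1.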
158   std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
159   splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
160 
161   /// Finds the largest prefix of Chain that's vectorizable, checking for
162   /// intervening instructions which may affect the memory accessed by the
163   /// instructions within Chain.
164   ///
165   /// The elements of \p Chain must be all loads or all stores and must be in
166   /// address order.
167   ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
168 
169   /// Collects load and store instructions to vectorize.
170   std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
171 
172   /// Processes the collected instructions, the \p Map. The values of \p Map
173   /// should be all loads or all stores.
174   bool vectorizeChains(InstrListMap &Map);
175 
  /// Finds loads and stores to consecutive memory addresses and vectorizes
  /// them.
177   bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
178 
179   /// Vectorizes the load instructions in Chain.
180   bool
181   vectorizeLoadChain(ArrayRef<Instruction *> Chain,
182                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
183 
184   /// Vectorizes the store instructions in Chain.
185   bool
186   vectorizeStoreChain(ArrayRef<Instruction *> Chain,
187                       SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
188 
  /// Checks whether this load/store access is misaligned.
190   bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
191                           unsigned Alignment);
192 };
193 
194 class LoadStoreVectorizer : public FunctionPass {
195 public:
196   static char ID;
197 
198   LoadStoreVectorizer() : FunctionPass(ID) {
199     initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
200   }
201 
202   bool runOnFunction(Function &F) override;
203 
204   StringRef getPassName() const override {
205     return "GPU Load and Store Vectorizer";
206   }
207 
208   void getAnalysisUsage(AnalysisUsage &AU) const override {
209     AU.addRequired<AAResultsWrapperPass>();
210     AU.addRequired<ScalarEvolutionWrapperPass>();
211     AU.addRequired<DominatorTreeWrapperPass>();
212     AU.addRequired<TargetTransformInfoWrapperPass>();
213     AU.setPreservesCFG();
214   }
215 };
216 
217 } // end anonymous namespace
218 
219 char LoadStoreVectorizer::ID = 0;
220 
INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
223 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
224 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
225 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
226 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
227 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
228 INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
229                     "Vectorize load and store instructions", false, false)
230 
231 Pass *llvm::createLoadStoreVectorizerPass() {
232   return new LoadStoreVectorizer();
233 }
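
// For testing in isolation, the legacy pass is registered under the
// DEBUG_TYPE string above, so it can be exercised with, for example:
//   opt -load-store-vectorizer -S < input.ll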
234 
235 // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
236 // vectors of Instructions.
237 static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
238   SmallVector<Value *, 8> VL(IL.begin(), IL.end());
239   propagateMetadata(I, VL);
240 }
241 
242 bool LoadStoreVectorizer::runOnFunction(Function &F) {
243   // Don't vectorize when the attribute NoImplicitFloat is used.
244   if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
245     return false;
246 
247   AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
248   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
249   ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
250   TargetTransformInfo &TTI =
251       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
252 
253   Vectorizer V(F, AA, DT, SE, TTI);
254   return V.run();
255 }
256 
257 // Vectorizer Implementation
258 bool Vectorizer::run() {
259   bool Changed = false;
260 
261   // Scan the blocks in the function in post order.
262   for (BasicBlock *BB : post_order(&F)) {
263     InstrListMap LoadRefs, StoreRefs;
264     std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
265     Changed |= vectorizeChains(LoadRefs);
266     Changed |= vectorizeChains(StoreRefs);
267   }
268 
269   return Changed;
270 }
271 
272 unsigned Vectorizer::getPointerAddressSpace(Value *I) {
273   if (LoadInst *L = dyn_cast<LoadInst>(I))
274     return L->getPointerAddressSpace();
275   if (StoreInst *S = dyn_cast<StoreInst>(I))
276     return S->getPointerAddressSpace();
277   return -1;
278 }
279 
280 GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
281   // First strip pointer bitcasts. Make sure pointee size is the same with
282   // and without casts.
283   // TODO: a stride set by the add instruction below can match the difference
284   // in pointee type size here. Currently it will not be vectorized.
285   Value *SrcPtr = getLoadStorePointerOperand(Src);
286   Value *SrcBase = SrcPtr->stripPointerCasts();
287   if (DL.getTypeStoreSize(SrcPtr->getType()->getPointerElementType()) ==
288       DL.getTypeStoreSize(SrcBase->getType()->getPointerElementType()))
289     SrcPtr = SrcBase;
290   return dyn_cast<GetElementPtrInst>(SrcPtr);
291 }
292 
293 // FIXME: Merge with llvm::isConsecutiveAccess
294 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
295   Value *PtrA = getLoadStorePointerOperand(A);
296   Value *PtrB = getLoadStorePointerOperand(B);
297   unsigned ASA = getPointerAddressSpace(A);
298   unsigned ASB = getPointerAddressSpace(B);
299 
300   // Check that the address spaces match and that the pointers are valid.
301   if (!PtrA || !PtrB || (ASA != ASB))
302     return false;
303 
  // Make sure that A and B are different pointers and that their pointee
  // types have the same store size.
305   unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
306   Type *PtrATy = PtrA->getType()->getPointerElementType();
307   Type *PtrBTy = PtrB->getType()->getPointerElementType();
308   if (PtrA == PtrB ||
309       PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
310       DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
311       DL.getTypeStoreSize(PtrATy->getScalarType()) !=
312           DL.getTypeStoreSize(PtrBTy->getScalarType()))
313     return false;
314 
315   APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
316 
317   unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
318   APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
319   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
320   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
321 
322   APInt OffsetDelta = OffsetB - OffsetA;
323 
324   // Check if they are based on the same pointer. That makes the offsets
325   // sufficient.
326   if (PtrA == PtrB)
327     return OffsetDelta == Size;
328 
329   // Compute the necessary base pointer delta to have the necessary final delta
330   // equal to the size.
331   APInt BaseDelta = Size - OffsetDelta;
332 
333   // Compute the distance with SCEV between the base pointers.
334   const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
335   const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
336   const SCEV *C = SE.getConstant(BaseDelta);
337   const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
338   if (X == PtrSCEVB)
339     return true;
340 
341   // Sometimes even this doesn't work, because SCEV can't always see through
342   // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
343   // things the hard way.
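  //
  // As an illustration (hypothetical IR), this fallback is aimed at pairs
  // such as:
  //
  //   %i1 = add nuw i32 %i0, 1
  //   %e0 = zext i32 %i0 to i64
  //   %e1 = zext i32 %i1 to i64
  //   %p0 = getelementptr float, float* %base, i64 %e0
  //   %p1 = getelementptr float, float* %base, i64 %e1
  //
  // When SCEV cannot relate %p0 and %p1 directly, we compare the GEPs operand
  // by operand and then prove that their last indices differ by exactly one
  // without overflowing.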
344 
345   // Look through GEPs after checking they're the same except for the last
346   // index.
347   GetElementPtrInst *GEPA = getSourceGEP(A);
348   GetElementPtrInst *GEPB = getSourceGEP(B);
349   if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
350     return false;
351   unsigned FinalIndex = GEPA->getNumOperands() - 1;
352   for (unsigned i = 0; i < FinalIndex; i++)
353     if (GEPA->getOperand(i) != GEPB->getOperand(i))
354       return false;
355 
356   Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
357   Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
358   if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
359       OpA->getType() != OpB->getType())
360     return false;
361 
362   // Only look through a ZExt/SExt.
363   if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
364     return false;
365 
366   bool Signed = isa<SExtInst>(OpA);
367 
368   OpA = dyn_cast<Instruction>(OpA->getOperand(0));
369   OpB = dyn_cast<Instruction>(OpB->getOperand(0));
370   if (!OpA || !OpB || OpA->getType() != OpB->getType())
371     return false;
372 
373   // Now we need to prove that adding 1 to OpA won't overflow.
374   bool Safe = false;
375   // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
376   // we're okay.
377   if (OpB->getOpcode() == Instruction::Add &&
378       isa<ConstantInt>(OpB->getOperand(1)) &&
379       cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
380     if (Signed)
381       Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
382     else
383       Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
384   }
385 
386   unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
387 
  // Second attempt:
  // If there is a known-zero bit in OpA below the sign bit (i.e. the longest
  // possible run of trailing one bits is shorter than BitWidth - 1), the
  // carry from adding 1 stops before the sign bit, so neither signed nor
  // unsigned overflow can occur.
391   if (!Safe) {
392     KnownBits Known(BitWidth);
393     computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
394     if (Known.countMaxTrailingOnes() < (BitWidth - 1))
395       Safe = true;
396   }
397 
398   if (!Safe)
399     return false;
400 
401   const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
402   const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
403   const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
404   const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
405   return X2 == OffsetSCEVB;
406 }
407 
408 void Vectorizer::reorder(Instruction *I) {
409   OrderedBasicBlock OBB(I->getParent());
410   SmallPtrSet<Instruction *, 16> InstructionsToMove;
411   SmallVector<Instruction *, 16> Worklist;
412 
413   Worklist.push_back(I);
414   while (!Worklist.empty()) {
415     Instruction *IW = Worklist.pop_back_val();
416     int NumOperands = IW->getNumOperands();
417     for (int i = 0; i < NumOperands; i++) {
418       Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
419       if (!IM || IM->getOpcode() == Instruction::PHI)
420         continue;
421 
422       // If IM is in another BB, no need to move it, because this pass only
423       // vectorizes instructions within one BB.
424       if (IM->getParent() != I->getParent())
425         continue;
426 
427       if (!OBB.dominates(IM, I)) {
428         InstructionsToMove.insert(IM);
429         Worklist.push_back(IM);
430       }
431     }
432   }
433 
434   // All instructions to move should follow I. Start from I, not from begin().
435   for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
436        ++BBI) {
437     if (!InstructionsToMove.count(&*BBI))
438       continue;
439     Instruction *IM = &*BBI;
440     --BBI;
441     IM->removeFromParent();
442     IM->insertBefore(I);
443   }
444 }
445 
446 std::pair<BasicBlock::iterator, BasicBlock::iterator>
447 Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
448   Instruction *C0 = Chain[0];
449   BasicBlock::iterator FirstInstr = C0->getIterator();
450   BasicBlock::iterator LastInstr = C0->getIterator();
451 
452   BasicBlock *BB = C0->getParent();
453   unsigned NumFound = 0;
454   for (Instruction &I : *BB) {
455     if (!is_contained(Chain, &I))
456       continue;
457 
458     ++NumFound;
459     if (NumFound == 1) {
460       FirstInstr = I.getIterator();
461     }
462     if (NumFound == Chain.size()) {
463       LastInstr = I.getIterator();
464       break;
465     }
466   }
467 
468   // Range is [first, last).
469   return std::make_pair(FirstInstr, ++LastInstr);
470 }
471 
472 void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
473   SmallVector<Instruction *, 16> Instrs;
474   for (Instruction *I : Chain) {
475     Value *PtrOperand = getLoadStorePointerOperand(I);
476     assert(PtrOperand && "Instruction must have a pointer operand.");
477     Instrs.push_back(I);
478     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
479       Instrs.push_back(GEP);
480   }
481 
482   // Erase instructions.
483   for (Instruction *I : Instrs)
484     if (I->use_empty())
485       I->eraseFromParent();
486 }
487 
488 std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
489 Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
490                                unsigned ElementSizeBits) {
491   unsigned ElementSizeBytes = ElementSizeBits / 8;
492   unsigned SizeBytes = ElementSizeBytes * Chain.size();
493   unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
494   if (NumLeft == Chain.size()) {
495     if ((NumLeft & 1) == 0)
496       NumLeft /= 2; // Split even in half
497     else
498       --NumLeft;    // Split off last element
499   } else if (NumLeft == 0)
500     NumLeft = 1;
501   return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
502 }
503 
504 ArrayRef<Instruction *>
505 Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
506   // These are in BB order, unlike Chain, which is in address order.
507   SmallVector<Instruction *, 16> MemoryInstrs;
508   SmallVector<Instruction *, 16> ChainInstrs;
509 
510   bool IsLoadChain = isa<LoadInst>(Chain[0]);
511   DEBUG({
512     for (Instruction *I : Chain) {
513       if (IsLoadChain)
514         assert(isa<LoadInst>(I) &&
515                "All elements of Chain must be loads, or all must be stores.");
516       else
517         assert(isa<StoreInst>(I) &&
518                "All elements of Chain must be loads, or all must be stores.");
519     }
520   });
521 
522   for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
523     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
524       if (!is_contained(Chain, &I))
525         MemoryInstrs.push_back(&I);
526       else
527         ChainInstrs.push_back(&I);
528     } else if (isa<IntrinsicInst>(&I) &&
529                cast<IntrinsicInst>(&I)->getIntrinsicID() ==
530                    Intrinsic::sideeffect) {
531       // Ignore llvm.sideeffect calls.
532     } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
533       DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
534       break;
535     } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
536       DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
537                    << '\n');
538       break;
539     }
540   }
541 
542   OrderedBasicBlock OBB(Chain[0]->getParent());
543 
544   // Loop until we find an instruction in ChainInstrs that we can't vectorize.
545   unsigned ChainInstrIdx = 0;
546   Instruction *BarrierMemoryInstr = nullptr;
547 
548   for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
549     Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
550 
551     // If a barrier memory instruction was found, chain instructions that follow
552     // will not be added to the valid prefix.
553     if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
554       break;
555 
556     // Check (in BB order) if any instruction prevents ChainInstr from being
557     // vectorized. Find and store the first such "conflicting" instruction.
558     for (Instruction *MemInstr : MemoryInstrs) {
559       // If a barrier memory instruction was found, do not check past it.
560       if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
561         break;
562 
563       if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
564         continue;
565 
566       // We can ignore the alias as long as the load comes before the store,
567       // because that means we won't be moving the load past the store to
568       // vectorize it (the vectorized load is inserted at the location of the
569       // first load in the chain).
570       if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
571           OBB.dominates(ChainInstr, MemInstr))
572         continue;
573 
574       // Same case, but in reverse.
575       if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
576           OBB.dominates(MemInstr, ChainInstr))
577         continue;
578 
579       if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
580                         MemoryLocation::get(ChainInstr))) {
581         DEBUG({
582           dbgs() << "LSV: Found alias:\n"
583                     "  Aliasing instruction and pointer:\n"
584                  << "  " << *MemInstr << '\n'
585                  << "  " << *getLoadStorePointerOperand(MemInstr) << '\n'
586                  << "  Aliased instruction and pointer:\n"
587                  << "  " << *ChainInstr << '\n'
588                  << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
589         });
590         // Save this aliasing memory instruction as a barrier, but allow other
591         // instructions that precede the barrier to be vectorized with this one.
592         BarrierMemoryInstr = MemInstr;
593         break;
594       }
595     }
596     // Continue the search only for store chains, since vectorizing stores that
597     // precede an aliasing load is valid. Conversely, vectorizing loads is valid
598     // up to an aliasing store, but should not pull loads from further down in
599     // the basic block.
600     if (IsLoadChain && BarrierMemoryInstr) {
601       // The BarrierMemoryInstr is a store that precedes ChainInstr.
602       assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
603       break;
604     }
605   }
606 
607   // Find the largest prefix of Chain whose elements are all in
608   // ChainInstrs[0, ChainInstrIdx).  This is the largest vectorizable prefix of
609   // Chain.  (Recall that Chain is in address order, but ChainInstrs is in BB
610   // order.)
611   SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
612       ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
613   unsigned ChainIdx = 0;
614   for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
615     if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
616       break;
617   }
618   return Chain.slice(0, ChainIdx);
619 }
620 
621 std::pair<InstrListMap, InstrListMap>
622 Vectorizer::collectInstructions(BasicBlock *BB) {
623   InstrListMap LoadRefs;
624   InstrListMap StoreRefs;
625 
626   for (Instruction &I : *BB) {
627     if (!I.mayReadOrWriteMemory())
628       continue;
629 
630     if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
631       if (!LI->isSimple())
632         continue;
633 
634       // Skip if it's not legal.
635       if (!TTI.isLegalToVectorizeLoad(LI))
636         continue;
637 
638       Type *Ty = LI->getType();
639       if (!VectorType::isValidElementType(Ty->getScalarType()))
640         continue;
641 
642       // Skip weird non-byte sizes. They probably aren't worth the effort of
643       // handling correctly.
644       unsigned TySize = DL.getTypeSizeInBits(Ty);
645       if ((TySize % 8) != 0)
646         continue;
647 
      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions currently use an integer type for the vectorized load/store
      // and do not support casting between the integer type and a vector of
      // pointers (e.g. i64 to <2 x i16*>).
652       if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
653         continue;
654 
655       Value *Ptr = LI->getPointerOperand();
656       unsigned AS = Ptr->getType()->getPointerAddressSpace();
657       unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
658 
659       unsigned VF = VecRegSize / TySize;
660       VectorType *VecTy = dyn_cast<VectorType>(Ty);
661 
662       // No point in looking at these if they're too big to vectorize.
663       if (TySize > VecRegSize / 2 ||
664           (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
665         continue;
666 
667       // Make sure all the users of a vector are constant-index extracts.
668       if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
669             const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
670             return EEI && isa<ConstantInt>(EEI->getOperand(1));
671           }))
672         continue;
673 
674       // Save the load locations.
675       Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
676       LoadRefs[ObjPtr].push_back(LI);
677     } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
678       if (!SI->isSimple())
679         continue;
680 
681       // Skip if it's not legal.
682       if (!TTI.isLegalToVectorizeStore(SI))
683         continue;
684 
685       Type *Ty = SI->getValueOperand()->getType();
686       if (!VectorType::isValidElementType(Ty->getScalarType()))
687         continue;
688 
      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions currently use an integer type for the vectorized load/store
      // and do not support casting between the integer type and a vector of
      // pointers (e.g. i64 to <2 x i16*>).
693       if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
694         continue;
695 
696       // Skip weird non-byte sizes. They probably aren't worth the effort of
697       // handling correctly.
698       unsigned TySize = DL.getTypeSizeInBits(Ty);
699       if ((TySize % 8) != 0)
700         continue;
701 
702       Value *Ptr = SI->getPointerOperand();
703       unsigned AS = Ptr->getType()->getPointerAddressSpace();
704       unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
705 
706       unsigned VF = VecRegSize / TySize;
707       VectorType *VecTy = dyn_cast<VectorType>(Ty);
708 
709       // No point in looking at these if they're too big to vectorize.
710       if (TySize > VecRegSize / 2 ||
711           (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
712         continue;
713 
714       if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
715             const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
716             return EEI && isa<ConstantInt>(EEI->getOperand(1));
717           }))
718         continue;
719 
720       // Save store location.
721       Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
722       StoreRefs[ObjPtr].push_back(SI);
723     }
724   }
725 
726   return {LoadRefs, StoreRefs};
727 }
728 
729 bool Vectorizer::vectorizeChains(InstrListMap &Map) {
730   bool Changed = false;
731 
732   for (const std::pair<Value *, InstrList> &Chain : Map) {
733     unsigned Size = Chain.second.size();
734     if (Size < 2)
735       continue;
736 
737     DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
738 
    // Process the chain in chunks of at most 64 instructions.
740     for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
741       unsigned Len = std::min<unsigned>(CE - CI, 64);
742       ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
743       Changed |= vectorizeInstructions(Chunk);
744     }
745   }
746 
747   return Changed;
748 }
749 
750 bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
751   DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
752   SmallVector<int, 16> Heads, Tails;
753   int ConsecutiveChain[64];
754 
755   // Do a quadratic search on all of the given loads/stores and find all of the
756   // pairs of loads/stores that follow each other.
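  // ConsecutiveChain[i] == j records that Instrs[j] accesses the address
  // immediately after Instrs[i]; Heads and Tails remember those (i, j) pairs
  // so that, below, each chain is walked starting from a head that is not
  // itself the tail of a longer unprocessed chain.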
757   for (int i = 0, e = Instrs.size(); i < e; ++i) {
758     ConsecutiveChain[i] = -1;
759     for (int j = e - 1; j >= 0; --j) {
760       if (i == j)
761         continue;
762 
763       if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
764         if (ConsecutiveChain[i] != -1) {
765           int CurDistance = std::abs(ConsecutiveChain[i] - i);
766           int NewDistance = std::abs(ConsecutiveChain[i] - j);
767           if (j < i || NewDistance > CurDistance)
768             continue; // Should not insert.
769         }
770 
771         Tails.push_back(j);
772         Heads.push_back(i);
773         ConsecutiveChain[i] = j;
774       }
775     }
776   }
777 
778   bool Changed = false;
779   SmallPtrSet<Instruction *, 16> InstructionsProcessed;
780 
781   for (int Head : Heads) {
782     if (InstructionsProcessed.count(Instrs[Head]))
783       continue;
784     bool LongerChainExists = false;
785     for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
786       if (Head == Tails[TIt] &&
787           !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
788         LongerChainExists = true;
789         break;
790       }
791     if (LongerChainExists)
792       continue;
793 
794     // We found an instr that starts a chain. Now follow the chain and try to
795     // vectorize it.
796     SmallVector<Instruction *, 16> Operands;
797     int I = Head;
798     while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
799       if (InstructionsProcessed.count(Instrs[I]))
800         break;
801 
802       Operands.push_back(Instrs[I]);
803       I = ConsecutiveChain[I];
804     }
805 
806     bool Vectorized = false;
807     if (isa<LoadInst>(*Operands.begin()))
808       Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
809     else
810       Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
811 
812     Changed |= Vectorized;
813   }
814 
815   return Changed;
816 }
817 
818 bool Vectorizer::vectorizeStoreChain(
819     ArrayRef<Instruction *> Chain,
820     SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
821   StoreInst *S0 = cast<StoreInst>(Chain[0]);
822 
823   // If the vector has an int element, default to int for the whole store.
824   Type *StoreTy;
825   for (Instruction *I : Chain) {
826     StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
827     if (StoreTy->isIntOrIntVectorTy())
828       break;
829 
830     if (StoreTy->isPtrOrPtrVectorTy()) {
831       StoreTy = Type::getIntNTy(F.getParent()->getContext(),
832                                 DL.getTypeSizeInBits(StoreTy));
833       break;
834     }
835   }
836 
837   unsigned Sz = DL.getTypeSizeInBits(StoreTy);
838   unsigned AS = S0->getPointerAddressSpace();
839   unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
840   unsigned VF = VecRegSize / Sz;
841   unsigned ChainSize = Chain.size();
842   unsigned Alignment = getAlignment(S0);
843 
844   if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
845     InstructionsProcessed->insert(Chain.begin(), Chain.end());
846     return false;
847   }
848 
849   ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
850   if (NewChain.empty()) {
851     // No vectorization possible.
852     InstructionsProcessed->insert(Chain.begin(), Chain.end());
853     return false;
854   }
855   if (NewChain.size() == 1) {
856     // Failed after the first instruction. Discard it and try the smaller chain.
857     InstructionsProcessed->insert(NewChain.front());
858     return false;
859   }
860 
861   // Update Chain to the valid vectorizable subchain.
862   Chain = NewChain;
863   ChainSize = Chain.size();
864 
865   // Check if it's legal to vectorize this chain. If not, split the chain and
866   // try again.
867   unsigned EltSzInBytes = Sz / 8;
868   unsigned SzInBytes = EltSzInBytes * ChainSize;
869   if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
870     auto Chains = splitOddVectorElts(Chain, Sz);
871     return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
872            vectorizeStoreChain(Chains.second, InstructionsProcessed);
873   }
874 
875   VectorType *VecTy;
876   VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
877   if (VecStoreTy)
878     VecTy = VectorType::get(StoreTy->getScalarType(),
879                             Chain.size() * VecStoreTy->getNumElements());
880   else
881     VecTy = VectorType::get(StoreTy, Chain.size());
882 
883   // If it's more than the max vector size or the target has a better
884   // vector factor, break it into two pieces.
885   unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
886   if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
887     DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
888                     " Creating two separate arrays.\n");
889     return vectorizeStoreChain(Chain.slice(0, TargetVF),
890                                InstructionsProcessed) |
891            vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
892   }
893 
894   DEBUG({
895     dbgs() << "LSV: Stores to vectorize:\n";
896     for (Instruction *I : Chain)
897       dbgs() << "  " << *I << "\n";
898   });
899 
900   // We won't try again to vectorize the elements of the chain, regardless of
901   // whether we succeed below.
902   InstructionsProcessed->insert(Chain.begin(), Chain.end());
903 
904   // If the store is going to be misaligned, don't vectorize it.
905   if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
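    // The only case we try to salvage is a generic (address space 0) pointer,
    // presumably to a stack object whose known alignment we may be able to
    // raise below.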
906     if (S0->getPointerAddressSpace() != 0)
907       return false;
908 
909     unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
910                                                    StackAdjustedAlignment,
911                                                    DL, S0, nullptr, &DT);
912     if (NewAlign < StackAdjustedAlignment)
913       return false;
914   }
915 
916   BasicBlock::iterator First, Last;
917   std::tie(First, Last) = getBoundaryInstrs(Chain);
918   Builder.SetInsertPoint(&*Last);
919 
920   Value *Vec = UndefValue::get(VecTy);
921 
922   if (VecStoreTy) {
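    // The stored values are themselves vectors; concatenate them lane by
    // lane. For example (hypothetically), two stores of <2 x i16> become one
    // store of <4 x i16>, with lane J of chain element I ending up at index
    // I * VecWidth + J.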
923     unsigned VecWidth = VecStoreTy->getNumElements();
924     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
925       StoreInst *Store = cast<StoreInst>(Chain[I]);
926       for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
927         unsigned NewIdx = J + I * VecWidth;
928         Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
929                                                       Builder.getInt32(J));
930         if (Extract->getType() != StoreTy->getScalarType())
931           Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
932 
933         Value *Insert =
934             Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
935         Vec = Insert;
936       }
937     }
938   } else {
939     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
940       StoreInst *Store = cast<StoreInst>(Chain[I]);
941       Value *Extract = Store->getValueOperand();
942       if (Extract->getType() != StoreTy->getScalarType())
943         Extract =
944             Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
945 
946       Value *Insert =
947           Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
948       Vec = Insert;
949     }
950   }
951 
952   // This cast is safe because Builder.CreateStore() always creates a bona fide
953   // StoreInst.
954   StoreInst *SI = cast<StoreInst>(
955       Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
956                                                      VecTy->getPointerTo(AS))));
957   propagateMetadata(SI, Chain);
958   SI->setAlignment(Alignment);
959 
960   eraseInstructions(Chain);
961   ++NumVectorInstructions;
962   NumScalarsVectorized += Chain.size();
963   return true;
964 }
965 
966 bool Vectorizer::vectorizeLoadChain(
967     ArrayRef<Instruction *> Chain,
968     SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
969   LoadInst *L0 = cast<LoadInst>(Chain[0]);
970 
971   // If the vector has an int element, default to int for the whole load.
972   Type *LoadTy;
973   for (const auto &V : Chain) {
974     LoadTy = cast<LoadInst>(V)->getType();
975     if (LoadTy->isIntOrIntVectorTy())
976       break;
977 
978     if (LoadTy->isPtrOrPtrVectorTy()) {
979       LoadTy = Type::getIntNTy(F.getParent()->getContext(),
980                                DL.getTypeSizeInBits(LoadTy));
981       break;
982     }
983   }
984 
985   unsigned Sz = DL.getTypeSizeInBits(LoadTy);
986   unsigned AS = L0->getPointerAddressSpace();
987   unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
988   unsigned VF = VecRegSize / Sz;
989   unsigned ChainSize = Chain.size();
990   unsigned Alignment = getAlignment(L0);
991 
992   if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
993     InstructionsProcessed->insert(Chain.begin(), Chain.end());
994     return false;
995   }
996 
997   ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
998   if (NewChain.empty()) {
999     // No vectorization possible.
1000     InstructionsProcessed->insert(Chain.begin(), Chain.end());
1001     return false;
1002   }
1003   if (NewChain.size() == 1) {
1004     // Failed after the first instruction. Discard it and try the smaller chain.
1005     InstructionsProcessed->insert(NewChain.front());
1006     return false;
1007   }
1008 
1009   // Update Chain to the valid vectorizable subchain.
1010   Chain = NewChain;
1011   ChainSize = Chain.size();
1012 
1013   // Check if it's legal to vectorize this chain. If not, split the chain and
1014   // try again.
1015   unsigned EltSzInBytes = Sz / 8;
1016   unsigned SzInBytes = EltSzInBytes * ChainSize;
1017   if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1018     auto Chains = splitOddVectorElts(Chain, Sz);
1019     return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1020            vectorizeLoadChain(Chains.second, InstructionsProcessed);
1021   }
1022 
1023   VectorType *VecTy;
1024   VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
1025   if (VecLoadTy)
1026     VecTy = VectorType::get(LoadTy->getScalarType(),
1027                             Chain.size() * VecLoadTy->getNumElements());
1028   else
1029     VecTy = VectorType::get(LoadTy, Chain.size());
1030 
1031   // If it's more than the max vector size or the target has a better
1032   // vector factor, break it into two pieces.
1033   unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1034   if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1035     DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
1036                     " Creating two separate arrays.\n");
1037     return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
1038            vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
1039   }
1040 
1041   // We won't try again to vectorize the elements of the chain, regardless of
1042   // whether we succeed below.
1043   InstructionsProcessed->insert(Chain.begin(), Chain.end());
1044 
1045   // If the load is going to be misaligned, don't vectorize it.
1046   if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1047     if (L0->getPointerAddressSpace() != 0)
1048       return false;
1049 
1050     unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
1051                                                    StackAdjustedAlignment,
1052                                                    DL, L0, nullptr, &DT);
1053     if (NewAlign < StackAdjustedAlignment)
1054       return false;
1055 
1056     Alignment = NewAlign;
1057   }
1058 
1059   DEBUG({
1060     dbgs() << "LSV: Loads to vectorize:\n";
1061     for (Instruction *I : Chain)
1062       I->dump();
1063   });
1064 
1065   // getVectorizablePrefix already computed getBoundaryInstrs.  The value of
1066   // Last may have changed since then, but the value of First won't have.  If it
1067   // matters, we could compute getBoundaryInstrs only once and reuse it here.
1068   BasicBlock::iterator First, Last;
1069   std::tie(First, Last) = getBoundaryInstrs(Chain);
1070   Builder.SetInsertPoint(&*First);
1071 
1072   Value *Bitcast =
1073       Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1074   // This cast is safe because Builder.CreateLoad always creates a bona fide
1075   // LoadInst.
1076   LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
1077   propagateMetadata(LI, Chain);
1078   LI->setAlignment(Alignment);
1079 
1080   if (VecLoadTy) {
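    // The loaded values are themselves vectors, so each user's extractelement
    // is redirected into the wide load: an extract of lane Idx from chain
    // element I becomes an extract of lane I * VecWidth + Idx of the new
    // load.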
1081     SmallVector<Instruction *, 16> InstrsToErase;
1082 
1083     unsigned VecWidth = VecLoadTy->getNumElements();
1084     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1085       for (auto Use : Chain[I]->users()) {
1086         // All users of vector loads are ExtractElement instructions with
1087         // constant indices, otherwise we would have bailed before now.
1088         Instruction *UI = cast<Instruction>(Use);
1089         unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
1090         unsigned NewIdx = Idx + I * VecWidth;
1091         Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
1092                                                 UI->getName());
1093         if (V->getType() != UI->getType())
1094           V = Builder.CreateBitCast(V, UI->getType());
1095 
1096         // Replace the old instruction.
1097         UI->replaceAllUsesWith(V);
1098         InstrsToErase.push_back(UI);
1099       }
1100     }
1101 
    // Bitcast might not be an Instruction if the pointer operand is a
    // constant, in which case there is nothing to reorder.
1104     if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1105       reorder(BitcastInst);
1106 
1107     for (auto I : InstrsToErase)
1108       I->eraseFromParent();
1109   } else {
1110     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1111       Value *CV = Chain[I];
1112       Value *V =
1113           Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1114       if (V->getType() != CV->getType()) {
1115         V = Builder.CreateBitOrPointerCast(V, CV->getType());
1116       }
1117 
1118       // Replace the old instruction.
1119       CV->replaceAllUsesWith(V);
1120     }
1121 
1122     if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1123       reorder(BitcastInst);
1124   }
1125 
1126   eraseInstructions(Chain);
1127 
1128   ++NumVectorInstructions;
1129   NumScalarsVectorized += Chain.size();
1130   return true;
1131 }
1132 
1133 bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
1134                                     unsigned Alignment) {
1135   if (Alignment % SzInBytes == 0)
1136     return false;
1137 
1138   bool Fast = false;
1139   bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1140                                                    SzInBytes * 8, AddressSpace,
1141                                                    Alignment, &Fast);
  DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
               << " and fast? " << Fast << "\n");
1144   return !Allows || !Fast;
1145 }
1146