//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores.  Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited.  So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures.  It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers.  Instead, a
// "vector load" loads directly into a series of scalar registers.  In effect,
// extracting the elements of the vector is free.  It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code.  This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load.  In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
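//
// As a rough illustration (the IR below is a made-up example, not taken from a
// real test), a pair of adjacent scalar loads such as
//
//   %a = load i32, i32* %ptr
//   %gep = getelementptr i32, i32* %ptr, i64 1
//   %b = load i32, i32* %gep
//
// is rewritten into something like
//
//   %cast = bitcast i32* %ptr to <2 x i32>*
//   %vec = load <2 x i32>, <2 x i32>* %cast
//   %a = extractelement <2 x i32> %vec, i32 0
//   %b = extractelement <2 x i32> %vec, i32 1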

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;

namespace {

using InstrList = SmallVector<Instruction *, 8>;
using InstrListMap = MapVector<Value *, InstrList>;

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
             ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
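  /// Returns the pointer operand of \p I if it is a load or store, and
  /// nullptr otherwise.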
  Value *getPointerOperand(Value *I) const;

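  /// Returns the pointer operand of the load/store \p Src as a GEP, looking
  /// through pointer casts that preserve the pointee's store size; returns
  /// nullptr if the pointer does not come from a GEP.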
  GetElementPtrInst *getSourceGEP(Value *Src) const;

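  /// Returns the address space of the load/store \p I, or -1 (wrapped to
  /// unsigned) if \p I is neither a load nor a store.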
  unsigned getPointerAddressSpace(Value *I);

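  /// Returns the explicit alignment of \p LI, or the ABI alignment of the
  /// loaded type if none is specified.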
  unsigned getAlignment(LoadInst *LI) const {
    unsigned Align = LI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(LI->getType());
  }

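  /// Returns the explicit alignment of \p SI, or the ABI alignment of the
  /// stored type if none is specified.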
  unsigned getAlignment(StoreInst *SI) const {
    unsigned Align = SI->getAlignment();
    if (Align != 0)
      return Align;

    return DL.getABITypeAlignment(SI->getValueOperand()->getType());
  }

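  /// Returns true if \p B accesses the memory location immediately following
  /// the one accessed by \p A.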
  bool isConsecutiveAccess(Value *A, Value *B);

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions in \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds loads/stores to consecutive memory addresses and vectorizes them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Checks whether this load/store access is misaligned.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          unsigned Alignment);
};

class LoadStoreVectorizer : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizer() : FunctionPass(ID) {
    initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

char LoadStoreVectorizer::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizer();
}

// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}

bool LoadStoreVectorizer::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  Vectorizer V(F, AA, DT, SE, TTI);
  return V.run();
}

// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}

Value *Vectorizer::getPointerOperand(Value *I) const {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->getPointerOperand();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->getPointerOperand();
  return nullptr;
}

unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}

GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
  // First strip pointer bitcasts. Make sure pointee size is the same with
  // and without casts.
  // TODO: a stride set by the add instruction below can match the difference
  // in pointee type size here. Currently it will not be vectorized.
  Value *SrcPtr = getPointerOperand(Src);
  Value *SrcBase = SrcPtr->stripPointerCasts();
  if (DL.getTypeStoreSize(SrcPtr->getType()->getPointerElementType()) ==
      DL.getTypeStoreSize(SrcBase->getType()->getPointerElementType()))
    SrcPtr = SrcBase;
  return dyn_cast<GetElementPtrInst>(SrcPtr);
}

// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getPointerOperand(A);
  Value *PtrB = getPointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  Type *PtrATy = PtrA->getType()->getPointerElementType();
  Type *PtrBTy = PtrB->getType()->getPointerElementType();
  if (PtrA == PtrB ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == Size;

  // Compute the base pointer delta needed to make the final delta equal to
  // the size.
  APInt BaseDelta = Size - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.

  // Look through GEPs after checking they're the same except for the last
  // index.
  GetElementPtrInst *GEPA = getSourceGEP(A);
  GetElementPtrInst *GEPB = getSourceGEP(B);
  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
    return false;
  unsigned FinalIndex = GEPA->getNumOperands() - 1;
  for (unsigned i = 0; i < FinalIndex; i++)
    if (GEPA->getOperand(i) != GEPB->getOperand(i))
      return false;

  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpA || !OpB || OpA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding 1 to OpA won't overflow.
  bool Safe = false;
  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
  // we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
    if (Signed)
      Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
    else
      Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
  }

  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();

  // Second attempt:
  // If any bits are known to be zero other than the sign bit in OpA, we can
  // add 1 to it while guaranteeing no overflow of any sort.
  if (!Safe) {
    KnownBits Known(BitWidth);
    computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
    if (Known.countMaxTrailingOnes() < (BitWidth - 1))
      Safe = true;
  }

  if (!Safe)
    return false;

  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
  return X2 == OffsetSCEVB;
}

void Vectorizer::reorder(Instruction *I) {
  OrderedBasicBlock OBB(I->getParent());
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!OBB.dominates(IM, I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}

std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1) {
      FirstInstr = I.getIterator();
    }
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
  return std::make_pair(FirstInstr, ++LastInstr);
}

void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getPointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}

std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
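  // Keep the longest prefix whose total size is a multiple of 4 bytes as the
  // first piece; in the degenerate cases (the whole chain already qualifies,
  // or no prefix does) fall back to an even split or to peeling off a single
  // element.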
  unsigned ElementSizeBytes = ElementSizeBits / 8;
  unsigned SizeBytes = ElementSizeBytes * Chain.size();
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == Chain.size()) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2; // Split even in half
    else
      --NumLeft;    // Split off last element
  } else if (NumLeft == 0)
    NumLeft = 1;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}

ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
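  // First pass over the region spanned by Chain: collect the chain members
  // themselves (ChainInstrs) and the other memory operations that might
  // conflict with them (MemoryInstrs).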
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (!is_contained(Chain, &I))
        MemoryInstrs.push_back(&I);
      else
        ChainInstrs.push_back(&I);
    } else if (isa<IntrinsicInst>(&I) &&
               cast<IntrinsicInst>(&I)->getIntrinsicID() ==
                   Intrinsic::sideeffect) {
      // Ignore llvm.sideeffect calls.
    } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
      DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I << '\n');
      break;
    } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
      DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                   << '\n');
      break;
    }
  }

  OrderedBasicBlock OBB(Chain[0]->getParent());

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that follow
    // will not be added to the valid prefix.
    if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
        break;

      if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
        continue;

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
      if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
          OBB.dominates(ChainInstr, MemInstr))
        continue;

      // Same case, but in reverse.
      if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
          OBB.dominates(MemInstr, ChainInstr))
        continue;

      if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
                        MemoryLocation::get(ChainInstr))) {
        DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    "  Aliasing instruction and pointer:\n"
                 << "  " << *MemInstr << '\n'
                 << "  " << *getPointerOperand(MemInstr) << '\n'
                 << "  Aliased instruction and pointer:\n"
                 << "  " << *ChainInstr << '\n'
                 << "  " << *getPointerOperand(ChainInstr) << '\n';
        });
        // Save this aliasing memory instruction as a barrier, but allow other
        // instructions that precede the barrier to be vectorized with this one.
        BarrierMemoryInstr = MemInstr;
        break;
      }
    }
    // Continue the search only for store chains, since vectorizing stores that
    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
    // up to an aliasing store, but should not pull loads from further down in
    // the basic block.
    if (IsLoadChain && BarrierMemoryInstr) {
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
      assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
      break;
    }
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx).  This is the largest vectorizable prefix of
  // Chain.  (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}

std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      if (!LI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeLoad(LI))
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions currently use an integer type for the vectorized load/store,
      // and do not support casting between the integer type and a vector of
      // pointers (e.g. i64 to <2 x i16*>).
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2)
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save the load locations.
      Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
      LoadRefs[ObjPtr].push_back(LI);
    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeStore(SI))
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions currently use an integer type for the vectorized load/store,
      // and do not support casting between the integer type and a vector of
      // pointers (e.g. i64 to <2 x i16*>).
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2)
        continue;

      if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save store location.
      Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
      StoreRefs[ObjPtr].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}

bool Vectorizer::vectorizeChains(InstrListMap &Map) {
  bool Changed = false;

  for (const std::pair<Value *, InstrList> &Chain : Map) {
    unsigned Size = Chain.second.size();
    if (Size < 2)
      continue;

    DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");

    // Process the instructions in chunks of 64.
    for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
      unsigned Len = std::min<unsigned>(CE - CI, 64);
      ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
      Changed |= vectorizeInstructions(Chunk);
    }
  }

  return Changed;
}

bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n");
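  // ConsecutiveChain[i] == j means that Instrs[j] accesses the memory
  // immediately after Instrs[i]; Heads and Tails record the i/j ends of each
  // such pair.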
  SmallVector<int, 16> Heads, Tails;
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given loads/stores and find all of the
  // pairs of loads/stores that follow each other.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.push_back(j);
        Heads.push_back(i);
        ConsecutiveChain[i] = j;
      }
    }
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
    bool LongerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        LongerChainExists = true;
        break;
      }
    if (LongerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Instruction *, 16> Operands;
    int I = Head;
    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}

bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  Type *StoreTy;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(S0);

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  VectorType *VecTy;
  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = VectorType::get(StoreTy->getScalarType(),
                            Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = VectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                    " Creating two separate arrays.\n");
    return vectorizeStoreChain(Chain.slice(0, TargetVF),
                               InstructionsProcessed) |
           vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the store is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (S0->getPointerAddressSpace() != 0)
      return false;

    unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                   StackAdjustedAlignment,
                                                   DL, S0, nullptr, &DT);
    if (NewAlign < StackAdjustedAlignment)
      return false;
  }

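  // The vectorized store is inserted at the position of the last store in the
  // chain, so all of the values being stored are already available there.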
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*Last);

  Value *Vec = UndefValue::get(VecTy);

  if (VecStoreTy) {
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Value *Insert =
            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
        Vec = Insert;
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      if (Extract->getType() != StoreTy->getScalarType())
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Value *Insert =
          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
      Vec = Insert;
    }
  }

  // This cast is safe because Builder.CreateStore() always creates a bona fide
  // StoreInst.
  StoreInst *SI = cast<StoreInst>(
      Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
                                                     VecTy->getPointerTo(AS))));
  propagateMetadata(SI, Chain);
  SI->setAlignment(Alignment);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::vectorizeLoadChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  LoadInst *L0 = cast<LoadInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole load.
  Type *LoadTy;
  for (const auto &V : Chain) {
    LoadTy = cast<LoadInst>(V)->getType();
    if (LoadTy->isIntOrIntVectorTy())
      break;

    if (LoadTy->isPtrOrPtrVectorTy()) {
      LoadTy = Type::getIntNTy(F.getParent()->getContext(),
                               DL.getTypeSizeInBits(LoadTy));
      break;
    }
  }

  unsigned Sz = DL.getTypeSizeInBits(LoadTy);
  unsigned AS = L0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(L0);

  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;
  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
           vectorizeLoadChain(Chains.second, InstructionsProcessed);
  }

  VectorType *VecTy;
  VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
  if (VecLoadTy)
    VecTy = VectorType::get(LoadTy->getScalarType(),
                            Chain.size() * VecLoadTy->getNumElements());
  else
    VecTy = VectorType::get(LoadTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                    " Creating two separate arrays.\n");
    return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
           vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the load is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (L0->getPointerAddressSpace() != 0)
      return false;

    unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
                                                   StackAdjustedAlignment,
                                                   DL, L0, nullptr, &DT);
    if (NewAlign < StackAdjustedAlignment)
      return false;

    Alignment = NewAlign;
  }

  DEBUG({
    dbgs() << "LSV: Loads to vectorize:\n";
    for (Instruction *I : Chain)
      I->dump();
  });

  // getVectorizablePrefix already computed getBoundaryInstrs.  The value of
  // Last may have changed since then, but the value of First won't have.  If it
  // matters, we could compute getBoundaryInstrs only once and reuse it here.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*First);

  Value *Bitcast =
      Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
  // This cast is safe because Builder.CreateLoad always creates a bona fide
  // LoadInst.
  LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
  propagateMetadata(LI, Chain);
  LI->setAlignment(Alignment);

  if (VecLoadTy) {
    SmallVector<Instruction *, 16> InstrsToErase;

    unsigned VecWidth = VecLoadTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      for (auto Use : Chain[I]->users()) {
        // All users of vector loads are ExtractElement instructions with
        // constant indices, otherwise we would have bailed before now.
        Instruction *UI = cast<Instruction>(Use);
        unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
        unsigned NewIdx = Idx + I * VecWidth;
        Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
                                                UI->getName());
        if (V->getType() != UI->getType())
          V = Builder.CreateBitCast(V, UI->getType());

        // Replace the old instruction.
        UI->replaceAllUsesWith(V);
        InstrsToErase.push_back(UI);
      }
    }

    // Bitcast might not be an Instruction, if the value being loaded is a
    // constant.  In that case, no need to reorder anything.
    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);

    for (auto I : InstrsToErase)
      I->eraseFromParent();
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      Value *CV = Chain[I];
      Value *V =
          Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
      if (V->getType() != CV->getType()) {
        V = Builder.CreateBitOrPointerCast(V, CV->getType());
      }

      // Replace the old instruction.
      CV->replaceAllUsesWith(V);
    }

    if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
      reorder(BitcastInst);
  }

  eraseInstructions(Chain);

  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}

bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                                    unsigned Alignment) {
  if (Alignment % SzInBytes == 0)
    return false;

  bool Fast = false;
  bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
                                                   SzInBytes * 8, AddressSpace,
                                                   Alignment, &Fast);
  DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
               << " and fast? " << Fast << "\n";);
  return !Allows || !Fast;
}