12cab237bSDimitry Andric //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
23ca95b02SDimitry Andric //
33ca95b02SDimitry Andric // The LLVM Compiler Infrastructure
43ca95b02SDimitry Andric //
53ca95b02SDimitry Andric // This file is distributed under the University of Illinois Open Source
63ca95b02SDimitry Andric // License. See LICENSE.TXT for details.
73ca95b02SDimitry Andric //
83ca95b02SDimitry Andric //===----------------------------------------------------------------------===//
94ba319b5SDimitry Andric //
104ba319b5SDimitry Andric // This pass merges loads/stores to/from sequential memory addresses into vector
114ba319b5SDimitry Andric // loads/stores. Although there's nothing GPU-specific in here, this pass is
124ba319b5SDimitry Andric // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
134ba319b5SDimitry Andric //
144ba319b5SDimitry Andric // (For simplicity below we talk about loads only, but everything also applies
154ba319b5SDimitry Andric // to stores.)
164ba319b5SDimitry Andric //
174ba319b5SDimitry Andric // This pass is intended to be run late in the pipeline, after other
184ba319b5SDimitry Andric // vectorization opportunities have been exploited. So the assumption here is
194ba319b5SDimitry Andric // that immediately following our new vector load we'll need to extract out the
204ba319b5SDimitry Andric // individual elements of the load, so we can operate on them individually.
214ba319b5SDimitry Andric //
224ba319b5SDimitry Andric // On CPUs this transformation is usually not beneficial, because extracting the
234ba319b5SDimitry Andric // elements of a vector register is expensive on most architectures. It's
244ba319b5SDimitry Andric // usually better just to load each element individually into its own scalar
254ba319b5SDimitry Andric // register.
264ba319b5SDimitry Andric //
274ba319b5SDimitry Andric // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
284ba319b5SDimitry Andric // "vector load" loads directly into a series of scalar registers. In effect,
294ba319b5SDimitry Andric // extracting the elements of the vector is free. It's therefore always
304ba319b5SDimitry Andric // beneficial to vectorize a sequence of loads on these architectures.
314ba319b5SDimitry Andric //
324ba319b5SDimitry Andric // Vectorizing (perhaps a better name might be "coalescing") loads can have
334ba319b5SDimitry Andric // large performance impacts on GPU kernels, and opportunities for vectorizing
344ba319b5SDimitry Andric // are common in GPU code. This pass tries very hard to find such
354ba319b5SDimitry Andric // opportunities; its runtime is quadratic in the number of loads in a BB.
364ba319b5SDimitry Andric //
374ba319b5SDimitry Andric // Some CPU architectures, such as ARM, have instructions that load into
384ba319b5SDimitry Andric // multiple scalar registers, similar to a GPU vectorized load. In theory ARM
394ba319b5SDimitry Andric // could use this pass (with some modifications), but currently it implements
404ba319b5SDimitry Andric // its own pass to do something similar to what we do here.
413ca95b02SDimitry Andric
422cab237bSDimitry Andric #include "llvm/ADT/APInt.h"
432cab237bSDimitry Andric #include "llvm/ADT/ArrayRef.h"
443ca95b02SDimitry Andric #include "llvm/ADT/MapVector.h"
453ca95b02SDimitry Andric #include "llvm/ADT/PostOrderIterator.h"
462cab237bSDimitry Andric #include "llvm/ADT/STLExtras.h"
472cab237bSDimitry Andric #include "llvm/ADT/SmallPtrSet.h"
482cab237bSDimitry Andric #include "llvm/ADT/SmallVector.h"
493ca95b02SDimitry Andric #include "llvm/ADT/Statistic.h"
502cab237bSDimitry Andric #include "llvm/ADT/iterator_range.h"
513ca95b02SDimitry Andric #include "llvm/Analysis/AliasAnalysis.h"
522cab237bSDimitry Andric #include "llvm/Analysis/MemoryLocation.h"
53d88c1a5aSDimitry Andric #include "llvm/Analysis/OrderedBasicBlock.h"
543ca95b02SDimitry Andric #include "llvm/Analysis/ScalarEvolution.h"
553ca95b02SDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h"
564ba319b5SDimitry Andric #include "llvm/Transforms/Utils/Local.h"
573ca95b02SDimitry Andric #include "llvm/Analysis/ValueTracking.h"
583ca95b02SDimitry Andric #include "llvm/Analysis/VectorUtils.h"
592cab237bSDimitry Andric #include "llvm/IR/Attributes.h"
602cab237bSDimitry Andric #include "llvm/IR/BasicBlock.h"
612cab237bSDimitry Andric #include "llvm/IR/Constants.h"
623ca95b02SDimitry Andric #include "llvm/IR/DataLayout.h"
632cab237bSDimitry Andric #include "llvm/IR/DerivedTypes.h"
643ca95b02SDimitry Andric #include "llvm/IR/Dominators.h"
652cab237bSDimitry Andric #include "llvm/IR/Function.h"
663ca95b02SDimitry Andric #include "llvm/IR/IRBuilder.h"
672cab237bSDimitry Andric #include "llvm/IR/InstrTypes.h"
682cab237bSDimitry Andric #include "llvm/IR/Instruction.h"
693ca95b02SDimitry Andric #include "llvm/IR/Instructions.h"
702cab237bSDimitry Andric #include "llvm/IR/IntrinsicInst.h"
713ca95b02SDimitry Andric #include "llvm/IR/Module.h"
723ca95b02SDimitry Andric #include "llvm/IR/Type.h"
732cab237bSDimitry Andric #include "llvm/IR/User.h"
743ca95b02SDimitry Andric #include "llvm/IR/Value.h"
752cab237bSDimitry Andric #include "llvm/Pass.h"
762cab237bSDimitry Andric #include "llvm/Support/Casting.h"
773ca95b02SDimitry Andric #include "llvm/Support/Debug.h"
7851690af2SDimitry Andric #include "llvm/Support/KnownBits.h"
792cab237bSDimitry Andric #include "llvm/Support/MathExtras.h"
803ca95b02SDimitry Andric #include "llvm/Support/raw_ostream.h"
813ca95b02SDimitry Andric #include "llvm/Transforms/Vectorize.h"
82*b5893f02SDimitry Andric #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
832cab237bSDimitry Andric #include <algorithm>
842cab237bSDimitry Andric #include <cassert>
852cab237bSDimitry Andric #include <cstdlib>
862cab237bSDimitry Andric #include <tuple>
872cab237bSDimitry Andric #include <utility>
883ca95b02SDimitry Andric
893ca95b02SDimitry Andric using namespace llvm;
903ca95b02SDimitry Andric
913ca95b02SDimitry Andric #define DEBUG_TYPE "load-store-vectorizer"
922cab237bSDimitry Andric
933ca95b02SDimitry Andric STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
943ca95b02SDimitry Andric STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
953ca95b02SDimitry Andric
96d88c1a5aSDimitry Andric // FIXME: Assuming stack alignment of 4 is always good enough
97d88c1a5aSDimitry Andric static const unsigned StackAdjustedAlignment = 4;
982cab237bSDimitry Andric
992cab237bSDimitry Andric namespace {
1002cab237bSDimitry Andric
1014ba319b5SDimitry Andric /// ChainID is an arbitrary token that is allowed to be different only for the
1024ba319b5SDimitry Andric /// accesses that are guaranteed to be considered non-consecutive by
1034ba319b5SDimitry Andric /// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
1044ba319b5SDimitry Andric /// together and reducing the number of instructions the main search operates on
1054ba319b5SDimitry Andric /// at a time, i.e. this is to reduce compile time and nothing else as the main
1064ba319b5SDimitry Andric /// search has O(n^2) time complexity. The underlying type of ChainID should not
1074ba319b5SDimitry Andric /// be relied upon.
1084ba319b5SDimitry Andric using ChainID = const Value *;
1092cab237bSDimitry Andric using InstrList = SmallVector<Instruction *, 8>;
1104ba319b5SDimitry Andric using InstrListMap = MapVector<ChainID, InstrList>;
1113ca95b02SDimitry Andric
1123ca95b02SDimitry Andric class Vectorizer {
1133ca95b02SDimitry Andric Function &F;
1143ca95b02SDimitry Andric AliasAnalysis &AA;
1153ca95b02SDimitry Andric DominatorTree &DT;
1163ca95b02SDimitry Andric ScalarEvolution &SE;
1173ca95b02SDimitry Andric TargetTransformInfo &TTI;
1183ca95b02SDimitry Andric const DataLayout &DL;
1193ca95b02SDimitry Andric IRBuilder<> Builder;
1203ca95b02SDimitry Andric
1213ca95b02SDimitry Andric public:
Vectorizer(Function & F,AliasAnalysis & AA,DominatorTree & DT,ScalarEvolution & SE,TargetTransformInfo & TTI)1223ca95b02SDimitry Andric Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
1233ca95b02SDimitry Andric ScalarEvolution &SE, TargetTransformInfo &TTI)
1243ca95b02SDimitry Andric : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
1253ca95b02SDimitry Andric DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
1263ca95b02SDimitry Andric
1273ca95b02SDimitry Andric bool run();
1283ca95b02SDimitry Andric
1293ca95b02SDimitry Andric private:
1303ca95b02SDimitry Andric unsigned getPointerAddressSpace(Value *I);
1313ca95b02SDimitry Andric
getAlignment(LoadInst * LI) const1323ca95b02SDimitry Andric unsigned getAlignment(LoadInst *LI) const {
1333ca95b02SDimitry Andric unsigned Align = LI->getAlignment();
1343ca95b02SDimitry Andric if (Align != 0)
1353ca95b02SDimitry Andric return Align;
1363ca95b02SDimitry Andric
1373ca95b02SDimitry Andric return DL.getABITypeAlignment(LI->getType());
1383ca95b02SDimitry Andric }
1393ca95b02SDimitry Andric
getAlignment(StoreInst * SI) const1403ca95b02SDimitry Andric unsigned getAlignment(StoreInst *SI) const {
1413ca95b02SDimitry Andric unsigned Align = SI->getAlignment();
1423ca95b02SDimitry Andric if (Align != 0)
1433ca95b02SDimitry Andric return Align;
1443ca95b02SDimitry Andric
1453ca95b02SDimitry Andric return DL.getABITypeAlignment(SI->getValueOperand()->getType());
1463ca95b02SDimitry Andric }
1473ca95b02SDimitry Andric
1484ba319b5SDimitry Andric static const unsigned MaxDepth = 3;
1494ba319b5SDimitry Andric
1503ca95b02SDimitry Andric bool isConsecutiveAccess(Value *A, Value *B);
1514ba319b5SDimitry Andric bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
1524ba319b5SDimitry Andric unsigned Depth = 0) const;
1534ba319b5SDimitry Andric bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
1544ba319b5SDimitry Andric unsigned Depth) const;
1554ba319b5SDimitry Andric bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
1564ba319b5SDimitry Andric unsigned Depth) const;
1573ca95b02SDimitry Andric
1583ca95b02SDimitry Andric /// After vectorization, reorder the instructions that I depends on
1593ca95b02SDimitry Andric /// (the instructions defining its operands), to ensure they dominate I.
1603ca95b02SDimitry Andric void reorder(Instruction *I);
1613ca95b02SDimitry Andric
1623ca95b02SDimitry Andric /// Returns the first and the last instructions in Chain.
1633ca95b02SDimitry Andric std::pair<BasicBlock::iterator, BasicBlock::iterator>
164d88c1a5aSDimitry Andric getBoundaryInstrs(ArrayRef<Instruction *> Chain);
1653ca95b02SDimitry Andric
1663ca95b02SDimitry Andric /// Erases the original instructions after vectorizing.
167d88c1a5aSDimitry Andric void eraseInstructions(ArrayRef<Instruction *> Chain);
1683ca95b02SDimitry Andric
1693ca95b02SDimitry Andric /// "Legalize" the vector type that would be produced by combining \p
1703ca95b02SDimitry Andric /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
1713ca95b02SDimitry Andric /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
1723ca95b02SDimitry Andric /// expected to have more than 4 elements.
173d88c1a5aSDimitry Andric std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
174d88c1a5aSDimitry Andric splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
1753ca95b02SDimitry Andric
176d88c1a5aSDimitry Andric /// Finds the largest prefix of Chain that's vectorizable, checking for
177d88c1a5aSDimitry Andric /// intervening instructions which may affect the memory accessed by the
178d88c1a5aSDimitry Andric /// instructions within Chain.
179d88c1a5aSDimitry Andric ///
180d88c1a5aSDimitry Andric /// The elements of \p Chain must be all loads or all stores and must be in
181d88c1a5aSDimitry Andric /// address order.
182d88c1a5aSDimitry Andric ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
1833ca95b02SDimitry Andric
1843ca95b02SDimitry Andric /// Collects load and store instructions to vectorize.
185d88c1a5aSDimitry Andric std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
1863ca95b02SDimitry Andric
187d88c1a5aSDimitry Andric /// Processes the collected instructions, the \p Map. The values of \p Map
1883ca95b02SDimitry Andric /// should be all loads or all stores.
189d88c1a5aSDimitry Andric bool vectorizeChains(InstrListMap &Map);
1903ca95b02SDimitry Andric
1913ca95b02SDimitry Andric /// Finds the load/stores to consecutive memory addresses and vectorizes them.
192d88c1a5aSDimitry Andric bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
1933ca95b02SDimitry Andric
1943ca95b02SDimitry Andric /// Vectorizes the load instructions in Chain.
195d88c1a5aSDimitry Andric bool
196d88c1a5aSDimitry Andric vectorizeLoadChain(ArrayRef<Instruction *> Chain,
197d88c1a5aSDimitry Andric SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
1983ca95b02SDimitry Andric
1993ca95b02SDimitry Andric /// Vectorizes the store instructions in Chain.
200d88c1a5aSDimitry Andric bool
201d88c1a5aSDimitry Andric vectorizeStoreChain(ArrayRef<Instruction *> Chain,
202d88c1a5aSDimitry Andric SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
2033ca95b02SDimitry Andric
204d88c1a5aSDimitry Andric /// Check if this load/store access is misaligned accesses.
2053ca95b02SDimitry Andric bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
2063ca95b02SDimitry Andric unsigned Alignment);
2073ca95b02SDimitry Andric };
2083ca95b02SDimitry Andric
209*b5893f02SDimitry Andric class LoadStoreVectorizerLegacyPass : public FunctionPass {
2103ca95b02SDimitry Andric public:
2113ca95b02SDimitry Andric static char ID;
2123ca95b02SDimitry Andric
LoadStoreVectorizerLegacyPass()213*b5893f02SDimitry Andric LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
214*b5893f02SDimitry Andric initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
2153ca95b02SDimitry Andric }
2163ca95b02SDimitry Andric
2173ca95b02SDimitry Andric bool runOnFunction(Function &F) override;
2183ca95b02SDimitry Andric
getPassName() const219d88c1a5aSDimitry Andric StringRef getPassName() const override {
2203ca95b02SDimitry Andric return "GPU Load and Store Vectorizer";
2213ca95b02SDimitry Andric }
2223ca95b02SDimitry Andric
getAnalysisUsage(AnalysisUsage & AU) const2233ca95b02SDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override {
2243ca95b02SDimitry Andric AU.addRequired<AAResultsWrapperPass>();
2253ca95b02SDimitry Andric AU.addRequired<ScalarEvolutionWrapperPass>();
2263ca95b02SDimitry Andric AU.addRequired<DominatorTreeWrapperPass>();
2273ca95b02SDimitry Andric AU.addRequired<TargetTransformInfoWrapperPass>();
2283ca95b02SDimitry Andric AU.setPreservesCFG();
2293ca95b02SDimitry Andric }
2303ca95b02SDimitry Andric };
2312cab237bSDimitry Andric
2322cab237bSDimitry Andric } // end anonymous namespace
2332cab237bSDimitry Andric
234*b5893f02SDimitry Andric char LoadStoreVectorizerLegacyPass::ID = 0;
2353ca95b02SDimitry Andric
236*b5893f02SDimitry Andric INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
2373ca95b02SDimitry Andric "Vectorize load and Store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)2383ca95b02SDimitry Andric INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
2393ca95b02SDimitry Andric INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
2403ca95b02SDimitry Andric INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
2413ca95b02SDimitry Andric INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
2423ca95b02SDimitry Andric INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
243*b5893f02SDimitry Andric INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
2443ca95b02SDimitry Andric "Vectorize load and store instructions", false, false)
2453ca95b02SDimitry Andric
2463ca95b02SDimitry Andric Pass *llvm::createLoadStoreVectorizerPass() {
247*b5893f02SDimitry Andric return new LoadStoreVectorizerLegacyPass();
2483ca95b02SDimitry Andric }
2493ca95b02SDimitry Andric
runOnFunction(Function & F)250*b5893f02SDimitry Andric bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
2513ca95b02SDimitry Andric // Don't vectorize when the attribute NoImplicitFloat is used.
2523ca95b02SDimitry Andric if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
2533ca95b02SDimitry Andric return false;
2543ca95b02SDimitry Andric
2553ca95b02SDimitry Andric AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
2563ca95b02SDimitry Andric DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2573ca95b02SDimitry Andric ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2583ca95b02SDimitry Andric TargetTransformInfo &TTI =
2593ca95b02SDimitry Andric getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2603ca95b02SDimitry Andric
2613ca95b02SDimitry Andric Vectorizer V(F, AA, DT, SE, TTI);
2623ca95b02SDimitry Andric return V.run();
2633ca95b02SDimitry Andric }
2643ca95b02SDimitry Andric
run(Function & F,FunctionAnalysisManager & AM)265*b5893f02SDimitry Andric PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
266*b5893f02SDimitry Andric // Don't vectorize when the attribute NoImplicitFloat is used.
267*b5893f02SDimitry Andric if (F.hasFnAttribute(Attribute::NoImplicitFloat))
268*b5893f02SDimitry Andric return PreservedAnalyses::all();
269*b5893f02SDimitry Andric
270*b5893f02SDimitry Andric AliasAnalysis &AA = AM.getResult<AAManager>(F);
271*b5893f02SDimitry Andric DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
272*b5893f02SDimitry Andric ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
273*b5893f02SDimitry Andric TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
274*b5893f02SDimitry Andric
275*b5893f02SDimitry Andric Vectorizer V(F, AA, DT, SE, TTI);
276*b5893f02SDimitry Andric bool Changed = V.run();
277*b5893f02SDimitry Andric PreservedAnalyses PA;
278*b5893f02SDimitry Andric PA.preserveSet<CFGAnalyses>();
279*b5893f02SDimitry Andric return Changed ? PA : PreservedAnalyses::all();
280*b5893f02SDimitry Andric }
281*b5893f02SDimitry Andric
282*b5893f02SDimitry Andric // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
283*b5893f02SDimitry Andric // vectors of Instructions.
propagateMetadata(Instruction * I,ArrayRef<Instruction * > IL)284*b5893f02SDimitry Andric static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
285*b5893f02SDimitry Andric SmallVector<Value *, 8> VL(IL.begin(), IL.end());
286*b5893f02SDimitry Andric propagateMetadata(I, VL);
287*b5893f02SDimitry Andric }
288*b5893f02SDimitry Andric
2893ca95b02SDimitry Andric // Vectorizer Implementation
run()2903ca95b02SDimitry Andric bool Vectorizer::run() {
2913ca95b02SDimitry Andric bool Changed = false;
2923ca95b02SDimitry Andric
2933ca95b02SDimitry Andric // Scan the blocks in the function in post order.
2943ca95b02SDimitry Andric for (BasicBlock *BB : post_order(&F)) {
295d88c1a5aSDimitry Andric InstrListMap LoadRefs, StoreRefs;
296d88c1a5aSDimitry Andric std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
2973ca95b02SDimitry Andric Changed |= vectorizeChains(LoadRefs);
2983ca95b02SDimitry Andric Changed |= vectorizeChains(StoreRefs);
2993ca95b02SDimitry Andric }
3003ca95b02SDimitry Andric
3013ca95b02SDimitry Andric return Changed;
3023ca95b02SDimitry Andric }
3033ca95b02SDimitry Andric
getPointerAddressSpace(Value * I)3043ca95b02SDimitry Andric unsigned Vectorizer::getPointerAddressSpace(Value *I) {
3053ca95b02SDimitry Andric if (LoadInst *L = dyn_cast<LoadInst>(I))
3063ca95b02SDimitry Andric return L->getPointerAddressSpace();
3073ca95b02SDimitry Andric if (StoreInst *S = dyn_cast<StoreInst>(I))
3083ca95b02SDimitry Andric return S->getPointerAddressSpace();
3093ca95b02SDimitry Andric return -1;
3103ca95b02SDimitry Andric }
3113ca95b02SDimitry Andric
3123ca95b02SDimitry Andric // FIXME: Merge with llvm::isConsecutiveAccess
isConsecutiveAccess(Value * A,Value * B)3133ca95b02SDimitry Andric bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
3144ba319b5SDimitry Andric Value *PtrA = getLoadStorePointerOperand(A);
3154ba319b5SDimitry Andric Value *PtrB = getLoadStorePointerOperand(B);
3163ca95b02SDimitry Andric unsigned ASA = getPointerAddressSpace(A);
3173ca95b02SDimitry Andric unsigned ASB = getPointerAddressSpace(B);
3183ca95b02SDimitry Andric
3193ca95b02SDimitry Andric // Check that the address spaces match and that the pointers are valid.
3203ca95b02SDimitry Andric if (!PtrA || !PtrB || (ASA != ASB))
3213ca95b02SDimitry Andric return false;
3223ca95b02SDimitry Andric
3233ca95b02SDimitry Andric // Make sure that A and B are different pointers of the same size type.
3243ca95b02SDimitry Andric Type *PtrATy = PtrA->getType()->getPointerElementType();
3253ca95b02SDimitry Andric Type *PtrBTy = PtrB->getType()->getPointerElementType();
3263ca95b02SDimitry Andric if (PtrA == PtrB ||
3274ba319b5SDimitry Andric PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
3283ca95b02SDimitry Andric DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
3293ca95b02SDimitry Andric DL.getTypeStoreSize(PtrATy->getScalarType()) !=
3303ca95b02SDimitry Andric DL.getTypeStoreSize(PtrBTy->getScalarType()))
3313ca95b02SDimitry Andric return false;
3323ca95b02SDimitry Andric
3334ba319b5SDimitry Andric unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
3343ca95b02SDimitry Andric APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
3353ca95b02SDimitry Andric
3364ba319b5SDimitry Andric return areConsecutivePointers(PtrA, PtrB, Size);
3374ba319b5SDimitry Andric }
3384ba319b5SDimitry Andric
areConsecutivePointers(Value * PtrA,Value * PtrB,const APInt & PtrDelta,unsigned Depth) const3394ba319b5SDimitry Andric bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
3404ba319b5SDimitry Andric const APInt &PtrDelta,
3414ba319b5SDimitry Andric unsigned Depth) const {
3424ba319b5SDimitry Andric unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
3434ba319b5SDimitry Andric APInt OffsetA(PtrBitWidth, 0);
3444ba319b5SDimitry Andric APInt OffsetB(PtrBitWidth, 0);
3453ca95b02SDimitry Andric PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
3463ca95b02SDimitry Andric PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
3473ca95b02SDimitry Andric
3483ca95b02SDimitry Andric APInt OffsetDelta = OffsetB - OffsetA;
3493ca95b02SDimitry Andric
3503ca95b02SDimitry Andric // Check if they are based on the same pointer. That makes the offsets
3513ca95b02SDimitry Andric // sufficient.
3523ca95b02SDimitry Andric if (PtrA == PtrB)
3534ba319b5SDimitry Andric return OffsetDelta == PtrDelta;
3543ca95b02SDimitry Andric
3553ca95b02SDimitry Andric // Compute the necessary base pointer delta to have the necessary final delta
3564ba319b5SDimitry Andric // equal to the pointer delta requested.
3574ba319b5SDimitry Andric APInt BaseDelta = PtrDelta - OffsetDelta;
3583ca95b02SDimitry Andric
3593ca95b02SDimitry Andric // Compute the distance with SCEV between the base pointers.
3603ca95b02SDimitry Andric const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
3613ca95b02SDimitry Andric const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
3623ca95b02SDimitry Andric const SCEV *C = SE.getConstant(BaseDelta);
3633ca95b02SDimitry Andric const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
3643ca95b02SDimitry Andric if (X == PtrSCEVB)
3653ca95b02SDimitry Andric return true;
3663ca95b02SDimitry Andric
3674ba319b5SDimitry Andric // The above check will not catch the cases where one of the pointers is
3684ba319b5SDimitry Andric // factorized but the other one is not, such as (C + (S * (A + B))) vs
3694ba319b5SDimitry Andric // (AS + BS). Get the minus scev. That will allow re-combining the expresions
3704ba319b5SDimitry Andric // and getting the simplified difference.
3714ba319b5SDimitry Andric const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
3724ba319b5SDimitry Andric if (C == Dist)
3734ba319b5SDimitry Andric return true;
3744ba319b5SDimitry Andric
3753ca95b02SDimitry Andric // Sometimes even this doesn't work, because SCEV can't always see through
3763ca95b02SDimitry Andric // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
3773ca95b02SDimitry Andric // things the hard way.
3784ba319b5SDimitry Andric return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
3794ba319b5SDimitry Andric }
3804ba319b5SDimitry Andric
lookThroughComplexAddresses(Value * PtrA,Value * PtrB,APInt PtrDelta,unsigned Depth) const3814ba319b5SDimitry Andric bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
3824ba319b5SDimitry Andric APInt PtrDelta,
3834ba319b5SDimitry Andric unsigned Depth) const {
3844ba319b5SDimitry Andric auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
3854ba319b5SDimitry Andric auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
3864ba319b5SDimitry Andric if (!GEPA || !GEPB)
3874ba319b5SDimitry Andric return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
3883ca95b02SDimitry Andric
3893ca95b02SDimitry Andric // Look through GEPs after checking they're the same except for the last
3903ca95b02SDimitry Andric // index.
3914ba319b5SDimitry Andric if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
3924ba319b5SDimitry Andric GEPA->getPointerOperand() != GEPB->getPointerOperand())
3933ca95b02SDimitry Andric return false;
3944ba319b5SDimitry Andric gep_type_iterator GTIA = gep_type_begin(GEPA);
3954ba319b5SDimitry Andric gep_type_iterator GTIB = gep_type_begin(GEPB);
3964ba319b5SDimitry Andric for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
3974ba319b5SDimitry Andric if (GTIA.getOperand() != GTIB.getOperand())
3983ca95b02SDimitry Andric return false;
3994ba319b5SDimitry Andric ++GTIA;
4004ba319b5SDimitry Andric ++GTIB;
4014ba319b5SDimitry Andric }
4023ca95b02SDimitry Andric
4034ba319b5SDimitry Andric Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
4044ba319b5SDimitry Andric Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
4053ca95b02SDimitry Andric if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
4063ca95b02SDimitry Andric OpA->getType() != OpB->getType())
4073ca95b02SDimitry Andric return false;
4083ca95b02SDimitry Andric
4094ba319b5SDimitry Andric if (PtrDelta.isNegative()) {
4104ba319b5SDimitry Andric if (PtrDelta.isMinSignedValue())
4114ba319b5SDimitry Andric return false;
4124ba319b5SDimitry Andric PtrDelta.negate();
4134ba319b5SDimitry Andric std::swap(OpA, OpB);
4144ba319b5SDimitry Andric }
4154ba319b5SDimitry Andric uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
4164ba319b5SDimitry Andric if (PtrDelta.urem(Stride) != 0)
4174ba319b5SDimitry Andric return false;
4184ba319b5SDimitry Andric unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
4194ba319b5SDimitry Andric APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
4204ba319b5SDimitry Andric
4213ca95b02SDimitry Andric // Only look through a ZExt/SExt.
4223ca95b02SDimitry Andric if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
4233ca95b02SDimitry Andric return false;
4243ca95b02SDimitry Andric
4253ca95b02SDimitry Andric bool Signed = isa<SExtInst>(OpA);
4263ca95b02SDimitry Andric
4274ba319b5SDimitry Andric // At this point A could be a function parameter, i.e. not an instruction
4284ba319b5SDimitry Andric Value *ValA = OpA->getOperand(0);
4293ca95b02SDimitry Andric OpB = dyn_cast<Instruction>(OpB->getOperand(0));
4304ba319b5SDimitry Andric if (!OpB || ValA->getType() != OpB->getType())
4313ca95b02SDimitry Andric return false;
4323ca95b02SDimitry Andric
4334ba319b5SDimitry Andric // Now we need to prove that adding IdxDiff to ValA won't overflow.
4343ca95b02SDimitry Andric bool Safe = false;
4354ba319b5SDimitry Andric // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
4364ba319b5SDimitry Andric // ValA, we're okay.
4373ca95b02SDimitry Andric if (OpB->getOpcode() == Instruction::Add &&
4383ca95b02SDimitry Andric isa<ConstantInt>(OpB->getOperand(1)) &&
4394ba319b5SDimitry Andric IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
4403ca95b02SDimitry Andric if (Signed)
4413ca95b02SDimitry Andric Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
4423ca95b02SDimitry Andric else
4433ca95b02SDimitry Andric Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
4443ca95b02SDimitry Andric }
4453ca95b02SDimitry Andric
4464ba319b5SDimitry Andric unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
4473ca95b02SDimitry Andric
4483ca95b02SDimitry Andric // Second attempt:
4494ba319b5SDimitry Andric // If all set bits of IdxDiff or any higher order bit other than the sign bit
4504ba319b5SDimitry Andric // are known to be zero in ValA, we can add Diff to it while guaranteeing no
4514ba319b5SDimitry Andric // overflow of any sort.
4523ca95b02SDimitry Andric if (!Safe) {
4534ba319b5SDimitry Andric OpA = dyn_cast<Instruction>(ValA);
4544ba319b5SDimitry Andric if (!OpA)
4554ba319b5SDimitry Andric return false;
45651690af2SDimitry Andric KnownBits Known(BitWidth);
45751690af2SDimitry Andric computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
4584ba319b5SDimitry Andric APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
4594ba319b5SDimitry Andric if (Signed)
4604ba319b5SDimitry Andric BitsAllowedToBeSet.clearBit(BitWidth - 1);
4614ba319b5SDimitry Andric if (BitsAllowedToBeSet.ult(IdxDiff))
4624ba319b5SDimitry Andric return false;
4633ca95b02SDimitry Andric }
4643ca95b02SDimitry Andric
4654ba319b5SDimitry Andric const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
4664ba319b5SDimitry Andric const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
4674ba319b5SDimitry Andric const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
4684ba319b5SDimitry Andric const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
4694ba319b5SDimitry Andric return X == OffsetSCEVB;
4704ba319b5SDimitry Andric }
4714ba319b5SDimitry Andric
lookThroughSelects(Value * PtrA,Value * PtrB,const APInt & PtrDelta,unsigned Depth) const4724ba319b5SDimitry Andric bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
4734ba319b5SDimitry Andric const APInt &PtrDelta,
4744ba319b5SDimitry Andric unsigned Depth) const {
4754ba319b5SDimitry Andric if (Depth++ == MaxDepth)
4763ca95b02SDimitry Andric return false;
4773ca95b02SDimitry Andric
4784ba319b5SDimitry Andric if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
4794ba319b5SDimitry Andric if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
4804ba319b5SDimitry Andric return SelectA->getCondition() == SelectB->getCondition() &&
4814ba319b5SDimitry Andric areConsecutivePointers(SelectA->getTrueValue(),
4824ba319b5SDimitry Andric SelectB->getTrueValue(), PtrDelta, Depth) &&
4834ba319b5SDimitry Andric areConsecutivePointers(SelectA->getFalseValue(),
4844ba319b5SDimitry Andric SelectB->getFalseValue(), PtrDelta, Depth);
4854ba319b5SDimitry Andric }
4864ba319b5SDimitry Andric }
4874ba319b5SDimitry Andric return false;
4883ca95b02SDimitry Andric }
4893ca95b02SDimitry Andric
// Hoists every same-block, non-PHI instruction that I (transitively) depends
// on and that does not already precede I, to a point just before I. This
// restores def-before-use order after the vectorizer plans to move I earlier
// in the block.
void Vectorizer::reorder(Instruction *I) {
  OrderedBasicBlock OBB(I->getParent());
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  // Walk I's operand tree, collecting every instruction in I's block that
  // does not dominate (i.e. precede) I.
  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      // PHIs belong at the block head and must never be moved.
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!OBB.dominates(IM, I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    // Step back before unlinking IM so that the loop's ++BBI lands on IM's
    // old successor rather than skipping it.
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}
5273ca95b02SDimitry Andric
5283ca95b02SDimitry Andric std::pair<BasicBlock::iterator, BasicBlock::iterator>
getBoundaryInstrs(ArrayRef<Instruction * > Chain)529d88c1a5aSDimitry Andric Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
530d88c1a5aSDimitry Andric Instruction *C0 = Chain[0];
5313ca95b02SDimitry Andric BasicBlock::iterator FirstInstr = C0->getIterator();
5323ca95b02SDimitry Andric BasicBlock::iterator LastInstr = C0->getIterator();
5333ca95b02SDimitry Andric
5343ca95b02SDimitry Andric BasicBlock *BB = C0->getParent();
5353ca95b02SDimitry Andric unsigned NumFound = 0;
5363ca95b02SDimitry Andric for (Instruction &I : *BB) {
5373ca95b02SDimitry Andric if (!is_contained(Chain, &I))
5383ca95b02SDimitry Andric continue;
5393ca95b02SDimitry Andric
5403ca95b02SDimitry Andric ++NumFound;
5413ca95b02SDimitry Andric if (NumFound == 1) {
5423ca95b02SDimitry Andric FirstInstr = I.getIterator();
5433ca95b02SDimitry Andric }
5443ca95b02SDimitry Andric if (NumFound == Chain.size()) {
5453ca95b02SDimitry Andric LastInstr = I.getIterator();
5463ca95b02SDimitry Andric break;
5473ca95b02SDimitry Andric }
5483ca95b02SDimitry Andric }
5493ca95b02SDimitry Andric
5503ca95b02SDimitry Andric // Range is [first, last).
5513ca95b02SDimitry Andric return std::make_pair(FirstInstr, ++LastInstr);
5523ca95b02SDimitry Andric }
5533ca95b02SDimitry Andric
eraseInstructions(ArrayRef<Instruction * > Chain)554d88c1a5aSDimitry Andric void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
5553ca95b02SDimitry Andric SmallVector<Instruction *, 16> Instrs;
556d88c1a5aSDimitry Andric for (Instruction *I : Chain) {
5574ba319b5SDimitry Andric Value *PtrOperand = getLoadStorePointerOperand(I);
5583ca95b02SDimitry Andric assert(PtrOperand && "Instruction must have a pointer operand.");
559d88c1a5aSDimitry Andric Instrs.push_back(I);
5603ca95b02SDimitry Andric if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
5613ca95b02SDimitry Andric Instrs.push_back(GEP);
5623ca95b02SDimitry Andric }
5633ca95b02SDimitry Andric
5643ca95b02SDimitry Andric // Erase instructions.
565d88c1a5aSDimitry Andric for (Instruction *I : Instrs)
566d88c1a5aSDimitry Andric if (I->use_empty())
567d88c1a5aSDimitry Andric I->eraseFromParent();
5683ca95b02SDimitry Andric }
5693ca95b02SDimitry Andric
570d88c1a5aSDimitry Andric std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
splitOddVectorElts(ArrayRef<Instruction * > Chain,unsigned ElementSizeBits)571d88c1a5aSDimitry Andric Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
5723ca95b02SDimitry Andric unsigned ElementSizeBits) {
573d88c1a5aSDimitry Andric unsigned ElementSizeBytes = ElementSizeBits / 8;
574d88c1a5aSDimitry Andric unsigned SizeBytes = ElementSizeBytes * Chain.size();
575d88c1a5aSDimitry Andric unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
5767a7e6055SDimitry Andric if (NumLeft == Chain.size()) {
5777a7e6055SDimitry Andric if ((NumLeft & 1) == 0)
5787a7e6055SDimitry Andric NumLeft /= 2; // Split even in half
5797a7e6055SDimitry Andric else
5807a7e6055SDimitry Andric --NumLeft; // Split off last element
5817a7e6055SDimitry Andric } else if (NumLeft == 0)
582d88c1a5aSDimitry Andric NumLeft = 1;
5833ca95b02SDimitry Andric return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
5843ca95b02SDimitry Andric }
5853ca95b02SDimitry Andric
// Returns the longest prefix of Chain (which is in address order) that can be
// vectorized without violating memory dependencies: no chain element in the
// prefix may be moved across an aliasing memory operation, a may-write/throw
// instruction (for load chains), or a may-read/write/throw instruction (for
// store chains) that sits between the chain's boundary instructions.
ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  LLVM_DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  // Partition the instructions between the chain's boundaries into chain
  // members and other memory operations; stop early at an instruction the
  // chain can never be moved across.
  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (!is_contained(Chain, &I))
        MemoryInstrs.push_back(&I);
      else
        ChainInstrs.push_back(&I);
    } else if (isa<IntrinsicInst>(&I) &&
               cast<IntrinsicInst>(&I)->getIntrinsicID() ==
                   Intrinsic::sideeffect) {
      // Ignore llvm.sideeffect calls.
    } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
                        << '\n');
      break;
    } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                        << '\n');
      break;
    }
  }

  OrderedBasicBlock OBB(Chain[0]->getParent());

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that follow
    // will not be added to the valid prefix.
    if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
        break;

      auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
      auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
      // Two loads never conflict with each other.
      if (MemLoad && ChainLoad)
        continue;

      // We can ignore the alias if the we have a load store pair and the load
      // is known to be invariant. The load cannot be clobbered by the store.
      auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
        return LI->getMetadata(LLVMContext::MD_invariant_load);
      };

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
      if (isa<StoreInst>(MemInstr) && ChainLoad &&
          (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
        continue;

      // Same case, but in reverse.
      if (MemLoad && isa<StoreInst>(ChainInstr) &&
          (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
        continue;

      if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
                        MemoryLocation::get(ChainInstr))) {
        LLVM_DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    "  Aliasing instruction and pointer:\n"
                 << "  " << *MemInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(MemInstr) << '\n'
                 << "  Aliased instruction and pointer:\n"
                 << "  " << *ChainInstr << '\n'
                 << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
        });
        // Save this aliasing memory instruction as a barrier, but allow other
        // instructions that precede the barrier to be vectorized with this one.
        BarrierMemoryInstr = MemInstr;
        break;
      }
    }
    // Continue the search only for store chains, since vectorizing stores that
    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
    // up to an aliasing store, but should not pull loads from further down in
    // the basic block.
    if (IsLoadChain && BarrierMemoryInstr) {
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
      assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
      break;
    }
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}
711d88c1a5aSDimitry Andric
getChainID(const Value * Ptr,const DataLayout & DL)7124ba319b5SDimitry Andric static ChainID getChainID(const Value *Ptr, const DataLayout &DL) {
7134ba319b5SDimitry Andric const Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
7144ba319b5SDimitry Andric if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
7154ba319b5SDimitry Andric // The select's themselves are distinct instructions even if they share the
7164ba319b5SDimitry Andric // same condition and evaluate to consecutive pointers for true and false
7174ba319b5SDimitry Andric // values of the condition. Therefore using the select's themselves for
7184ba319b5SDimitry Andric // grouping instructions would put consecutive accesses into different lists
7194ba319b5SDimitry Andric // and they won't be even checked for being consecutive, and won't be
7204ba319b5SDimitry Andric // vectorized.
7214ba319b5SDimitry Andric return Sel->getCondition();
7224ba319b5SDimitry Andric }
7234ba319b5SDimitry Andric return ObjPtr;
7244ba319b5SDimitry Andric }
7254ba319b5SDimitry Andric
// Scans BB and collects the simple, target-legal loads and stores that are
// plausible vectorization candidates, grouped into per-chain-ID lists.
// Returns {LoadRefs, StoreRefs}.
std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
  InstrListMap LoadRefs;
  InstrListMap StoreRefs;

  for (Instruction &I : *BB) {
    if (!I.mayReadOrWriteMemory())
      continue;

    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
      // Only simple (non-atomic, non-volatile) loads are candidates.
      if (!LI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeLoad(LI))
        continue;

      Type *Ty = LI->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and does not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>)
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      Value *Ptr = LI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      // (A single element must fit at least twice in a vector register; for
      // vector loads the target must also report a usable vector factor.)
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save the load locations.
      const ChainID ID = getChainID(Ptr, DL);
      LoadRefs[ID].push_back(LI);
    } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
      // Same filters as above, applied to stores.
      if (!SI->isSimple())
        continue;

      // Skip if it's not legal.
      if (!TTI.isLegalToVectorizeStore(SI))
        continue;

      Type *Ty = SI->getValueOperand()->getType();
      if (!VectorType::isValidElementType(Ty->getScalarType()))
        continue;

      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
      // functions are currently using an integer type for the vectorized
      // load/store, and does not support casting between the integer type and a
      // vector of pointers (e.g. i64 to <2 x i16*>)
      if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
        continue;

      // Skip weird non-byte sizes. They probably aren't worth the effort of
      // handling correctly.
      unsigned TySize = DL.getTypeSizeInBits(Ty);
      if ((TySize % 8) != 0)
        continue;

      Value *Ptr = SI->getPointerOperand();
      unsigned AS = Ptr->getType()->getPointerAddressSpace();
      unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

      unsigned VF = VecRegSize / TySize;
      VectorType *VecTy = dyn_cast<VectorType>(Ty);

      // No point in looking at these if they're too big to vectorize.
      if (TySize > VecRegSize / 2 ||
          (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
        continue;

      // Make sure all the users of a vector are constant-index extracts.
      if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
            const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
            return EEI && isa<ConstantInt>(EEI->getOperand(1));
          }))
        continue;

      // Save store location.
      const ChainID ID = getChainID(Ptr, DL);
      StoreRefs[ID].push_back(SI);
    }
  }

  return {LoadRefs, StoreRefs};
}
8333ca95b02SDimitry Andric
vectorizeChains(InstrListMap & Map)834d88c1a5aSDimitry Andric bool Vectorizer::vectorizeChains(InstrListMap &Map) {
8353ca95b02SDimitry Andric bool Changed = false;
8363ca95b02SDimitry Andric
8374ba319b5SDimitry Andric for (const std::pair<ChainID, InstrList> &Chain : Map) {
8383ca95b02SDimitry Andric unsigned Size = Chain.second.size();
8393ca95b02SDimitry Andric if (Size < 2)
8403ca95b02SDimitry Andric continue;
8413ca95b02SDimitry Andric
8424ba319b5SDimitry Andric LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
8433ca95b02SDimitry Andric
8443ca95b02SDimitry Andric // Process the stores in chunks of 64.
8453ca95b02SDimitry Andric for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
8463ca95b02SDimitry Andric unsigned Len = std::min<unsigned>(CE - CI, 64);
847d88c1a5aSDimitry Andric ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
8483ca95b02SDimitry Andric Changed |= vectorizeInstructions(Chunk);
8493ca95b02SDimitry Andric }
8503ca95b02SDimitry Andric }
8513ca95b02SDimitry Andric
8523ca95b02SDimitry Andric return Changed;
8533ca95b02SDimitry Andric }
8543ca95b02SDimitry Andric
// Pairs up the given loads/stores (at most 64, see vectorizeChains) into
// consecutive-access chains, then follows each chain from its head and hands
// it to vectorizeLoadChain/vectorizeStoreChain. Returns true on any change.
bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
  LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
                    << " instructions.\n");
  SmallVector<int, 16> Heads, Tails;
  // ConsecutiveChain[i] == j means Instrs[j] immediately follows Instrs[i]
  // in memory; -1 means no successor chosen yet. Size 64 matches the chunk
  // limit imposed by the caller.
  int ConsecutiveChain[64];

  // Do a quadratic search on all of the given loads/stores and find all of the
  // pairs of loads/stores that follow each other.
  for (int i = 0, e = Instrs.size(); i < e; ++i) {
    ConsecutiveChain[i] = -1;
    for (int j = e - 1; j >= 0; --j) {
      if (i == j)
        continue;

      if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
        if (ConsecutiveChain[i] != -1) {
          // Prefer a successor that comes after i and is closer than the one
          // already recorded.
          // NOTE(review): NewDistance is measured from the previously chosen
          // successor, not from i; this matches upstream but looks
          // asymmetric — confirm against the LLVM tree before changing.
          int CurDistance = std::abs(ConsecutiveChain[i] - i);
          int NewDistance = std::abs(ConsecutiveChain[i] - j);
          if (j < i || NewDistance > CurDistance)
            continue; // Should not insert.
        }

        Tails.push_back(j);
        Heads.push_back(i);
        ConsecutiveChain[i] = j;
      }
    }
  }

  bool Changed = false;
  SmallPtrSet<Instruction *, 16> InstructionsProcessed;

  for (int Head : Heads) {
    if (InstructionsProcessed.count(Instrs[Head]))
      continue;
    // Skip heads that are themselves the tail of a longer unprocessed chain;
    // that chain will cover this one.
    bool LongerChainExists = false;
    for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
      if (Head == Tails[TIt] &&
          !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
        LongerChainExists = true;
        break;
      }
    if (LongerChainExists)
      continue;

    // We found an instr that starts a chain. Now follow the chain and try to
    // vectorize it.
    SmallVector<Instruction *, 16> Operands;
    int I = Head;
    while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
      if (InstructionsProcessed.count(Instrs[I]))
        break;

      Operands.push_back(Instrs[I]);
      I = ConsecutiveChain[I];
    }

    bool Vectorized = false;
    if (isa<LoadInst>(*Operands.begin()))
      Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
    else
      Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);

    Changed |= Vectorized;
  }

  return Changed;
}
9233ca95b02SDimitry Andric
// Tries to replace the stores in Chain (address order) with a single vector
// store. May recurse on halves/slices of the chain when the whole chain is
// too long, misaligned, or illegal for the target. Every chain element that
// is decided upon (success or failure) is added to InstructionsProcessed so
// the caller never retries it. Returns true if any vector store was emitted.
bool Vectorizer::vectorizeStoreChain(
    ArrayRef<Instruction *> Chain,
    SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
  StoreInst *S0 = cast<StoreInst>(Chain[0]);

  // If the vector has an int element, default to int for the whole store.
  // Pointer-typed stores fall back to an integer of the same bit width.
  Type *StoreTy;
  for (Instruction *I : Chain) {
    StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
    if (StoreTy->isIntOrIntVectorTy())
      break;

    if (StoreTy->isPtrOrPtrVectorTy()) {
      StoreTy = Type::getIntNTy(F.getParent()->getContext(),
                                DL.getTypeSizeInBits(StoreTy));
      break;
    }
  }

  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
  unsigned AS = S0->getPointerAddressSpace();
  unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
  unsigned VF = VecRegSize / Sz;
  unsigned ChainSize = Chain.size();
  unsigned Alignment = getAlignment(S0);

  // Bail out early on non-power-of-two elements or degenerate chains; mark
  // everything processed so it is not revisited.
  if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }

  ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
  if (NewChain.empty()) {
    // No vectorization possible.
    InstructionsProcessed->insert(Chain.begin(), Chain.end());
    return false;
  }
  if (NewChain.size() == 1) {
    // Failed after the first instruction. Discard it and try the smaller chain.
    InstructionsProcessed->insert(NewChain.front());
    return false;
  }

  // Update Chain to the valid vectorizable subchain.
  Chain = NewChain;
  ChainSize = Chain.size();

  // Check if it's legal to vectorize this chain. If not, split the chain and
  // try again.
  unsigned EltSzInBytes = Sz / 8;
  unsigned SzInBytes = EltSzInBytes * ChainSize;

  // Storing vectors of vectors flattens into one wide vector of scalars.
  VectorType *VecTy;
  VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
  if (VecStoreTy)
    VecTy = VectorType::get(StoreTy->getScalarType(),
                            Chain.size() * VecStoreTy->getNumElements());
  else
    VecTy = VectorType::get(StoreTy, Chain.size());

  // If it's more than the max vector size or the target has a better
  // vector factor, break it into two pieces.
  unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
  if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
    LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
                         " Creating two separate arrays.\n");
    return vectorizeStoreChain(Chain.slice(0, TargetVF),
                               InstructionsProcessed) |
           vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
  }

  LLVM_DEBUG({
    dbgs() << "LSV: Stores to vectorize:\n";
    for (Instruction *I : Chain)
      dbgs() << "  " << *I << "\n";
  });

  // We won't try again to vectorize the elements of the chain, regardless of
  // whether we succeed below.
  InstructionsProcessed->insert(Chain.begin(), Chain.end());

  // If the store is going to be misaligned, don't vectorize it.
  if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
      // Not an alloca-space store: give up on this width and retry halves.
      auto Chains = splitOddVectorElts(Chain, Sz);
      return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
             vectorizeStoreChain(Chains.second, InstructionsProcessed);
    }

    // Stack slots can have their alignment raised instead of splitting.
    unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                   StackAdjustedAlignment,
                                                   DL, S0, nullptr, &DT);
    if (NewAlign != 0)
      Alignment = NewAlign;
  }

  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  // Insert the vector store after the last chain member in BB order.
  BasicBlock::iterator First, Last;
  std::tie(First, Last) = getBoundaryInstrs(Chain);
  Builder.SetInsertPoint(&*Last);

  // Build the wide value by inserting each stored value (or each element of
  // each stored vector) at its flattened index.
  Value *Vec = UndefValue::get(VecTy);

  if (VecStoreTy) {
    unsigned VecWidth = VecStoreTy->getNumElements();
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
        unsigned NewIdx = J + I * VecWidth;
        Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
                                                      Builder.getInt32(J));
        if (Extract->getType() != StoreTy->getScalarType())
          Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());

        Value *Insert =
            Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
        Vec = Insert;
      }
    }
  } else {
    for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
      StoreInst *Store = cast<StoreInst>(Chain[I]);
      Value *Extract = Store->getValueOperand();
      // BitOrPointerCast handles the pointer-to-integer fallback chosen above.
      if (Extract->getType() != StoreTy->getScalarType())
        Extract =
            Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());

      Value *Insert =
          Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
      Vec = Insert;
    }
  }

  StoreInst *SI = Builder.CreateAlignedStore(
    Vec,
    Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
    Alignment);
  propagateMetadata(SI, Chain);

  eraseInstructions(Chain);
  ++NumVectorInstructions;
  NumScalarsVectorized += Chain.size();
  return true;
}
10733ca95b02SDimitry Andric
vectorizeLoadChain(ArrayRef<Instruction * > Chain,SmallPtrSet<Instruction *,16> * InstructionsProcessed)10743ca95b02SDimitry Andric bool Vectorizer::vectorizeLoadChain(
1075d88c1a5aSDimitry Andric ArrayRef<Instruction *> Chain,
1076d88c1a5aSDimitry Andric SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
10773ca95b02SDimitry Andric LoadInst *L0 = cast<LoadInst>(Chain[0]);
10783ca95b02SDimitry Andric
10793ca95b02SDimitry Andric // If the vector has an int element, default to int for the whole load.
10803ca95b02SDimitry Andric Type *LoadTy;
10813ca95b02SDimitry Andric for (const auto &V : Chain) {
10823ca95b02SDimitry Andric LoadTy = cast<LoadInst>(V)->getType();
10833ca95b02SDimitry Andric if (LoadTy->isIntOrIntVectorTy())
10843ca95b02SDimitry Andric break;
10853ca95b02SDimitry Andric
10863ca95b02SDimitry Andric if (LoadTy->isPtrOrPtrVectorTy()) {
10873ca95b02SDimitry Andric LoadTy = Type::getIntNTy(F.getParent()->getContext(),
10883ca95b02SDimitry Andric DL.getTypeSizeInBits(LoadTy));
10893ca95b02SDimitry Andric break;
10903ca95b02SDimitry Andric }
10913ca95b02SDimitry Andric }
10923ca95b02SDimitry Andric
10933ca95b02SDimitry Andric unsigned Sz = DL.getTypeSizeInBits(LoadTy);
10943ca95b02SDimitry Andric unsigned AS = L0->getPointerAddressSpace();
10953ca95b02SDimitry Andric unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
10963ca95b02SDimitry Andric unsigned VF = VecRegSize / Sz;
10973ca95b02SDimitry Andric unsigned ChainSize = Chain.size();
1098d88c1a5aSDimitry Andric unsigned Alignment = getAlignment(L0);
10993ca95b02SDimitry Andric
11003ca95b02SDimitry Andric if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
11013ca95b02SDimitry Andric InstructionsProcessed->insert(Chain.begin(), Chain.end());
11023ca95b02SDimitry Andric return false;
11033ca95b02SDimitry Andric }
11043ca95b02SDimitry Andric
1105d88c1a5aSDimitry Andric ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
1106d88c1a5aSDimitry Andric if (NewChain.empty()) {
1107d88c1a5aSDimitry Andric // No vectorization possible.
11083ca95b02SDimitry Andric InstructionsProcessed->insert(Chain.begin(), Chain.end());
11093ca95b02SDimitry Andric return false;
11103ca95b02SDimitry Andric }
1111d88c1a5aSDimitry Andric if (NewChain.size() == 1) {
11123ca95b02SDimitry Andric // Failed after the first instruction. Discard it and try the smaller chain.
1113d88c1a5aSDimitry Andric InstructionsProcessed->insert(NewChain.front());
11143ca95b02SDimitry Andric return false;
11153ca95b02SDimitry Andric }
11163ca95b02SDimitry Andric
11173ca95b02SDimitry Andric // Update Chain to the valid vectorizable subchain.
1118d88c1a5aSDimitry Andric Chain = NewChain;
11193ca95b02SDimitry Andric ChainSize = Chain.size();
11203ca95b02SDimitry Andric
1121d88c1a5aSDimitry Andric // Check if it's legal to vectorize this chain. If not, split the chain and
1122d88c1a5aSDimitry Andric // try again.
1123d88c1a5aSDimitry Andric unsigned EltSzInBytes = Sz / 8;
1124d88c1a5aSDimitry Andric unsigned SzInBytes = EltSzInBytes * ChainSize;
11253ca95b02SDimitry Andric VectorType *VecTy;
11263ca95b02SDimitry Andric VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
11273ca95b02SDimitry Andric if (VecLoadTy)
11283ca95b02SDimitry Andric VecTy = VectorType::get(LoadTy->getScalarType(),
11293ca95b02SDimitry Andric Chain.size() * VecLoadTy->getNumElements());
11303ca95b02SDimitry Andric else
11313ca95b02SDimitry Andric VecTy = VectorType::get(LoadTy, Chain.size());
11323ca95b02SDimitry Andric
1133d88c1a5aSDimitry Andric // If it's more than the max vector size or the target has a better
1134d88c1a5aSDimitry Andric // vector factor, break it into two pieces.
1135d88c1a5aSDimitry Andric unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
1136d88c1a5aSDimitry Andric if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
11374ba319b5SDimitry Andric LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
11383ca95b02SDimitry Andric " Creating two separate arrays.\n");
1139d88c1a5aSDimitry Andric return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
1140d88c1a5aSDimitry Andric vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
11413ca95b02SDimitry Andric }
11423ca95b02SDimitry Andric
11433ca95b02SDimitry Andric // We won't try again to vectorize the elements of the chain, regardless of
11443ca95b02SDimitry Andric // whether we succeed below.
11453ca95b02SDimitry Andric InstructionsProcessed->insert(Chain.begin(), Chain.end());
11463ca95b02SDimitry Andric
11473ca95b02SDimitry Andric // If the load is going to be misaligned, don't vectorize it.
11483ca95b02SDimitry Andric if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1149*b5893f02SDimitry Andric if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1150*b5893f02SDimitry Andric auto Chains = splitOddVectorElts(Chain, Sz);
1151*b5893f02SDimitry Andric return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1152*b5893f02SDimitry Andric vectorizeLoadChain(Chains.second, InstructionsProcessed);
1153*b5893f02SDimitry Andric }
11543ca95b02SDimitry Andric
1155d88c1a5aSDimitry Andric unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
1156d88c1a5aSDimitry Andric StackAdjustedAlignment,
1157d88c1a5aSDimitry Andric DL, L0, nullptr, &DT);
1158*b5893f02SDimitry Andric if (NewAlign != 0)
1159*b5893f02SDimitry Andric Alignment = NewAlign;
1160d88c1a5aSDimitry Andric
1161d88c1a5aSDimitry Andric Alignment = NewAlign;
11623ca95b02SDimitry Andric }
11633ca95b02SDimitry Andric
1164*b5893f02SDimitry Andric if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1165*b5893f02SDimitry Andric auto Chains = splitOddVectorElts(Chain, Sz);
1166*b5893f02SDimitry Andric return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
1167*b5893f02SDimitry Andric vectorizeLoadChain(Chains.second, InstructionsProcessed);
1168*b5893f02SDimitry Andric }
1169*b5893f02SDimitry Andric
11704ba319b5SDimitry Andric LLVM_DEBUG({
11713ca95b02SDimitry Andric dbgs() << "LSV: Loads to vectorize:\n";
1172d88c1a5aSDimitry Andric for (Instruction *I : Chain)
1173d88c1a5aSDimitry Andric I->dump();
11743ca95b02SDimitry Andric });
11753ca95b02SDimitry Andric
1176d88c1a5aSDimitry Andric // getVectorizablePrefix already computed getBoundaryInstrs. The value of
1177d88c1a5aSDimitry Andric // Last may have changed since then, but the value of First won't have. If it
1178d88c1a5aSDimitry Andric // matters, we could compute getBoundaryInstrs only once and reuse it here.
1179d88c1a5aSDimitry Andric BasicBlock::iterator First, Last;
1180d88c1a5aSDimitry Andric std::tie(First, Last) = getBoundaryInstrs(Chain);
11813ca95b02SDimitry Andric Builder.SetInsertPoint(&*First);
11823ca95b02SDimitry Andric
11833ca95b02SDimitry Andric Value *Bitcast =
11843ca95b02SDimitry Andric Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1185*b5893f02SDimitry Andric LoadInst *LI = Builder.CreateAlignedLoad(Bitcast, Alignment);
11863ca95b02SDimitry Andric propagateMetadata(LI, Chain);
11873ca95b02SDimitry Andric
11883ca95b02SDimitry Andric if (VecLoadTy) {
11893ca95b02SDimitry Andric SmallVector<Instruction *, 16> InstrsToErase;
11903ca95b02SDimitry Andric
11913ca95b02SDimitry Andric unsigned VecWidth = VecLoadTy->getNumElements();
11923ca95b02SDimitry Andric for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
11933ca95b02SDimitry Andric for (auto Use : Chain[I]->users()) {
1194d88c1a5aSDimitry Andric // All users of vector loads are ExtractElement instructions with
1195d88c1a5aSDimitry Andric // constant indices, otherwise we would have bailed before now.
11963ca95b02SDimitry Andric Instruction *UI = cast<Instruction>(Use);
11973ca95b02SDimitry Andric unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
11983ca95b02SDimitry Andric unsigned NewIdx = Idx + I * VecWidth;
1199d88c1a5aSDimitry Andric Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
1200d88c1a5aSDimitry Andric UI->getName());
1201d88c1a5aSDimitry Andric if (V->getType() != UI->getType())
1202d88c1a5aSDimitry Andric V = Builder.CreateBitCast(V, UI->getType());
12033ca95b02SDimitry Andric
12043ca95b02SDimitry Andric // Replace the old instruction.
1205d88c1a5aSDimitry Andric UI->replaceAllUsesWith(V);
12063ca95b02SDimitry Andric InstrsToErase.push_back(UI);
12073ca95b02SDimitry Andric }
12083ca95b02SDimitry Andric }
12093ca95b02SDimitry Andric
1210d88c1a5aSDimitry Andric // Bitcast might not be an Instruction, if the value being loaded is a
1211d88c1a5aSDimitry Andric // constant. In that case, no need to reorder anything.
1212d88c1a5aSDimitry Andric if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1213d88c1a5aSDimitry Andric reorder(BitcastInst);
12143ca95b02SDimitry Andric
12153ca95b02SDimitry Andric for (auto I : InstrsToErase)
12163ca95b02SDimitry Andric I->eraseFromParent();
12173ca95b02SDimitry Andric } else {
12183ca95b02SDimitry Andric for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
1219d88c1a5aSDimitry Andric Value *CV = Chain[I];
1220d88c1a5aSDimitry Andric Value *V =
1221d88c1a5aSDimitry Andric Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
1222d88c1a5aSDimitry Andric if (V->getType() != CV->getType()) {
1223d88c1a5aSDimitry Andric V = Builder.CreateBitOrPointerCast(V, CV->getType());
12243ca95b02SDimitry Andric }
12253ca95b02SDimitry Andric
12263ca95b02SDimitry Andric // Replace the old instruction.
1227d88c1a5aSDimitry Andric CV->replaceAllUsesWith(V);
12283ca95b02SDimitry Andric }
12293ca95b02SDimitry Andric
1230d88c1a5aSDimitry Andric if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
1231d88c1a5aSDimitry Andric reorder(BitcastInst);
12323ca95b02SDimitry Andric }
12333ca95b02SDimitry Andric
12343ca95b02SDimitry Andric eraseInstructions(Chain);
12353ca95b02SDimitry Andric
12363ca95b02SDimitry Andric ++NumVectorInstructions;
12373ca95b02SDimitry Andric NumScalarsVectorized += Chain.size();
12383ca95b02SDimitry Andric return true;
12393ca95b02SDimitry Andric }
12403ca95b02SDimitry Andric
accessIsMisaligned(unsigned SzInBytes,unsigned AddressSpace,unsigned Alignment)12413ca95b02SDimitry Andric bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
12423ca95b02SDimitry Andric unsigned Alignment) {
1243d88c1a5aSDimitry Andric if (Alignment % SzInBytes == 0)
1244d88c1a5aSDimitry Andric return false;
1245d88c1a5aSDimitry Andric
12463ca95b02SDimitry Andric bool Fast = false;
1247d88c1a5aSDimitry Andric bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
1248d88c1a5aSDimitry Andric SzInBytes * 8, AddressSpace,
12493ca95b02SDimitry Andric Alignment, &Fast);
12504ba319b5SDimitry Andric LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
1251d88c1a5aSDimitry Andric << " and fast? " << Fast << "\n";);
1252d88c1a5aSDimitry Andric return !Allows || !Fast;
12533ca95b02SDimitry Andric }
1254