16cadde7fSEugene Zelenko //===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
208debb02SMatt Arsenault //
32946cd70SChandler Carruth // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
42946cd70SChandler Carruth // See https://llvm.org/LICENSE.txt for license information.
52946cd70SChandler Carruth // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
608debb02SMatt Arsenault //
708debb02SMatt Arsenault //===----------------------------------------------------------------------===//
89d3afd3cSJustin Lebar //
99d3afd3cSJustin Lebar // This pass merges loads/stores to/from sequential memory addresses into vector
109d3afd3cSJustin Lebar // loads/stores. Although there's nothing GPU-specific in here, this pass is
119d3afd3cSJustin Lebar // motivated by the microarchitectural quirks of nVidia and AMD GPUs.
129d3afd3cSJustin Lebar //
139d3afd3cSJustin Lebar // (For simplicity below we talk about loads only, but everything also applies
149d3afd3cSJustin Lebar // to stores.)
159d3afd3cSJustin Lebar //
169d3afd3cSJustin Lebar // This pass is intended to be run late in the pipeline, after other
179d3afd3cSJustin Lebar // vectorization opportunities have been exploited. So the assumption here is
189d3afd3cSJustin Lebar // that immediately following our new vector load we'll need to extract out the
199d3afd3cSJustin Lebar // individual elements of the load, so we can operate on them individually.
209d3afd3cSJustin Lebar //
219d3afd3cSJustin Lebar // On CPUs this transformation is usually not beneficial, because extracting the
229d3afd3cSJustin Lebar // elements of a vector register is expensive on most architectures. It's
239d3afd3cSJustin Lebar // usually better just to load each element individually into its own scalar
249d3afd3cSJustin Lebar // register.
259d3afd3cSJustin Lebar //
269d3afd3cSJustin Lebar // However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
279d3afd3cSJustin Lebar // "vector load" loads directly into a series of scalar registers. In effect,
289d3afd3cSJustin Lebar // extracting the elements of the vector is free. It's therefore always
299d3afd3cSJustin Lebar // beneficial to vectorize a sequence of loads on these architectures.
309d3afd3cSJustin Lebar //
319d3afd3cSJustin Lebar // Vectorizing (perhaps a better name might be "coalescing") loads can have
329d3afd3cSJustin Lebar // large performance impacts on GPU kernels, and opportunities for vectorizing
339d3afd3cSJustin Lebar // are common in GPU code. This pass tries very hard to find such
349d3afd3cSJustin Lebar // opportunities; its runtime is quadratic in the number of loads in a BB.
359d3afd3cSJustin Lebar //
369d3afd3cSJustin Lebar // Some CPU architectures, such as ARM, have instructions that load into
379d3afd3cSJustin Lebar // multiple scalar registers, similar to a GPU vectorized load. In theory ARM
389d3afd3cSJustin Lebar // could use this pass (with some modifications), but currently it implements
399d3afd3cSJustin Lebar // its own pass to do something similar to what we do here.
4008debb02SMatt Arsenault
4105da2fe5SReid Kleckner #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
426cadde7fSEugene Zelenko #include "llvm/ADT/APInt.h"
436cadde7fSEugene Zelenko #include "llvm/ADT/ArrayRef.h"
4408debb02SMatt Arsenault #include "llvm/ADT/MapVector.h"
4508debb02SMatt Arsenault #include "llvm/ADT/PostOrderIterator.h"
466cadde7fSEugene Zelenko #include "llvm/ADT/STLExtras.h"
476cadde7fSEugene Zelenko #include "llvm/ADT/SmallPtrSet.h"
486cadde7fSEugene Zelenko #include "llvm/ADT/SmallVector.h"
4908debb02SMatt Arsenault #include "llvm/ADT/Statistic.h"
506cadde7fSEugene Zelenko #include "llvm/ADT/iterator_range.h"
5108debb02SMatt Arsenault #include "llvm/Analysis/AliasAnalysis.h"
5295427210SJustin Bogner #include "llvm/Analysis/AssumptionCache.h"
536cadde7fSEugene Zelenko #include "llvm/Analysis/MemoryLocation.h"
5408debb02SMatt Arsenault #include "llvm/Analysis/ScalarEvolution.h"
5508debb02SMatt Arsenault #include "llvm/Analysis/TargetTransformInfo.h"
5608debb02SMatt Arsenault #include "llvm/Analysis/ValueTracking.h"
5708debb02SMatt Arsenault #include "llvm/Analysis/VectorUtils.h"
586cadde7fSEugene Zelenko #include "llvm/IR/Attributes.h"
596cadde7fSEugene Zelenko #include "llvm/IR/BasicBlock.h"
606cadde7fSEugene Zelenko #include "llvm/IR/Constants.h"
6108debb02SMatt Arsenault #include "llvm/IR/DataLayout.h"
626cadde7fSEugene Zelenko #include "llvm/IR/DerivedTypes.h"
6308debb02SMatt Arsenault #include "llvm/IR/Dominators.h"
646cadde7fSEugene Zelenko #include "llvm/IR/Function.h"
65a494ae43Sserge-sans-paille #include "llvm/IR/GetElementPtrTypeIterator.h"
6608debb02SMatt Arsenault #include "llvm/IR/IRBuilder.h"
676cadde7fSEugene Zelenko #include "llvm/IR/InstrTypes.h"
686cadde7fSEugene Zelenko #include "llvm/IR/Instruction.h"
6908debb02SMatt Arsenault #include "llvm/IR/Instructions.h"
7008debb02SMatt Arsenault #include "llvm/IR/Module.h"
7108debb02SMatt Arsenault #include "llvm/IR/Type.h"
7208debb02SMatt Arsenault #include "llvm/IR/Value.h"
7305da2fe5SReid Kleckner #include "llvm/InitializePasses.h"
746cadde7fSEugene Zelenko #include "llvm/Pass.h"
756cadde7fSEugene Zelenko #include "llvm/Support/Casting.h"
7608debb02SMatt Arsenault #include "llvm/Support/Debug.h"
77b45eabcfSCraig Topper #include "llvm/Support/KnownBits.h"
786cadde7fSEugene Zelenko #include "llvm/Support/MathExtras.h"
7908debb02SMatt Arsenault #include "llvm/Support/raw_ostream.h"
8005da2fe5SReid Kleckner #include "llvm/Transforms/Utils/Local.h"
81598f8aadSAlina Sbirlea #include "llvm/Transforms/Vectorize.h"
826cadde7fSEugene Zelenko #include <algorithm>
836cadde7fSEugene Zelenko #include <cassert>
846cadde7fSEugene Zelenko #include <cstdlib>
856cadde7fSEugene Zelenko #include <tuple>
866cadde7fSEugene Zelenko #include <utility>
8708debb02SMatt Arsenault
8808debb02SMatt Arsenault using namespace llvm;
8908debb02SMatt Arsenault
9008debb02SMatt Arsenault #define DEBUG_TYPE "load-store-vectorizer"
916cadde7fSEugene Zelenko
// Pass-wide statistics, printed when -stats is enabled.
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;
976cadde7fSEugene Zelenko
986cadde7fSEugene Zelenko namespace {
996cadde7fSEugene Zelenko
/// ChainID is an arbitrary token that is allowed to be different only for the
/// accesses that are guaranteed to be considered non-consecutive by
/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
/// together and reducing the number of instructions the main search operates on
/// at a time, i.e. this is to reduce compile time and nothing else as the main
/// search has O(n^2) time complexity. The underlying type of ChainID should not
/// be relied upon.
using ChainID = const Value *;
/// A list of candidate load or store instructions sharing one ChainID.
using InstrList = SmallVector<Instruction *, 8>;
/// Map from ChainID to the candidate instructions filed under it.
using InstrListMap = MapVector<ChainID, InstrList>;
11008debb02SMatt Arsenault
/// Implements the load/store vectorization for one function. Instantiated
/// once per function by either pass-manager entry point, then driven via
/// run().
class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  /// Runs the vectorizer over every basic block of F; returns true if any
  /// IR was changed.
  bool run();

private:
  /// Returns the address space of the pointer operand if \p I is a load or
  /// store, otherwise -1 (wrapped to unsigned).
  unsigned getPointerAddressSpace(Value *I);

  /// Recursion limit for areConsecutivePointers and the lookThrough*
  /// helpers below.
  static const unsigned MaxDepth = 3;

  /// Returns true if load/store \p B accesses the memory immediately
  /// following the memory accessed by load/store \p A.
  bool isConsecutiveAccess(Value *A, Value *B);
  /// Returns true if \p PtrB points exactly \p PtrDelta bytes past \p PtrA.
  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
                              unsigned Depth = 0) const;
  /// Fallback for areConsecutivePointers: peer through matching GEP chains
  /// (and, failing that, selects) that SCEV could not see through.
  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
                                   unsigned Depth) const;
  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
                          unsigned Depth) const;

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Checks whether a load/store of \p SzInBytes bytes in \p AddressSpace
  /// with the given \p Alignment counts as misaligned for the target.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          Align Alignment);
};
19208debb02SMatt Arsenault
/// Legacy (old pass manager) wrapper that drives the Vectorizer.
class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid.

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  /// Declares the analyses the vectorizer consumes; only the CFG is
  /// guaranteed to be preserved.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};
2166cadde7fSEugene Zelenko
2176cadde7fSEugene Zelenko } // end anonymous namespace
2186cadde7fSEugene Zelenko
2194dc4ebd6SMarkus Lavin char LoadStoreVectorizerLegacyPass::ID = 0;
22008debb02SMatt Arsenault
2214dc4ebd6SMarkus Lavin INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
2223add3a40SMatt Arsenault "Vectorize load and Store instructions", false, false)
22308debb02SMatt Arsenault INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
22495427210SJustin Bogner INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
22508debb02SMatt Arsenault INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)22608debb02SMatt Arsenault INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
22708debb02SMatt Arsenault INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
228370e8226SMatt Arsenault INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
2294dc4ebd6SMarkus Lavin INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
2303add3a40SMatt Arsenault "Vectorize load and store instructions", false, false)
23108debb02SMatt Arsenault
232370e8226SMatt Arsenault Pass *llvm::createLoadStoreVectorizerPass() {
2334dc4ebd6SMarkus Lavin return new LoadStoreVectorizerLegacyPass();
23408debb02SMatt Arsenault }
23508debb02SMatt Arsenault
runOnFunction(Function & F)2364dc4ebd6SMarkus Lavin bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
237079d0f19SMatt Arsenault // Don't vectorize when the attribute NoImplicitFloat is used.
238079d0f19SMatt Arsenault if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
239079d0f19SMatt Arsenault return false;
240079d0f19SMatt Arsenault
24108debb02SMatt Arsenault AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
24208debb02SMatt Arsenault DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
24308debb02SMatt Arsenault ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
244598f8aadSAlina Sbirlea TargetTransformInfo &TTI =
245598f8aadSAlina Sbirlea getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
24608debb02SMatt Arsenault
24795427210SJustin Bogner AssumptionCache &AC =
24895427210SJustin Bogner getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
24995427210SJustin Bogner
25095427210SJustin Bogner Vectorizer V(F, AA, AC, DT, SE, TTI);
25108debb02SMatt Arsenault return V.run();
25208debb02SMatt Arsenault }
25308debb02SMatt Arsenault
run(Function & F,FunctionAnalysisManager & AM)2544dc4ebd6SMarkus Lavin PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
2554dc4ebd6SMarkus Lavin // Don't vectorize when the attribute NoImplicitFloat is used.
2564dc4ebd6SMarkus Lavin if (F.hasFnAttribute(Attribute::NoImplicitFloat))
2574dc4ebd6SMarkus Lavin return PreservedAnalyses::all();
2584dc4ebd6SMarkus Lavin
2594dc4ebd6SMarkus Lavin AliasAnalysis &AA = AM.getResult<AAManager>(F);
2604dc4ebd6SMarkus Lavin DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
2614dc4ebd6SMarkus Lavin ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
2624dc4ebd6SMarkus Lavin TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
26395427210SJustin Bogner AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
2644dc4ebd6SMarkus Lavin
26595427210SJustin Bogner Vectorizer V(F, AA, AC, DT, SE, TTI);
2664dc4ebd6SMarkus Lavin bool Changed = V.run();
2674dc4ebd6SMarkus Lavin PreservedAnalyses PA;
2684dc4ebd6SMarkus Lavin PA.preserveSet<CFGAnalyses>();
2694dc4ebd6SMarkus Lavin return Changed ? PA : PreservedAnalyses::all();
2704dc4ebd6SMarkus Lavin }
2714dc4ebd6SMarkus Lavin
2724dc4ebd6SMarkus Lavin // The real propagateMetadata expects a SmallVector<Value*>, but we deal in
2734dc4ebd6SMarkus Lavin // vectors of Instructions.
propagateMetadata(Instruction * I,ArrayRef<Instruction * > IL)2744dc4ebd6SMarkus Lavin static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
2754dc4ebd6SMarkus Lavin SmallVector<Value *, 8> VL(IL.begin(), IL.end());
2764dc4ebd6SMarkus Lavin propagateMetadata(I, VL);
2774dc4ebd6SMarkus Lavin }
2784dc4ebd6SMarkus Lavin
27908debb02SMatt Arsenault // Vectorizer Implementation
run()28008debb02SMatt Arsenault bool Vectorizer::run() {
28108debb02SMatt Arsenault bool Changed = false;
28208debb02SMatt Arsenault
28308debb02SMatt Arsenault // Scan the blocks in the function in post order.
28408debb02SMatt Arsenault for (BasicBlock *BB : post_order(&F)) {
28537f4e0e0SJustin Lebar InstrListMap LoadRefs, StoreRefs;
2864ee8a2d0SJustin Lebar std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
28708debb02SMatt Arsenault Changed |= vectorizeChains(LoadRefs);
28808debb02SMatt Arsenault Changed |= vectorizeChains(StoreRefs);
28908debb02SMatt Arsenault }
29008debb02SMatt Arsenault
29108debb02SMatt Arsenault return Changed;
29208debb02SMatt Arsenault }
29308debb02SMatt Arsenault
getPointerAddressSpace(Value * I)29408debb02SMatt Arsenault unsigned Vectorizer::getPointerAddressSpace(Value *I) {
29508debb02SMatt Arsenault if (LoadInst *L = dyn_cast<LoadInst>(I))
29608debb02SMatt Arsenault return L->getPointerAddressSpace();
29708debb02SMatt Arsenault if (StoreInst *S = dyn_cast<StoreInst>(I))
29808debb02SMatt Arsenault return S->getPointerAddressSpace();
29908debb02SMatt Arsenault return -1;
30008debb02SMatt Arsenault }
30108debb02SMatt Arsenault
// FIXME: Merge with llvm::isConsecutiveAccess
/// Returns true if the memory accessed by load/store \p B starts exactly
/// where the memory accessed by load/store \p A ends.
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getLoadStorePointerOperand(A);
  Value *PtrB = getLoadStorePointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  // Mixing scalar and vector accesses, or accesses of different store sizes,
  // is rejected up front.
  Type *PtrATy = getLoadStoreType(A);
  Type *PtrBTy = getLoadStoreType(B);
  if (PtrA == PtrB ||
      PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  // B is consecutive to A iff PtrB == PtrA + store-size-of(A's type).
  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  return areConsecutivePointers(PtrA, PtrB, Size);
}
32831d52847SRoman Tereshin
areConsecutivePointers(Value * PtrA,Value * PtrB,APInt PtrDelta,unsigned Depth) const3294f10a9d3SRoman Tereshin bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
3306fe00a21SStanislav Mekhanoshin APInt PtrDelta, unsigned Depth) const {
33131d52847SRoman Tereshin unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
33231d52847SRoman Tereshin APInt OffsetA(PtrBitWidth, 0);
33331d52847SRoman Tereshin APInt OffsetB(PtrBitWidth, 0);
33408debb02SMatt Arsenault PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
33508debb02SMatt Arsenault PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
33608debb02SMatt Arsenault
3376fe00a21SStanislav Mekhanoshin unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
3386fe00a21SStanislav Mekhanoshin
3396fe00a21SStanislav Mekhanoshin if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
340ba1e845cSStanislav Mekhanoshin return false;
341ba1e845cSStanislav Mekhanoshin
3426fe00a21SStanislav Mekhanoshin // In case if we have to shrink the pointer
3436fe00a21SStanislav Mekhanoshin // stripAndAccumulateInBoundsConstantOffsets should properly handle a
3446fe00a21SStanislav Mekhanoshin // possible overflow and the value should fit into a smallest data type
3456fe00a21SStanislav Mekhanoshin // used in the cast/gep chain.
3466fe00a21SStanislav Mekhanoshin assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
3476fe00a21SStanislav Mekhanoshin OffsetB.getMinSignedBits() <= NewPtrBitWidth);
3486fe00a21SStanislav Mekhanoshin
3496fe00a21SStanislav Mekhanoshin OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
3506fe00a21SStanislav Mekhanoshin OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
3516fe00a21SStanislav Mekhanoshin PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
3526fe00a21SStanislav Mekhanoshin
35308debb02SMatt Arsenault APInt OffsetDelta = OffsetB - OffsetA;
35408debb02SMatt Arsenault
35508debb02SMatt Arsenault // Check if they are based on the same pointer. That makes the offsets
35608debb02SMatt Arsenault // sufficient.
35708debb02SMatt Arsenault if (PtrA == PtrB)
3584f10a9d3SRoman Tereshin return OffsetDelta == PtrDelta;
35908debb02SMatt Arsenault
36008debb02SMatt Arsenault // Compute the necessary base pointer delta to have the necessary final delta
3614f10a9d3SRoman Tereshin // equal to the pointer delta requested.
3624f10a9d3SRoman Tereshin APInt BaseDelta = PtrDelta - OffsetDelta;
36308debb02SMatt Arsenault
36408debb02SMatt Arsenault // Compute the distance with SCEV between the base pointers.
36508debb02SMatt Arsenault const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
36608debb02SMatt Arsenault const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
36708debb02SMatt Arsenault const SCEV *C = SE.getConstant(BaseDelta);
36808debb02SMatt Arsenault const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
36908debb02SMatt Arsenault if (X == PtrSCEVB)
37008debb02SMatt Arsenault return true;
37108debb02SMatt Arsenault
3728c7a30baSFarhana Aleen // The above check will not catch the cases where one of the pointers is
3738c7a30baSFarhana Aleen // factorized but the other one is not, such as (C + (S * (A + B))) vs
3748c7a30baSFarhana Aleen // (AS + BS). Get the minus scev. That will allow re-combining the expresions
3758c7a30baSFarhana Aleen // and getting the simplified difference.
3768c7a30baSFarhana Aleen const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
3778c7a30baSFarhana Aleen if (C == Dist)
3788c7a30baSFarhana Aleen return true;
3798c7a30baSFarhana Aleen
38008debb02SMatt Arsenault // Sometimes even this doesn't work, because SCEV can't always see through
38108debb02SMatt Arsenault // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
38208debb02SMatt Arsenault // things the hard way.
3834f10a9d3SRoman Tereshin return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
38431d52847SRoman Tereshin }
38531d52847SRoman Tereshin
checkNoWrapFlags(Instruction * I,bool Signed)38611996586SSlava Nikolaev static bool checkNoWrapFlags(Instruction *I, bool Signed) {
38711996586SSlava Nikolaev BinaryOperator *BinOpI = cast<BinaryOperator>(I);
38811996586SSlava Nikolaev return (Signed && BinOpI->hasNoSignedWrap()) ||
38911996586SSlava Nikolaev (!Signed && BinOpI->hasNoUnsignedWrap());
39011996586SSlava Nikolaev }
39111996586SSlava Nikolaev
checkIfSafeAddSequence(const APInt & IdxDiff,Instruction * AddOpA,unsigned MatchingOpIdxA,Instruction * AddOpB,unsigned MatchingOpIdxB,bool Signed)39211996586SSlava Nikolaev static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
39311996586SSlava Nikolaev unsigned MatchingOpIdxA, Instruction *AddOpB,
39411996586SSlava Nikolaev unsigned MatchingOpIdxB, bool Signed) {
39511996586SSlava Nikolaev // If both OpA and OpB is an add with NSW/NUW and with
39611996586SSlava Nikolaev // one of the operands being the same, we can guarantee that the
39711996586SSlava Nikolaev // transformation is safe if we can prove that OpA won't overflow when
39811996586SSlava Nikolaev // IdxDiff added to the other operand of OpA.
39911996586SSlava Nikolaev // For example:
40011996586SSlava Nikolaev // %tmp7 = add nsw i32 %tmp2, %v0
40111996586SSlava Nikolaev // %tmp8 = sext i32 %tmp7 to i64
40211996586SSlava Nikolaev // ...
40311996586SSlava Nikolaev // %tmp11 = add nsw i32 %v0, 1
40411996586SSlava Nikolaev // %tmp12 = add nsw i32 %tmp2, %tmp11
40511996586SSlava Nikolaev // %tmp13 = sext i32 %tmp12 to i64
40611996586SSlava Nikolaev //
40711996586SSlava Nikolaev // Both %tmp7 and %tmp2 has the nsw flag and the first operand
40811996586SSlava Nikolaev // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
40911996586SSlava Nikolaev // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 has the
41011996586SSlava Nikolaev // nsw flag.
41111996586SSlava Nikolaev assert(AddOpA->getOpcode() == Instruction::Add &&
41211996586SSlava Nikolaev AddOpB->getOpcode() == Instruction::Add &&
41311996586SSlava Nikolaev checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
41411996586SSlava Nikolaev if (AddOpA->getOperand(MatchingOpIdxA) ==
41511996586SSlava Nikolaev AddOpB->getOperand(MatchingOpIdxB)) {
41611996586SSlava Nikolaev Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
41711996586SSlava Nikolaev Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
41811996586SSlava Nikolaev Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
41911996586SSlava Nikolaev Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
42011996586SSlava Nikolaev // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
42111996586SSlava Nikolaev if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
42211996586SSlava Nikolaev checkNoWrapFlags(OtherInstrB, Signed) &&
42311996586SSlava Nikolaev isa<ConstantInt>(OtherInstrB->getOperand(1))) {
42411996586SSlava Nikolaev int64_t CstVal =
42511996586SSlava Nikolaev cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
42611996586SSlava Nikolaev if (OtherInstrB->getOperand(0) == OtherOperandA &&
42711996586SSlava Nikolaev IdxDiff.getSExtValue() == CstVal)
42811996586SSlava Nikolaev return true;
42911996586SSlava Nikolaev }
43011996586SSlava Nikolaev // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
43111996586SSlava Nikolaev if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
43211996586SSlava Nikolaev checkNoWrapFlags(OtherInstrA, Signed) &&
43311996586SSlava Nikolaev isa<ConstantInt>(OtherInstrA->getOperand(1))) {
43411996586SSlava Nikolaev int64_t CstVal =
43511996586SSlava Nikolaev cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
43611996586SSlava Nikolaev if (OtherInstrA->getOperand(0) == OtherOperandB &&
43711996586SSlava Nikolaev IdxDiff.getSExtValue() == -CstVal)
43811996586SSlava Nikolaev return true;
43911996586SSlava Nikolaev }
44011996586SSlava Nikolaev // Match `x +nsw/nuw (y +nsw/nuw c)` and
44111996586SSlava Nikolaev // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
44211996586SSlava Nikolaev if (OtherInstrA && OtherInstrB &&
44311996586SSlava Nikolaev OtherInstrA->getOpcode() == Instruction::Add &&
44411996586SSlava Nikolaev OtherInstrB->getOpcode() == Instruction::Add &&
44511996586SSlava Nikolaev checkNoWrapFlags(OtherInstrA, Signed) &&
44611996586SSlava Nikolaev checkNoWrapFlags(OtherInstrB, Signed) &&
44711996586SSlava Nikolaev isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
44811996586SSlava Nikolaev isa<ConstantInt>(OtherInstrB->getOperand(1))) {
44911996586SSlava Nikolaev int64_t CstValA =
45011996586SSlava Nikolaev cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
45111996586SSlava Nikolaev int64_t CstValB =
45211996586SSlava Nikolaev cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
45311996586SSlava Nikolaev if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
45411996586SSlava Nikolaev IdxDiff.getSExtValue() == (CstValB - CstValA))
45511996586SSlava Nikolaev return true;
45611996586SSlava Nikolaev }
45711996586SSlava Nikolaev }
45811996586SSlava Nikolaev return false;
45911996586SSlava Nikolaev }
46011996586SSlava Nikolaev
// Try to prove that PtrB is exactly PtrDelta bytes past PtrA by matching two
// GEPs that are identical except for their last index, where those last
// indices are (possibly sign/zero-extended) values that provably differ by
// PtrDelta divided by the element size.  Non-GEP pointers are handed off to
// lookThroughSelects instead.
bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
                                             APInt PtrDelta,
                                             unsigned Depth) const {
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return false;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return false;
    ++GTIA;
    ++GTIB;
  }

  // The last indices must be instructions of the same opcode and type; we
  // expect a matching pair of sext/zext instructions of a common index value.
  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  // Normalize to a non-negative delta so that below we only have to prove
  // "B's index == A's index + IdxDiff".  The most negative value cannot be
  // negated, so bail out on it.
  if (PtrDelta.isNegative()) {
    if (PtrDelta.isMinSignedValue())
      return false;
    PtrDelta.negate();
    std::swap(OpA, OpB);
  }
  // The byte delta must be a whole number of elements of the last-indexed
  // type; convert it to an index-space difference.
  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
  if (PtrDelta.urem(Stride) != 0)
    return false;
  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
  APInt IdxDiff = PtrDelta.udiv(Stride).zext(IdxBitWidth);

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is
    // an operand which is the same in those two instructions.
    // Below we account for possible orders of the operands of
    // these add instructions.
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  // If all set bits of IdxDiff or any higher order bit other than the sign bit
  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
  // overflow of any sort.
  if (!Safe) {
    KnownBits Known(BitWidth);
    computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff))
      return false;
  }

  // Finally, ask SCEV whether OpB's index is exactly ValA + IdxDiff.
  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
  return X == OffsetSCEVB;
}
56308debb02SMatt Arsenault
lookThroughSelects(Value * PtrA,Value * PtrB,const APInt & PtrDelta,unsigned Depth) const564984a424cSFangrui Song bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
565984a424cSFangrui Song const APInt &PtrDelta,
5664f10a9d3SRoman Tereshin unsigned Depth) const {
5674f10a9d3SRoman Tereshin if (Depth++ == MaxDepth)
5684f10a9d3SRoman Tereshin return false;
5694f10a9d3SRoman Tereshin
5704f10a9d3SRoman Tereshin if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
5714f10a9d3SRoman Tereshin if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
5724f10a9d3SRoman Tereshin return SelectA->getCondition() == SelectB->getCondition() &&
5734f10a9d3SRoman Tereshin areConsecutivePointers(SelectA->getTrueValue(),
5744f10a9d3SRoman Tereshin SelectB->getTrueValue(), PtrDelta, Depth) &&
5754f10a9d3SRoman Tereshin areConsecutivePointers(SelectA->getFalseValue(),
5764f10a9d3SRoman Tereshin SelectB->getFalseValue(), PtrDelta, Depth);
5774f10a9d3SRoman Tereshin }
5784f10a9d3SRoman Tereshin }
5794f10a9d3SRoman Tereshin return false;
5804f10a9d3SRoman Tereshin }
5814f10a9d3SRoman Tereshin
reorder(Instruction * I)58208debb02SMatt Arsenault void Vectorizer::reorder(Instruction *I) {
583cbc6ac2aSAlina Sbirlea SmallPtrSet<Instruction *, 16> InstructionsToMove;
584cbc6ac2aSAlina Sbirlea SmallVector<Instruction *, 16> Worklist;
585cbc6ac2aSAlina Sbirlea
586cbc6ac2aSAlina Sbirlea Worklist.push_back(I);
587cbc6ac2aSAlina Sbirlea while (!Worklist.empty()) {
588cbc6ac2aSAlina Sbirlea Instruction *IW = Worklist.pop_back_val();
589cbc6ac2aSAlina Sbirlea int NumOperands = IW->getNumOperands();
590cbc6ac2aSAlina Sbirlea for (int i = 0; i < NumOperands; i++) {
591cbc6ac2aSAlina Sbirlea Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
592cbc6ac2aSAlina Sbirlea if (!IM || IM->getOpcode() == Instruction::PHI)
59308debb02SMatt Arsenault continue;
59408debb02SMatt Arsenault
595222ceff2SJustin Lebar // If IM is in another BB, no need to move it, because this pass only
596222ceff2SJustin Lebar // vectorizes instructions within one BB.
597222ceff2SJustin Lebar if (IM->getParent() != I->getParent())
598222ceff2SJustin Lebar continue;
599222ceff2SJustin Lebar
6000c2b09a9SReid Kleckner if (!IM->comesBefore(I)) {
601cbc6ac2aSAlina Sbirlea InstructionsToMove.insert(IM);
602cbc6ac2aSAlina Sbirlea Worklist.push_back(IM);
60308debb02SMatt Arsenault }
60408debb02SMatt Arsenault }
60508debb02SMatt Arsenault }
60608debb02SMatt Arsenault
607cbc6ac2aSAlina Sbirlea // All instructions to move should follow I. Start from I, not from begin().
608cbc6ac2aSAlina Sbirlea for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
609cbc6ac2aSAlina Sbirlea ++BBI) {
610a0053cc0SBenjamin Kramer if (!InstructionsToMove.count(&*BBI))
611cbc6ac2aSAlina Sbirlea continue;
612cbc6ac2aSAlina Sbirlea Instruction *IM = &*BBI;
613cbc6ac2aSAlina Sbirlea --BBI;
614cbc6ac2aSAlina Sbirlea IM->removeFromParent();
615cbc6ac2aSAlina Sbirlea IM->insertBefore(I);
616cbc6ac2aSAlina Sbirlea }
617cbc6ac2aSAlina Sbirlea }
618cbc6ac2aSAlina Sbirlea
61908debb02SMatt Arsenault std::pair<BasicBlock::iterator, BasicBlock::iterator>
getBoundaryInstrs(ArrayRef<Instruction * > Chain)62037f4e0e0SJustin Lebar Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
62137f4e0e0SJustin Lebar Instruction *C0 = Chain[0];
62208debb02SMatt Arsenault BasicBlock::iterator FirstInstr = C0->getIterator();
62308debb02SMatt Arsenault BasicBlock::iterator LastInstr = C0->getIterator();
62408debb02SMatt Arsenault
62508debb02SMatt Arsenault BasicBlock *BB = C0->getParent();
62608debb02SMatt Arsenault unsigned NumFound = 0;
62708debb02SMatt Arsenault for (Instruction &I : *BB) {
62808debb02SMatt Arsenault if (!is_contained(Chain, &I))
62908debb02SMatt Arsenault continue;
63008debb02SMatt Arsenault
63108debb02SMatt Arsenault ++NumFound;
63208debb02SMatt Arsenault if (NumFound == 1) {
63308debb02SMatt Arsenault FirstInstr = I.getIterator();
6348d8aa5ddSAlina Sbirlea }
6358d8aa5ddSAlina Sbirlea if (NumFound == Chain.size()) {
63608debb02SMatt Arsenault LastInstr = I.getIterator();
63708debb02SMatt Arsenault break;
63808debb02SMatt Arsenault }
63908debb02SMatt Arsenault }
64008debb02SMatt Arsenault
6418d8aa5ddSAlina Sbirlea // Range is [first, last).
6428d8aa5ddSAlina Sbirlea return std::make_pair(FirstInstr, ++LastInstr);
64308debb02SMatt Arsenault }
64408debb02SMatt Arsenault
eraseInstructions(ArrayRef<Instruction * > Chain)64537f4e0e0SJustin Lebar void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
64608debb02SMatt Arsenault SmallVector<Instruction *, 16> Instrs;
64737f4e0e0SJustin Lebar for (Instruction *I : Chain) {
648038ede2aSRenato Golin Value *PtrOperand = getLoadStorePointerOperand(I);
64908debb02SMatt Arsenault assert(PtrOperand && "Instruction must have a pointer operand.");
65037f4e0e0SJustin Lebar Instrs.push_back(I);
65108debb02SMatt Arsenault if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
65208debb02SMatt Arsenault Instrs.push_back(GEP);
65308debb02SMatt Arsenault }
65408debb02SMatt Arsenault
65508debb02SMatt Arsenault // Erase instructions.
65637f4e0e0SJustin Lebar for (Instruction *I : Instrs)
65737f4e0e0SJustin Lebar if (I->use_empty())
65837f4e0e0SJustin Lebar I->eraseFromParent();
65908debb02SMatt Arsenault }
66008debb02SMatt Arsenault
66137f4e0e0SJustin Lebar std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
splitOddVectorElts(ArrayRef<Instruction * > Chain,unsigned ElementSizeBits)66237f4e0e0SJustin Lebar Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
66308debb02SMatt Arsenault unsigned ElementSizeBits) {
6641c38681aSVolkan Keles unsigned ElementSizeBytes = ElementSizeBits / 8;
6651c38681aSVolkan Keles unsigned SizeBytes = ElementSizeBytes * Chain.size();
6661c38681aSVolkan Keles unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
667f0a88dbaSMatt Arsenault if (NumLeft == Chain.size()) {
668f0a88dbaSMatt Arsenault if ((NumLeft & 1) == 0)
669f0a88dbaSMatt Arsenault NumLeft /= 2; // Split even in half
670f0a88dbaSMatt Arsenault else
671f0a88dbaSMatt Arsenault --NumLeft; // Split off last element
672f0a88dbaSMatt Arsenault } else if (NumLeft == 0)
6731c38681aSVolkan Keles NumLeft = 1;
67408debb02SMatt Arsenault return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
67508debb02SMatt Arsenault }
67608debb02SMatt Arsenault
// Returns the longest prefix of Chain (which is in address order) whose
// members can be vectorized together: loads may be grouped up to the first
// aliasing store, stores up to any instruction that would clobber or observe
// them, based on alias analysis over the instructions between the chain's
// block-order boundaries.
ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  LLVM_DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  // Partition the instructions between the chain's boundaries into chain
  // members (ChainInstrs) and other memory-touching instructions
  // (MemoryInstrs), stopping early at any instruction that may not transfer
  // execution to its successor.
  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if ((isa<LoadInst>(I) || isa<StoreInst>(I)) && is_contained(Chain, &I)) {
      ChainInstrs.push_back(&I);
      continue;
    }
    if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
      LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: "
                        << I << '\n');
      break;
    }
    if (I.mayReadOrWriteMemory())
      MemoryInstrs.push_back(&I);
  }

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that follow
    // will not be added to the valid prefix.
    if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
        break;

      // Two loads never conflict with each other, regardless of order.
      auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
      auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
      if (MemLoad && ChainLoad)
        continue;

      // We can ignore the alias if the we have a load store pair and the load
      // is known to be invariant. The load cannot be clobbered by the store.
      auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
        return LI->hasMetadata(LLVMContext::MD_invariant_load);
      };

      if (IsLoadChain) {
        // We can ignore the alias as long as the load comes before the store,
        // because that means we won't be moving the load past the store to
        // vectorize it (the vectorized load is inserted at the location of the
        // first load in the chain).
        if (ChainInstr->comesBefore(MemInstr) ||
            (ChainLoad && IsInvariantLoad(ChainLoad)))
          continue;
      } else {
        // Same case, but in reverse.
        if (MemInstr->comesBefore(ChainInstr) ||
            (MemLoad && IsInvariantLoad(MemLoad)))
          continue;
      }

      // Ask alias analysis: a load chain only cares about clobbers (Mod); a
      // store chain also conflicts with readers (Mod or Ref).
      ModRefInfo MR =
          AA.getModRefInfo(MemInstr, MemoryLocation::get(ChainInstr));
      if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
        LLVM_DEBUG({
          dbgs() << "LSV: Found alias:\n"
                    " Aliasing instruction:\n"
                 << " " << *MemInstr << '\n'
                 << " Aliased instruction and pointer:\n"
                 << " " << *ChainInstr << '\n'
                 << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
        });
        // Save this aliasing memory instruction as a barrier, but allow other
        // instructions that precede the barrier to be vectorized with this one.
        BarrierMemoryInstr = MemInstr;
        break;
      }
    }
    // Continue the search only for store chains, since vectorizing stores that
    // precede an aliasing load is valid. Conversely, vectorizing loads is valid
    // up to an aliasing store, but should not pull loads from further down in
    // the basic block.
    if (IsLoadChain && BarrierMemoryInstr) {
      // The BarrierMemoryInstr is a store that precedes ChainInstr.
      assert(BarrierMemoryInstr->comesBefore(ChainInstr));
      break;
    }
  }

  // Find the largest prefix of Chain whose elements are all in
  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
  // order.)
  SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
      ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
  unsigned ChainIdx = 0;
  for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
      break;
  }
  return Chain.slice(0, ChainIdx);
}
79508debb02SMatt Arsenault
getChainID(const Value * Ptr)796b0eb40caSVitaly Buka static ChainID getChainID(const Value *Ptr) {
797b0eb40caSVitaly Buka const Value *ObjPtr = getUnderlyingObject(Ptr);
7984f10a9d3SRoman Tereshin if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
7994f10a9d3SRoman Tereshin // The select's themselves are distinct instructions even if they share the
8004f10a9d3SRoman Tereshin // same condition and evaluate to consecutive pointers for true and false
8014f10a9d3SRoman Tereshin // values of the condition. Therefore using the select's themselves for
8024f10a9d3SRoman Tereshin // grouping instructions would put consecutive accesses into different lists
8034f10a9d3SRoman Tereshin // and they won't be even checked for being consecutive, and won't be
8044f10a9d3SRoman Tereshin // vectorized.
8054f10a9d3SRoman Tereshin return Sel->getCondition();
8064f10a9d3SRoman Tereshin }
8074f10a9d3SRoman Tereshin return ObjPtr;
8084f10a9d3SRoman Tereshin }
8094f10a9d3SRoman Tereshin
81037f4e0e0SJustin Lebar std::pair<InstrListMap, InstrListMap>
collectInstructions(BasicBlock * BB)8114ee8a2d0SJustin Lebar Vectorizer::collectInstructions(BasicBlock *BB) {
81237f4e0e0SJustin Lebar InstrListMap LoadRefs;
81337f4e0e0SJustin Lebar InstrListMap StoreRefs;
81408debb02SMatt Arsenault
81508debb02SMatt Arsenault for (Instruction &I : *BB) {
81608debb02SMatt Arsenault if (!I.mayReadOrWriteMemory())
81708debb02SMatt Arsenault continue;
81808debb02SMatt Arsenault
81908debb02SMatt Arsenault if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
82008debb02SMatt Arsenault if (!LI->isSimple())
82108debb02SMatt Arsenault continue;
82208debb02SMatt Arsenault
8231c38681aSVolkan Keles // Skip if it's not legal.
8241c38681aSVolkan Keles if (!TTI.isLegalToVectorizeLoad(LI))
8251c38681aSVolkan Keles continue;
8261c38681aSVolkan Keles
82708debb02SMatt Arsenault Type *Ty = LI->getType();
82808debb02SMatt Arsenault if (!VectorType::isValidElementType(Ty->getScalarType()))
82908debb02SMatt Arsenault continue;
83008debb02SMatt Arsenault
8318a4ab5e1SMatt Arsenault // Skip weird non-byte sizes. They probably aren't worth the effort of
8328a4ab5e1SMatt Arsenault // handling correctly.
8338a4ab5e1SMatt Arsenault unsigned TySize = DL.getTypeSizeInBits(Ty);
83422a2282dSBjorn Pettersson if ((TySize % 8) != 0)
8358a4ab5e1SMatt Arsenault continue;
8368a4ab5e1SMatt Arsenault
83786db068eSBjorn Pettersson // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
83886db068eSBjorn Pettersson // functions are currently using an integer type for the vectorized
83986db068eSBjorn Pettersson // load/store, and does not support casting between the integer type and a
84086db068eSBjorn Pettersson // vector of pointers (e.g. i64 to <2 x i16*>)
84186db068eSBjorn Pettersson if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
84286db068eSBjorn Pettersson continue;
84386db068eSBjorn Pettersson
844370e8226SMatt Arsenault Value *Ptr = LI->getPointerOperand();
845370e8226SMatt Arsenault unsigned AS = Ptr->getType()->getPointerAddressSpace();
846370e8226SMatt Arsenault unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
847370e8226SMatt Arsenault
84889196642SFarhana Aleen unsigned VF = VecRegSize / TySize;
84989196642SFarhana Aleen VectorType *VecTy = dyn_cast<VectorType>(Ty);
85089196642SFarhana Aleen
85108debb02SMatt Arsenault // No point in looking at these if they're too big to vectorize.
85289196642SFarhana Aleen if (TySize > VecRegSize / 2 ||
85389196642SFarhana Aleen (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
85408debb02SMatt Arsenault continue;
85508debb02SMatt Arsenault
85608debb02SMatt Arsenault // Save the load locations.
857b0eb40caSVitaly Buka const ChainID ID = getChainID(Ptr);
8584f10a9d3SRoman Tereshin LoadRefs[ID].push_back(LI);
85908debb02SMatt Arsenault } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
86008debb02SMatt Arsenault if (!SI->isSimple())
86108debb02SMatt Arsenault continue;
86208debb02SMatt Arsenault
8631c38681aSVolkan Keles // Skip if it's not legal.
8641c38681aSVolkan Keles if (!TTI.isLegalToVectorizeStore(SI))
8651c38681aSVolkan Keles continue;
8661c38681aSVolkan Keles
86708debb02SMatt Arsenault Type *Ty = SI->getValueOperand()->getType();
86808debb02SMatt Arsenault if (!VectorType::isValidElementType(Ty->getScalarType()))
86908debb02SMatt Arsenault continue;
87008debb02SMatt Arsenault
87186db068eSBjorn Pettersson // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
87286db068eSBjorn Pettersson // functions are currently using an integer type for the vectorized
87386db068eSBjorn Pettersson // load/store, and does not support casting between the integer type and a
87486db068eSBjorn Pettersson // vector of pointers (e.g. i64 to <2 x i16*>)
87586db068eSBjorn Pettersson if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
87686db068eSBjorn Pettersson continue;
87786db068eSBjorn Pettersson
8788a4ab5e1SMatt Arsenault // Skip weird non-byte sizes. They probably aren't worth the effort of
8798a4ab5e1SMatt Arsenault // handling correctly.
8808a4ab5e1SMatt Arsenault unsigned TySize = DL.getTypeSizeInBits(Ty);
88122a2282dSBjorn Pettersson if ((TySize % 8) != 0)
8828a4ab5e1SMatt Arsenault continue;
8838a4ab5e1SMatt Arsenault
884370e8226SMatt Arsenault Value *Ptr = SI->getPointerOperand();
885370e8226SMatt Arsenault unsigned AS = Ptr->getType()->getPointerAddressSpace();
886370e8226SMatt Arsenault unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
88722a2282dSBjorn Pettersson
88889196642SFarhana Aleen unsigned VF = VecRegSize / TySize;
88989196642SFarhana Aleen VectorType *VecTy = dyn_cast<VectorType>(Ty);
89089196642SFarhana Aleen
89122a2282dSBjorn Pettersson // No point in looking at these if they're too big to vectorize.
89289196642SFarhana Aleen if (TySize > VecRegSize / 2 ||
89389196642SFarhana Aleen (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
89408debb02SMatt Arsenault continue;
89508debb02SMatt Arsenault
89608debb02SMatt Arsenault // Save store location.
897b0eb40caSVitaly Buka const ChainID ID = getChainID(Ptr);
8984f10a9d3SRoman Tereshin StoreRefs[ID].push_back(SI);
89908debb02SMatt Arsenault }
90008debb02SMatt Arsenault }
9014ee8a2d0SJustin Lebar
9024ee8a2d0SJustin Lebar return {LoadRefs, StoreRefs};
90308debb02SMatt Arsenault }
90408debb02SMatt Arsenault
vectorizeChains(InstrListMap & Map)90537f4e0e0SJustin Lebar bool Vectorizer::vectorizeChains(InstrListMap &Map) {
90608debb02SMatt Arsenault bool Changed = false;
90708debb02SMatt Arsenault
9084f10a9d3SRoman Tereshin for (const std::pair<ChainID, InstrList> &Chain : Map) {
90908debb02SMatt Arsenault unsigned Size = Chain.second.size();
91008debb02SMatt Arsenault if (Size < 2)
91108debb02SMatt Arsenault continue;
91208debb02SMatt Arsenault
913d34e60caSNicola Zaghen LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
91408debb02SMatt Arsenault
91508debb02SMatt Arsenault // Process the stores in chunks of 64.
91608debb02SMatt Arsenault for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
91708debb02SMatt Arsenault unsigned Len = std::min<unsigned>(CE - CI, 64);
91837f4e0e0SJustin Lebar ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
91908debb02SMatt Arsenault Changed |= vectorizeInstructions(Chunk);
92008debb02SMatt Arsenault }
92108debb02SMatt Arsenault }
92208debb02SMatt Arsenault
92308debb02SMatt Arsenault return Changed;
92408debb02SMatt Arsenault }
92508debb02SMatt Arsenault
vectorizeInstructions(ArrayRef<Instruction * > Instrs)92637f4e0e0SJustin Lebar bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
927d34e60caSNicola Zaghen LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
928d34e60caSNicola Zaghen << " instructions.\n");
9293f8f7840SAlina Sbirlea SmallVector<int, 16> Heads, Tails;
93008debb02SMatt Arsenault int ConsecutiveChain[64];
93108debb02SMatt Arsenault
93286db068eSBjorn Pettersson // Do a quadratic search on all of the given loads/stores and find all of the
93386db068eSBjorn Pettersson // pairs of loads/stores that follow each other.
93408debb02SMatt Arsenault for (int i = 0, e = Instrs.size(); i < e; ++i) {
93508debb02SMatt Arsenault ConsecutiveChain[i] = -1;
93608debb02SMatt Arsenault for (int j = e - 1; j >= 0; --j) {
93708debb02SMatt Arsenault if (i == j)
93808debb02SMatt Arsenault continue;
93908debb02SMatt Arsenault
94008debb02SMatt Arsenault if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
94108debb02SMatt Arsenault if (ConsecutiveChain[i] != -1) {
94208debb02SMatt Arsenault int CurDistance = std::abs(ConsecutiveChain[i] - i);
94308debb02SMatt Arsenault int NewDistance = std::abs(ConsecutiveChain[i] - j);
94408debb02SMatt Arsenault if (j < i || NewDistance > CurDistance)
94508debb02SMatt Arsenault continue; // Should not insert.
94608debb02SMatt Arsenault }
94708debb02SMatt Arsenault
9483f8f7840SAlina Sbirlea Tails.push_back(j);
9493f8f7840SAlina Sbirlea Heads.push_back(i);
95008debb02SMatt Arsenault ConsecutiveChain[i] = j;
95108debb02SMatt Arsenault }
95208debb02SMatt Arsenault }
95308debb02SMatt Arsenault }
95408debb02SMatt Arsenault
95508debb02SMatt Arsenault bool Changed = false;
95637f4e0e0SJustin Lebar SmallPtrSet<Instruction *, 16> InstructionsProcessed;
95708debb02SMatt Arsenault
95808debb02SMatt Arsenault for (int Head : Heads) {
959640a61cdSAlina Sbirlea if (InstructionsProcessed.count(Instrs[Head]))
960640a61cdSAlina Sbirlea continue;
9613f8f7840SAlina Sbirlea bool LongerChainExists = false;
962640a61cdSAlina Sbirlea for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
963640a61cdSAlina Sbirlea if (Head == Tails[TIt] &&
964640a61cdSAlina Sbirlea !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
9653f8f7840SAlina Sbirlea LongerChainExists = true;
966640a61cdSAlina Sbirlea break;
967640a61cdSAlina Sbirlea }
9683f8f7840SAlina Sbirlea if (LongerChainExists)
96908debb02SMatt Arsenault continue;
97008debb02SMatt Arsenault
97108debb02SMatt Arsenault // We found an instr that starts a chain. Now follow the chain and try to
97208debb02SMatt Arsenault // vectorize it.
97337f4e0e0SJustin Lebar SmallVector<Instruction *, 16> Operands;
97408debb02SMatt Arsenault int I = Head;
9753f8f7840SAlina Sbirlea while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
976640a61cdSAlina Sbirlea if (InstructionsProcessed.count(Instrs[I]))
97708debb02SMatt Arsenault break;
97808debb02SMatt Arsenault
97908debb02SMatt Arsenault Operands.push_back(Instrs[I]);
98008debb02SMatt Arsenault I = ConsecutiveChain[I];
98108debb02SMatt Arsenault }
98208debb02SMatt Arsenault
98308debb02SMatt Arsenault bool Vectorized = false;
98408debb02SMatt Arsenault if (isa<LoadInst>(*Operands.begin()))
985640a61cdSAlina Sbirlea Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
98608debb02SMatt Arsenault else
987640a61cdSAlina Sbirlea Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
98808debb02SMatt Arsenault
98908debb02SMatt Arsenault Changed |= Vectorized;
99008debb02SMatt Arsenault }
99108debb02SMatt Arsenault
99208debb02SMatt Arsenault return Changed;
99308debb02SMatt Arsenault }
99408debb02SMatt Arsenault
vectorizeStoreChain(ArrayRef<Instruction * > Chain,SmallPtrSet<Instruction *,16> * InstructionsProcessed)995640a61cdSAlina Sbirlea bool Vectorizer::vectorizeStoreChain(
99637f4e0e0SJustin Lebar ArrayRef<Instruction *> Chain,
99737f4e0e0SJustin Lebar SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
99808debb02SMatt Arsenault StoreInst *S0 = cast<StoreInst>(Chain[0]);
999d7e8898bSMatt Arsenault
100086db068eSBjorn Pettersson // If the vector has an int element, default to int for the whole store.
100197fbc2abSSimon Pilgrim Type *StoreTy = nullptr;
100237f4e0e0SJustin Lebar for (Instruction *I : Chain) {
100337f4e0e0SJustin Lebar StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
1004d7e8898bSMatt Arsenault if (StoreTy->isIntOrIntVectorTy())
1005d7e8898bSMatt Arsenault break;
100642ad1705SMatt Arsenault
100742ad1705SMatt Arsenault if (StoreTy->isPtrOrPtrVectorTy()) {
100842ad1705SMatt Arsenault StoreTy = Type::getIntNTy(F.getParent()->getContext(),
100942ad1705SMatt Arsenault DL.getTypeSizeInBits(StoreTy));
101042ad1705SMatt Arsenault break;
101142ad1705SMatt Arsenault }
1012d7e8898bSMatt Arsenault }
101397fbc2abSSimon Pilgrim assert(StoreTy && "Failed to find store type");
1014d7e8898bSMatt Arsenault
101508debb02SMatt Arsenault unsigned Sz = DL.getTypeSizeInBits(StoreTy);
1016370e8226SMatt Arsenault unsigned AS = S0->getPointerAddressSpace();
1017370e8226SMatt Arsenault unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
101808debb02SMatt Arsenault unsigned VF = VecRegSize / Sz;
101908debb02SMatt Arsenault unsigned ChainSize = Chain.size();
102052e98f62SNikita Popov Align Alignment = S0->getAlign();
102108debb02SMatt Arsenault
1022640a61cdSAlina Sbirlea if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1023640a61cdSAlina Sbirlea InstructionsProcessed->insert(Chain.begin(), Chain.end());
102408debb02SMatt Arsenault return false;
1025640a61cdSAlina Sbirlea }
1026640a61cdSAlina Sbirlea
102737f4e0e0SJustin Lebar ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
10288778c626SJustin Lebar if (NewChain.empty()) {
102962b03e34SJustin Lebar // No vectorization possible.
1030640a61cdSAlina Sbirlea InstructionsProcessed->insert(Chain.begin(), Chain.end());
1031640a61cdSAlina Sbirlea return false;
1032640a61cdSAlina Sbirlea }
10338778c626SJustin Lebar if (NewChain.size() == 1) {
1034640a61cdSAlina Sbirlea // Failed after the first instruction. Discard it and try the smaller chain.
10358778c626SJustin Lebar InstructionsProcessed->insert(NewChain.front());
1036640a61cdSAlina Sbirlea return false;
1037640a61cdSAlina Sbirlea }
1038640a61cdSAlina Sbirlea
1039640a61cdSAlina Sbirlea // Update Chain to the valid vectorizable subchain.
10408778c626SJustin Lebar Chain = NewChain;
1041640a61cdSAlina Sbirlea ChainSize = Chain.size();
104208debb02SMatt Arsenault
10431c38681aSVolkan Keles // Check if it's legal to vectorize this chain. If not, split the chain and
10441c38681aSVolkan Keles // try again.
1045950a8204SMatt Arsenault unsigned EltSzInBytes = Sz / 8;
1046950a8204SMatt Arsenault unsigned SzInBytes = EltSzInBytes * ChainSize;
104708debb02SMatt Arsenault
10485e630834SChristopher Tetreault FixedVectorType *VecTy;
10495e630834SChristopher Tetreault auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
105008debb02SMatt Arsenault if (VecStoreTy)
1051d2befc66SChristopher Tetreault VecTy = FixedVectorType::get(StoreTy->getScalarType(),
105208debb02SMatt Arsenault Chain.size() * VecStoreTy->getNumElements());
105308debb02SMatt Arsenault else
1054d2befc66SChristopher Tetreault VecTy = FixedVectorType::get(StoreTy, Chain.size());
105508debb02SMatt Arsenault
10561c38681aSVolkan Keles // If it's more than the max vector size or the target has a better
10571c38681aSVolkan Keles // vector factor, break it into two pieces.
10581c38681aSVolkan Keles unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
10591c38681aSVolkan Keles if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1060d34e60caSNicola Zaghen LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
106108debb02SMatt Arsenault " Creating two separate arrays.\n");
10625f2f6118SDávid Bolvanský bool Vectorized = false;
10635f2f6118SDávid Bolvanský Vectorized |=
10645f2f6118SDávid Bolvanský vectorizeStoreChain(Chain.slice(0, TargetVF), InstructionsProcessed);
10655f2f6118SDávid Bolvanský Vectorized |=
10661c38681aSVolkan Keles vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
10675f2f6118SDávid Bolvanský return Vectorized;
106808debb02SMatt Arsenault }
106908debb02SMatt Arsenault
1070d34e60caSNicola Zaghen LLVM_DEBUG({
107108debb02SMatt Arsenault dbgs() << "LSV: Stores to vectorize:\n";
107237f4e0e0SJustin Lebar for (Instruction *I : Chain)
107337f4e0e0SJustin Lebar dbgs() << " " << *I << "\n";
1074598f8aadSAlina Sbirlea });
107508debb02SMatt Arsenault
1076640a61cdSAlina Sbirlea // We won't try again to vectorize the elements of the chain, regardless of
1077640a61cdSAlina Sbirlea // whether we succeed below.
1078640a61cdSAlina Sbirlea InstructionsProcessed->insert(Chain.begin(), Chain.end());
1079640a61cdSAlina Sbirlea
108008debb02SMatt Arsenault // If the store is going to be misaligned, don't vectorize it.
108111ef356dSCraig Topper if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1082c6407985SMatt Arsenault if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1083c6407985SMatt Arsenault auto Chains = splitOddVectorElts(Chain, Sz);
10845f2f6118SDávid Bolvanský bool Vectorized = false;
10855f2f6118SDávid Bolvanský Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
10865f2f6118SDávid Bolvanský Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
10875f2f6118SDávid Bolvanský return Vectorized;
1088c6407985SMatt Arsenault }
1089327955e0SAlina Sbirlea
109068b2e507SCraig Topper Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
109168b2e507SCraig Topper Align(StackAdjustedAlignment),
1092aec2fa35SDaniel Jasper DL, S0, nullptr, &DT);
109368b2e507SCraig Topper if (NewAlign >= Alignment)
109468b2e507SCraig Topper Alignment = NewAlign;
109586f9117dSMatt Arsenault else
109686f9117dSMatt Arsenault return false;
1097c6407985SMatt Arsenault }
1098c6407985SMatt Arsenault
10991507fc15SGuillaume Chatelet if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
1100c6407985SMatt Arsenault auto Chains = splitOddVectorElts(Chain, Sz);
11015f2f6118SDávid Bolvanský bool Vectorized = false;
11025f2f6118SDávid Bolvanský Vectorized |= vectorizeStoreChain(Chains.first, InstructionsProcessed);
11035f2f6118SDávid Bolvanský Vectorized |= vectorizeStoreChain(Chains.second, InstructionsProcessed);
11045f2f6118SDávid Bolvanský return Vectorized;
110508debb02SMatt Arsenault }
110608debb02SMatt Arsenault
11078778c626SJustin Lebar BasicBlock::iterator First, Last;
11088778c626SJustin Lebar std::tie(First, Last) = getBoundaryInstrs(Chain);
110908debb02SMatt Arsenault Builder.SetInsertPoint(&*Last);
111008debb02SMatt Arsenault
1111cf284f6cShyeongyu kim Value *Vec = PoisonValue::get(VecTy);
111208debb02SMatt Arsenault
111308debb02SMatt Arsenault if (VecStoreTy) {
111408debb02SMatt Arsenault unsigned VecWidth = VecStoreTy->getNumElements();
111508debb02SMatt Arsenault for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
111608debb02SMatt Arsenault StoreInst *Store = cast<StoreInst>(Chain[I]);
111708debb02SMatt Arsenault for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
111808debb02SMatt Arsenault unsigned NewIdx = J + I * VecWidth;
111908debb02SMatt Arsenault Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
112008debb02SMatt Arsenault Builder.getInt32(J));
112108debb02SMatt Arsenault if (Extract->getType() != StoreTy->getScalarType())
112208debb02SMatt Arsenault Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
112308debb02SMatt Arsenault
1124598f8aadSAlina Sbirlea Value *Insert =
1125598f8aadSAlina Sbirlea Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
112608debb02SMatt Arsenault Vec = Insert;
112708debb02SMatt Arsenault }
112808debb02SMatt Arsenault }
112908debb02SMatt Arsenault } else {
113008debb02SMatt Arsenault for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
113108debb02SMatt Arsenault StoreInst *Store = cast<StoreInst>(Chain[I]);
113208debb02SMatt Arsenault Value *Extract = Store->getValueOperand();
113308debb02SMatt Arsenault if (Extract->getType() != StoreTy->getScalarType())
1134598f8aadSAlina Sbirlea Extract =
1135598f8aadSAlina Sbirlea Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
113608debb02SMatt Arsenault
1137598f8aadSAlina Sbirlea Value *Insert =
1138598f8aadSAlina Sbirlea Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
113908debb02SMatt Arsenault Vec = Insert;
114008debb02SMatt Arsenault }
114108debb02SMatt Arsenault }
114208debb02SMatt Arsenault
1143c6407985SMatt Arsenault StoreInst *SI = Builder.CreateAlignedStore(
1144c6407985SMatt Arsenault Vec,
1145c6407985SMatt Arsenault Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
1146c6407985SMatt Arsenault Alignment);
114708debb02SMatt Arsenault propagateMetadata(SI, Chain);
114808debb02SMatt Arsenault
114908debb02SMatt Arsenault eraseInstructions(Chain);
115008debb02SMatt Arsenault ++NumVectorInstructions;
115108debb02SMatt Arsenault NumScalarsVectorized += Chain.size();
115208debb02SMatt Arsenault return true;
115308debb02SMatt Arsenault }
115408debb02SMatt Arsenault
vectorizeLoadChain(ArrayRef<Instruction * > Chain,SmallPtrSet<Instruction *,16> * InstructionsProcessed)1155640a61cdSAlina Sbirlea bool Vectorizer::vectorizeLoadChain(
115637f4e0e0SJustin Lebar ArrayRef<Instruction *> Chain,
115737f4e0e0SJustin Lebar SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
115808debb02SMatt Arsenault LoadInst *L0 = cast<LoadInst>(Chain[0]);
1159d7e8898bSMatt Arsenault
1160d7e8898bSMatt Arsenault // If the vector has an int element, default to int for the whole load.
11614e46ea39SSimon Pilgrim Type *LoadTy = nullptr;
1162d7e8898bSMatt Arsenault for (const auto &V : Chain) {
1163d7e8898bSMatt Arsenault LoadTy = cast<LoadInst>(V)->getType();
1164d7e8898bSMatt Arsenault if (LoadTy->isIntOrIntVectorTy())
1165d7e8898bSMatt Arsenault break;
116642ad1705SMatt Arsenault
116742ad1705SMatt Arsenault if (LoadTy->isPtrOrPtrVectorTy()) {
116842ad1705SMatt Arsenault LoadTy = Type::getIntNTy(F.getParent()->getContext(),
116942ad1705SMatt Arsenault DL.getTypeSizeInBits(LoadTy));
117042ad1705SMatt Arsenault break;
117142ad1705SMatt Arsenault }
1172d7e8898bSMatt Arsenault }
11734e46ea39SSimon Pilgrim assert(LoadTy && "Can't determine LoadInst type from chain");
1174d7e8898bSMatt Arsenault
117508debb02SMatt Arsenault unsigned Sz = DL.getTypeSizeInBits(LoadTy);
1176370e8226SMatt Arsenault unsigned AS = L0->getPointerAddressSpace();
1177370e8226SMatt Arsenault unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
117808debb02SMatt Arsenault unsigned VF = VecRegSize / Sz;
117908debb02SMatt Arsenault unsigned ChainSize = Chain.size();
118052e98f62SNikita Popov Align Alignment = L0->getAlign();
118108debb02SMatt Arsenault
1182640a61cdSAlina Sbirlea if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
1183640a61cdSAlina Sbirlea InstructionsProcessed->insert(Chain.begin(), Chain.end());
118408debb02SMatt Arsenault return false;
1185640a61cdSAlina Sbirlea }
1186640a61cdSAlina Sbirlea
118737f4e0e0SJustin Lebar ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
11888778c626SJustin Lebar if (NewChain.empty()) {
118962b03e34SJustin Lebar // No vectorization possible.
1190640a61cdSAlina Sbirlea InstructionsProcessed->insert(Chain.begin(), Chain.end());
1191640a61cdSAlina Sbirlea return false;
1192640a61cdSAlina Sbirlea }
11938778c626SJustin Lebar if (NewChain.size() == 1) {
1194640a61cdSAlina Sbirlea // Failed after the first instruction. Discard it and try the smaller chain.
11958778c626SJustin Lebar InstructionsProcessed->insert(NewChain.front());
1196640a61cdSAlina Sbirlea return false;
1197640a61cdSAlina Sbirlea }
1198640a61cdSAlina Sbirlea
1199640a61cdSAlina Sbirlea // Update Chain to the valid vectorizable subchain.
12008778c626SJustin Lebar Chain = NewChain;
1201640a61cdSAlina Sbirlea ChainSize = Chain.size();
120208debb02SMatt Arsenault
12031c38681aSVolkan Keles // Check if it's legal to vectorize this chain. If not, split the chain and
12041c38681aSVolkan Keles // try again.
1205950a8204SMatt Arsenault unsigned EltSzInBytes = Sz / 8;
1206950a8204SMatt Arsenault unsigned SzInBytes = EltSzInBytes * ChainSize;
120708debb02SMatt Arsenault VectorType *VecTy;
12085e630834SChristopher Tetreault auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
120908debb02SMatt Arsenault if (VecLoadTy)
1210d2befc66SChristopher Tetreault VecTy = FixedVectorType::get(LoadTy->getScalarType(),
121108debb02SMatt Arsenault Chain.size() * VecLoadTy->getNumElements());
121208debb02SMatt Arsenault else
1213d2befc66SChristopher Tetreault VecTy = FixedVectorType::get(LoadTy, Chain.size());
121408debb02SMatt Arsenault
12151c38681aSVolkan Keles // If it's more than the max vector size or the target has a better
12161c38681aSVolkan Keles // vector factor, break it into two pieces.
12171c38681aSVolkan Keles unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
12181c38681aSVolkan Keles if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
1219d34e60caSNicola Zaghen LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
122008debb02SMatt Arsenault " Creating two separate arrays.\n");
12215f2f6118SDávid Bolvanský bool Vectorized = false;
12225f2f6118SDávid Bolvanský Vectorized |=
12235f2f6118SDávid Bolvanský vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed);
12245f2f6118SDávid Bolvanský Vectorized |=
12251c38681aSVolkan Keles vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
12265f2f6118SDávid Bolvanský return Vectorized;
122708debb02SMatt Arsenault }
122808debb02SMatt Arsenault
1229640a61cdSAlina Sbirlea // We won't try again to vectorize the elements of the chain, regardless of
1230640a61cdSAlina Sbirlea // whether we succeed below.
1231640a61cdSAlina Sbirlea InstructionsProcessed->insert(Chain.begin(), Chain.end());
1232640a61cdSAlina Sbirlea
123308debb02SMatt Arsenault // If the load is going to be misaligned, don't vectorize it.
123411ef356dSCraig Topper if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
1235c6407985SMatt Arsenault if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
1236c6407985SMatt Arsenault auto Chains = splitOddVectorElts(Chain, Sz);
12375f2f6118SDávid Bolvanský bool Vectorized = false;
12385f2f6118SDávid Bolvanský Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
12395f2f6118SDávid Bolvanský Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
12405f2f6118SDávid Bolvanský return Vectorized;
1241c6407985SMatt Arsenault }
1242327955e0SAlina Sbirlea
124368b2e507SCraig Topper Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
124468b2e507SCraig Topper Align(StackAdjustedAlignment),
124568b2e507SCraig Topper DL, L0, nullptr, &DT);
124668b2e507SCraig Topper if (NewAlign >= Alignment)
124768b2e507SCraig Topper Alignment = NewAlign;
124886f9117dSMatt Arsenault else
124986f9117dSMatt Arsenault return false;
125008debb02SMatt Arsenault }
125108debb02SMatt Arsenault
12521507fc15SGuillaume Chatelet if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
1253c6407985SMatt Arsenault auto Chains = splitOddVectorElts(Chain, Sz);
12545f2f6118SDávid Bolvanský bool Vectorized = false;
12555f2f6118SDávid Bolvanský Vectorized |= vectorizeLoadChain(Chains.first, InstructionsProcessed);
12565f2f6118SDávid Bolvanský Vectorized |= vectorizeLoadChain(Chains.second, InstructionsProcessed);
12575f2f6118SDávid Bolvanský return Vectorized;
1258c6407985SMatt Arsenault }
1259c6407985SMatt Arsenault
1260d34e60caSNicola Zaghen LLVM_DEBUG({
126108debb02SMatt Arsenault dbgs() << "LSV: Loads to vectorize:\n";
126237f4e0e0SJustin Lebar for (Instruction *I : Chain)
126337f4e0e0SJustin Lebar I->dump();
1264598f8aadSAlina Sbirlea });
126508debb02SMatt Arsenault
12668778c626SJustin Lebar // getVectorizablePrefix already computed getBoundaryInstrs. The value of
12678778c626SJustin Lebar // Last may have changed since then, but the value of First won't have. If it
12688778c626SJustin Lebar // matters, we could compute getBoundaryInstrs only once and reuse it here.
12698778c626SJustin Lebar BasicBlock::iterator First, Last;
12708778c626SJustin Lebar std::tie(First, Last) = getBoundaryInstrs(Chain);
1271cbc6ac2aSAlina Sbirlea Builder.SetInsertPoint(&*First);
127208debb02SMatt Arsenault
127308debb02SMatt Arsenault Value *Bitcast =
127408debb02SMatt Arsenault Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
1275279fa8e0SGuillaume Chatelet LoadInst *LI =
1276279fa8e0SGuillaume Chatelet Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
127708debb02SMatt Arsenault propagateMetadata(LI, Chain);
127808debb02SMatt Arsenault
127908debb02SMatt Arsenault for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
128037f4e0e0SJustin Lebar Value *CV = Chain[I];
12810776f6e0SBenjamin Kramer Value *V;
12820776f6e0SBenjamin Kramer if (VecLoadTy) {
12830776f6e0SBenjamin Kramer // Extract a subvector using shufflevector.
12840776f6e0SBenjamin Kramer unsigned VecWidth = VecLoadTy->getNumElements();
12850776f6e0SBenjamin Kramer auto Mask =
12860776f6e0SBenjamin Kramer llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
12870776f6e0SBenjamin Kramer V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
12880776f6e0SBenjamin Kramer } else {
12890776f6e0SBenjamin Kramer V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
12900776f6e0SBenjamin Kramer }
12910776f6e0SBenjamin Kramer
129237f4e0e0SJustin Lebar if (V->getType() != CV->getType()) {
129337f4e0e0SJustin Lebar V = Builder.CreateBitOrPointerCast(V, CV->getType());
129442ad1705SMatt Arsenault }
129508debb02SMatt Arsenault
129608debb02SMatt Arsenault // Replace the old instruction.
129737f4e0e0SJustin Lebar CV->replaceAllUsesWith(V);
129808debb02SMatt Arsenault }
129908debb02SMatt Arsenault
13001fb415feSJohannes Doerfert // Since we might have opaque pointers we might end up using the pointer
13011fb415feSJohannes Doerfert // operand of the first load (wrt. memory loaded) for the vector load. Since
13021fb415feSJohannes Doerfert // this first load might not be the first in the block we potentially need to
13031fb415feSJohannes Doerfert // reorder the pointer operand (and its operands). If we have a bitcast though
13041fb415feSJohannes Doerfert // it might be before the load and should be the reorder start instruction.
13051fb415feSJohannes Doerfert // "Might" because for opaque pointers the "bitcast" is just the first loads
13061fb415feSJohannes Doerfert // pointer operand, as oppposed to something we inserted at the right position
13071fb415feSJohannes Doerfert // ourselves.
13081fb415feSJohannes Doerfert Instruction *BCInst = dyn_cast<Instruction>(Bitcast);
13091fb415feSJohannes Doerfert reorder((BCInst && BCInst != L0->getPointerOperand()) ? BCInst : LI);
131008debb02SMatt Arsenault
131108debb02SMatt Arsenault eraseInstructions(Chain);
131208debb02SMatt Arsenault
131308debb02SMatt Arsenault ++NumVectorInstructions;
131408debb02SMatt Arsenault NumScalarsVectorized += Chain.size();
131508debb02SMatt Arsenault return true;
131608debb02SMatt Arsenault }
1317327955e0SAlina Sbirlea
accessIsMisaligned(unsigned SzInBytes,unsigned AddressSpace,Align Alignment)1318327955e0SAlina Sbirlea bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
131911ef356dSCraig Topper Align Alignment) {
132011ef356dSCraig Topper if (Alignment.value() % SzInBytes == 0)
13216f937b11SAlina Sbirlea return false;
1322950a8204SMatt Arsenault
1323327955e0SAlina Sbirlea bool Fast = false;
13246f937b11SAlina Sbirlea bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
13256f937b11SAlina Sbirlea SzInBytes * 8, AddressSpace,
1326327955e0SAlina Sbirlea Alignment, &Fast);
1327d34e60caSNicola Zaghen LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
13286f937b11SAlina Sbirlea << " and fast? " << Fast << "\n";);
13296f937b11SAlina Sbirlea return !Allows || !Fast;
1330327955e0SAlina Sbirlea }
1331