//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "vector-combine"
STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarBO, "Number of scalar binops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");

static cl::opt<bool> DisableVectorCombine(
    "disable-vector-combine", cl::init(false), cl::Hidden,
    cl::desc("Disable all vector combine transforms"));

static cl::opt<bool> DisableBinopExtractShuffle(
    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
    cl::desc("Disable binop extract to shuffle transforms"));

static cl::opt<unsigned> MaxInstrsToScan(
    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
    cl::desc("Max number of instructions to scan for vector combining."));

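/// Sentinel index value meaning "no preferred extract/insert lane".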
static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();

namespace {
class VectorCombine {
public:
  VectorCombine(Function &F, const TargetTransformInfo &TTI,
                const DominatorTree &DT, AAResults &AA, AssumptionCache &AC)
      : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC) {}

  bool run();

private:
  Function &F;
  IRBuilder<> Builder;
  const TargetTransformInfo &TTI;
  const DominatorTree &DT;
  AAResults &AA;
  AssumptionCache &AC;

  bool vectorizeLoadInsert(Instruction &I);
  ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                        ExtractElementInst *Ext1,
                                        unsigned PreferredExtractIndex) const;
  bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                             unsigned Opcode,
                             ExtractElementInst *&ConvertToShuffle,
                             unsigned PreferredExtractIndex);
  void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                     Instruction &I);
  void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                       Instruction &I);
  bool foldExtractExtract(Instruction &I);
  bool foldBitcastShuf(Instruction &I);
  bool scalarizeBinopOrCmp(Instruction &I);
  bool foldExtractedCmps(Instruction &I);
  bool foldSingleElementStore(Instruction &I);
  bool scalarizeLoadExtract(Instruction &I);
};
} // namespace

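/// Replace all uses of \p Old with \p New and transfer Old's name to New so
/// the printed IR keeps the familiar value name.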
static void replaceValue(Value &Old, Value &New) {
  Old.replaceAllUsesWith(&New);
  New.takeName(&Old);
}

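/// Try to widen "insertelement undef, (load scalar), 0" into a single vector
/// load when that is provably safe to speculate and no more expensive per the
/// cost model. If the source pointer has a constant offset from a safely
/// loadable base, the target element is shuffled down into lane 0.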
bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
  // Match insert into fixed vector of scalar value.
  // TODO: Handle non-zero insert index.
  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
  Value *Scalar;
  if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
      !Scalar->hasOneUse())
    return false;

  // Optionally match an extract from another vector.
  Value *X;
  bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
  if (!HasExtract)
    X = Scalar;

  // Match source value as load of scalar or vector.
  // Do not vectorize scalar load (widening) if atomic/volatile or under
  // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions
  // or create data races non-existent in the source.
  auto *Load = dyn_cast<LoadInst>(X);
  if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
      mustSuppressSpeculation(*Load))
    return false;

  const DataLayout &DL = I.getModule()->getDataLayout();
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");

  // If original AS != Load's AS, we can't bitcast the original pointer and have
  // to use Load's operand instead. Ideally we would want to strip pointer casts
  // without changing AS, but there's no API to do that ATM.
  unsigned AS = Load->getPointerAddressSpace();
  if (AS != SrcPtr->getType()->getPointerAddressSpace())
    SrcPtr = Load->getPointerOperand();

  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
  // sure we have all of our type-based constraints in place for this target.
  Type *ScalarTy = Scalar->getType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
      ScalarSize % 8 != 0)
    return false;

  // Check safety of replacing the scalar load with a larger vector load.
  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
  unsigned OffsetEltIndex = 0;
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) {
    // It is not safe to load directly from the pointer, but we can still peek
    // through gep offsets and check if it is safe to load from a base address
    // with
    // updated alignment. If it is, we can shuffle the element(s) into place
    // after loading.
    unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
    APInt Offset(OffsetBitWidth, 0);
    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);

    // We want to shuffle the result down from a high element of a vector, so
    // the offset must be positive.
    if (Offset.isNegative())
      return false;

    // The offset must be a multiple of the scalar element to shuffle cleanly
    // in the element's size.
    uint64_t ScalarSizeInBytes = ScalarSize / 8;
    if (Offset.urem(ScalarSizeInBytes) != 0)
      return false;

    // If we load MinVecNumElts, will our target element still be loaded?
    OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
    if (OffsetEltIndex >= MinVecNumElts)
      return false;

    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
      return false;

    // Update alignment with offset value. Note that the offset could be negated
    // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
    // negation does not change the result of the alignment calculation.
    Alignment = commonAlignment(Alignment, Offset.getZExtValue());
  }

  // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
  // Use the greater of the alignment on the load or its source pointer.
  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
  Type *LoadTy = Load->getType();
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
  OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
                                          /* Insert */ true, HasExtract);

  // New pattern: load VecPtr
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
  // Optionally, we are shuffling the loaded vector element(s) into place.
  // For the mask set everything but element 0 to undef to prevent poison from
  // propagating from the extra loaded memory. This will also optionally
  // shrink/grow the vector from the loaded size to the output size.
  // We assume this operation has no cost in codegen if there was no offset.
  // Note that we could use freeze to avoid poison problems, but then we might
  // still need a shuffle to change the vector size.
  unsigned OutputNumElts = Ty->getNumElements();
  SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
  Mask[0] = OffsetEltIndex;
  if (OffsetEltIndex)
    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);

  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // It is safe and potentially profitable to load a vector directly:
  // inselt undef, load Scalar, 0 --> load VecPtr
  IRBuilder<> Builder(Load);
  Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
  VecLd = Builder.CreateShuffleVector(VecLd, Mask);

  replaceValue(I, *VecLd);
  ++NumVecLoad;
  return true;
}

/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
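/// For example, given extracts from lanes 0 and 2 where lane 2 is the more
/// expensive (or higher) index:
///   binop (extractelement %v0, 0), (extractelement %v1, 2)
/// the lane-2 extract is returned so the caller can rewrite it as a shuffle of
/// %v1 that moves lane 2 into lane 0, followed by an extract from lane 0.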
ExtractElementInst *VectorCombine::getShuffleExtract(
    ExtractElementInst *Ext0, ExtractElementInst *Ext1,
    unsigned PreferredExtractIndex = InvalidIndex) const {
  assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
         isa<ConstantInt>(Ext1->getIndexOperand()) &&
         "Expected constant extract indexes");

  unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
  unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();

  // If the extract indexes are identical, no shuffle is needed.
  if (Index0 == Index1)
    return nullptr;

  Type *VecTy = Ext0->getVectorOperand()->getType();
  assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
  InstructionCost Cost0 =
      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
  InstructionCost Cost1 =
      TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
  // If both costs are invalid, no shuffle is needed.
  if (!Cost0.isValid() && !Cost1.isValid())
    return nullptr;

  // We are extracting from 2 different indexes, so one operand must be shuffled
  // before performing a vector operation and/or extract. The more expensive
  // extract will be replaced by a shuffle.
  if (Cost0 > Cost1)
    return Ext0;
  if (Cost1 > Cost0)
    return Ext1;

  // If the costs are equal and there is a preferred extract index, shuffle the
  // opposite operand.
  if (PreferredExtractIndex == Index0)
    return Ext1;
  if (PreferredExtractIndex == Index1)
    return Ext0;

  // Otherwise, replace the extract with the higher index.
  return Index0 > Index1 ? Ext0 : Ext1;
}

/// Compare the relative costs of 2 extracts followed by scalar operation vs.
/// vector operation(s) followed by extract. Return true if the existing
/// instructions are cheaper than a vector alternative. Otherwise, return false
/// and if one of the extracts should be transformed to a shufflevector, set
/// \p ConvertToShuffle to that extract instruction.
bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                          ExtractElementInst *Ext1,
                                          unsigned Opcode,
                                          ExtractElementInst *&ConvertToShuffle,
                                          unsigned PreferredExtractIndex) {
  assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
         isa<ConstantInt>(Ext1->getOperand(1)) &&
         "Expected constant extract indexes");
  Type *ScalarTy = Ext0->getType();
  auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
  InstructionCost ScalarOpCost, VectorOpCost;

  // Get cost estimates for scalar and vector versions of the operation.
  bool IsBinOp = Instruction::isBinaryOp(Opcode);
  if (IsBinOp) {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  } else {
    assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
           "Expected a compare");
    ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy,
                                          CmpInst::makeCmpResultType(ScalarTy));
    VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy,
                                          CmpInst::makeCmpResultType(VecTy));
  }

  // Get cost estimates for the extract elements. These costs will factor into
  // both sequences.
  unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
  unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();

  InstructionCost Extract0Cost =
      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
  InstructionCost Extract1Cost =
      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);

  // A more expensive extract will always be replaced by a splat shuffle.
  // For example, if Ext0 is more expensive:
  // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
  // extelt (opcode (splat V0, Ext0), V1), Ext1
  // TODO: Evaluate whether that always results in lowest cost. Alternatively,
  //       check the cost of creating a broadcast shuffle and shuffling both
  //       operands to element 0.
  InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);

  // Extra uses of the extracts mean that we include those costs in the
  // vector total because those instructions will not be eliminated.
  InstructionCost OldCost, NewCost;
  if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
    // Handle a special case. If the 2 extracts are identical, adjust the
    // formulas to account for that. The extra use charge allows for either the
    // CSE'd pattern or an unoptimized form with identical values:
    // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
    bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
                                  : !Ext0->hasOneUse() || !Ext1->hasOneUse();
    OldCost = CheapExtractCost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
  } else {
    // Handle the general case. Each extract is actually a different value:
    // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
    OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost +
              !Ext0->hasOneUse() * Extract0Cost +
              !Ext1->hasOneUse() * Extract1Cost;
  }

  ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
  if (ConvertToShuffle) {
    if (IsBinOp && DisableBinopExtractShuffle)
      return true;

    // If we are extracting from 2 different indexes, then one operand must be
    // shuffled before performing the vector operation. The shuffle mask is
    // undefined except for 1 lane that is being translated to the remaining
    // extraction lane. Therefore, it is a splat shuffle. Ex:
    // ShufMask = { undef, undef, 0, undef }
    // TODO: The cost model has an option for a "broadcast" shuffle
    //       (splat-from-element-0), but no option for a more general splat.
    NewCost +=
        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
  }

  // Aggressively form a vector op if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  return OldCost < NewCost;
}

/// Create a shuffle that translates (shifts) 1 element from the input vector
/// to a new element location.
static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
                                 unsigned NewIndex, IRBuilder<> &Builder) {
  // The shuffle mask is undefined except for 1 lane that is being translated
  // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
  // ShufMask = { 2, undef, undef, undef }
  auto *VecTy = cast<FixedVectorType>(Vec->getType());
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
  ShufMask[NewIndex] = OldIndex;
  return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
}

/// Given an extract element instruction with constant index operand, shuffle
/// the source vector (shift the scalar element) to a NewIndex for extraction.
/// Return null if the input can be constant folded, so that we are not creating
/// unnecessary instructions.
static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
                                            unsigned NewIndex,
                                            IRBuilder<> &Builder) {
  // If the extract can be constant-folded, this code is unsimplified. Defer
  // to other passes to handle that.
  Value *X = ExtElt->getVectorOperand();
  Value *C = ExtElt->getIndexOperand();
  assert(isa<ConstantInt>(C) && "Expected a constant index operand");
  if (isa<Constant>(X))
    return nullptr;

  Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
                                   NewIndex, Builder);
  return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
}

/// Try to reduce extract element costs by converting scalar compares to vector
/// compares followed by extract.
/// cmp (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
                                  ExtractElementInst *Ext1, Instruction &I) {
  assert(isa<CmpInst>(&I) && "Expected a compare");
  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
         "Expected matching constant extract indexes");

  // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
  ++NumVecCmp;
  CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
  Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
  Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
  replaceValue(I, *NewExt);
}

/// Try to reduce extract element costs by converting scalar binops to vector
/// binops followed by extract.
/// bo (ext0 V0, C), (ext1 V1, C)
void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
                                    ExtractElementInst *Ext1, Instruction &I) {
  assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
  assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
             cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
         "Expected matching constant extract indexes");

  // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
  ++NumVecBO;
  Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
  Value *VecBO =
      Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);

  // All IR flags are safe to back-propagate because any potential poison
  // created in unused vector elements is discarded by the extract.
  if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
    VecBOInst->copyIRFlags(&I);

  Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
  replaceValue(I, *NewExt);
}

/// Match an instruction with extracted vector operands.
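/// Example of the binop form handled here:
///   %e0 = extractelement <4 x float> %x, i32 0
///   %e1 = extractelement <4 x float> %y, i32 0
///   %r = fadd float %e0, %e1
/// -->
///   %v = fadd <4 x float> %x, %y
///   %r = extractelement <4 x float> %v, i32 0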
bool VectorCombine::foldExtractExtract(Instruction &I) {
  // It is not safe to transform things like div, urem, etc. because we may
  // create undefined behavior when executing those on unknown vector elements.
  if (!isSafeToSpeculativelyExecute(&I))
    return false;

  Instruction *I0, *I1;
  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
      !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
    return false;

  Value *V0, *V1;
  uint64_t C0, C1;
  if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
      !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
      V0->getType() != V1->getType())
    return false;

  // If the scalar value 'I' is going to be re-inserted into a vector, then try
  // to create an extract to that same element. The extract/insert can be
  // reduced to a "select shuffle".
  // TODO: If we add a larger pattern match that starts from an insert, this
  //       probably becomes unnecessary.
  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  uint64_t InsertIndex = InvalidIndex;
  if (I.hasOneUse())
    match(I.user_back(),
          m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));

  ExtractElementInst *ExtractToChange;
  if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange,
                            InsertIndex))
    return false;

  if (ExtractToChange) {
    unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
    ExtractElementInst *NewExtract =
        translateExtract(ExtractToChange, CheapExtractIdx, Builder);
    if (!NewExtract)
      return false;
    if (ExtractToChange == Ext0)
      Ext0 = NewExtract;
    else
      Ext1 = NewExtract;
  }

  if (Pred != CmpInst::BAD_ICMP_PREDICATE)
    foldExtExtCmp(Ext0, Ext1, I);
  else
    foldExtExtBinop(Ext0, Ext1, I);

  return true;
}

/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
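/// Example where the cast narrows the elements (<4 x i32> to <8 x i16>), so
/// each wide mask entry i expands to narrow entries 2*i and 2*i+1:
///   bitcast (shuffle %v, <1, 0, 3, 2>) to <8 x i16>
/// -->
///   shuffle (bitcast %v to <8 x i16>), <2, 3, 0, 1, 6, 7, 4, 5>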
bool VectorCombine::foldBitcastShuf(Instruction &I) {
  Value *V;
  ArrayRef<int> Mask;
  if (!match(&I, m_BitCast(
                     m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
    return false;

  // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
  // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
  // mask for scalable type is a splat or not.
  // 2) Disallow non-vector casts and length-changing shuffles.
  // TODO: We could allow any shuffle.
  auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
  auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
  if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy)
    return false;

  unsigned DestNumElts = DestTy->getNumElements();
  unsigned SrcNumElts = SrcTy->getNumElements();
  SmallVector<int, 16> NewMask;
  if (SrcNumElts <= DestNumElts) {
    // The bitcast is from wide to narrow/equal elements. The shuffle mask can
    // always be expanded to the equivalent form choosing narrower elements.
    assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = DestNumElts / SrcNumElts;
    narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
  } else {
    // The bitcast is from narrow elements to wide elements. The shuffle mask
    // must choose consecutive elements to allow casting first.
    assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = SrcNumElts / DestNumElts;
    if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
      return false;
  }

  // The new shuffle must not cost more than the old shuffle. The bitcast is
  // moved ahead of the shuffle, so assume that it has the same cost as before.
  InstructionCost DestCost = TTI.getShuffleCost(
      TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask);
  InstructionCost SrcCost =
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask);
  if (DestCost > SrcCost || !DestCost.isValid())
    return false;

  // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
  ++NumShufOfBitcast;
  Value *CastV = Builder.CreateBitCast(V, DestTy);
  Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
  replaceValue(I, *Shuf);
  return true;
}

/// Match a vector binop or compare instruction with at least one inserted
/// scalar operand and convert to scalar binop/cmp followed by insertelement.
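/// Example with one inserted scalar operand:
///   %v = insertelement <4 x i32> zeroinitializer, i32 %x, i32 2
///   %r = add <4 x i32> %v, <i32 1, i32 2, i32 3, i32 4>
/// -->
///   %r.scalar = add i32 %x, 3
///   %r = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>,
///                      i32 %r.scalar, i32 2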
bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
  CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  Value *Ins0, *Ins1;
  if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
      !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
    return false;

  // Do not convert the vector condition of a vector select into a scalar
  // condition. That may cause problems for codegen because of differences in
  // boolean formats and register-file transfers.
  // TODO: Can we account for that in the cost model?
  bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
  if (IsCmp)
    for (User *U : I.users())
      if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
        return false;

  // Match against one or both scalar values being inserted into constant
  // vectors:
  // vec_op VecC0, (inselt VecC1, V1, Index)
  // vec_op (inselt VecC0, V0, Index), VecC1
  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
  // TODO: Deal with mismatched index constants and variable indexes?
  Constant *VecC0 = nullptr, *VecC1 = nullptr;
  Value *V0 = nullptr, *V1 = nullptr;
  uint64_t Index0 = 0, Index1 = 0;
  if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
                               m_ConstantInt(Index0))) &&
      !match(Ins0, m_Constant(VecC0)))
    return false;
  if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
                               m_ConstantInt(Index1))) &&
      !match(Ins1, m_Constant(VecC1)))
    return false;

  bool IsConst0 = !V0;
  bool IsConst1 = !V1;
  if (IsConst0 && IsConst1)
    return false;
  if (!IsConst0 && !IsConst1 && Index0 != Index1)
    return false;

  // Bail for single insertion if it is a load.
  // TODO: Handle this once getVectorInstrCost can cost for load/stores.
  auto *I0 = dyn_cast_or_null<Instruction>(V0);
  auto *I1 = dyn_cast_or_null<Instruction>(V1);
  if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
      (IsConst1 && I0 && I0->mayReadFromMemory()))
    return false;

  uint64_t Index = IsConst0 ? Index1 : Index0;
  Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
  Type *VecTy = I.getType();
  assert(VecTy->isVectorTy() &&
         (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
         (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
          ScalarTy->isPointerTy()) &&
         "Unexpected types for insert element into binop or cmp");

  unsigned Opcode = I.getOpcode();
  InstructionCost ScalarOpCost, VectorOpCost;
  if (IsCmp) {
    ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
  } else {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
  }

  // Get cost estimate for the insert element. This cost will factor into
  // both sequences.
  InstructionCost InsertCost =
      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
  InstructionCost OldCost =
      (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
  InstructionCost NewCost = ScalarOpCost + InsertCost +
                            (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
                            (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);

  // We want to scalarize unless the vector variant actually has lower cost.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
  // inselt NewVecC, (scalar_op V0, V1), Index
  if (IsCmp)
    ++NumScalarCmp;
  else
    ++NumScalarBO;

  // For constant cases, extract the scalar element, this should constant fold.
  if (IsConst0)
    V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
  if (IsConst1)
    V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));

  Value *Scalar =
      IsCmp ? Builder.CreateCmp(Pred, V0, V1)
            : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);

  Scalar->setName(I.getName() + ".scalar");

  // All IR flags are safe to back-propagate. There is no potential for extra
  // poison to be created by the scalar instruction.
  if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
    ScalarInst->copyIRFlags(&I);

  // Fold the vector constants in the original vectors into a new base vector.
  Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1)
                            : ConstantExpr::get(Opcode, VecC0, VecC1);
  Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
  replaceValue(I, *Insert);
  return true;
}

/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
/// a vector into vector operations followed by extract. Note: The SLP pass
/// may miss this pattern because of implementation problems.
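/// Example:
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %c0 = icmp sgt i32 %e0, 10
///   %e1 = extractelement <4 x i32> %x, i32 1
///   %c1 = icmp sgt i32 %e1, 20
///   %r = and i1 %c0, %c1
/// becomes a single vector compare against <i32 10, i32 20, undef, undef>, a
/// shuffle that lines the two result lanes up, a vector 'and', and one
/// extract.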
bool VectorCombine::foldExtractedCmps(Instruction &I) {
  // We are looking for a scalar binop of booleans.
  // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
  if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
    return false;

  // The compare predicates should match, and each compare should have a
  // constant operand.
  // TODO: Relax the one-use constraints.
  Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
  Instruction *I0, *I1;
  Constant *C0, *C1;
  CmpInst::Predicate P0, P1;
  if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
      !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
      P0 != P1)
    return false;

  // The compare operands must be extracts of the same vector with constant
  // extract indexes.
  // TODO: Relax the one-use constraints.
  Value *X;
  uint64_t Index0, Index1;
  if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
      !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
    return false;

  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
  if (!ConvertToShuf)
    return false;

  // The original scalar pattern is:
  // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
  CmpInst::Predicate Pred = P0;
  unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
                                                    : Instruction::ICmp;
  auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
  if (!VecTy)
    return false;

  InstructionCost OldCost =
      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
  OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
  OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
  OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());

  // The proposed vector pattern is:
  // vcmp = cmp Pred X, VecC
  // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
  int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
  int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
  auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
  InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
  ShufMask[CheapIndex] = ExpensiveIndex;
  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
                                ShufMask);
  NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
  NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);

  // Aggressively form vector ops if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // Create a vector constant from the 2 scalar constants.
  SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
                                   UndefValue::get(VecTy->getElementType()));
  CmpC[Index0] = C0;
  CmpC[Index1] = C1;
  Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));

  Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
  Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
                                        VCmp, Shuf);
  Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
  replaceValue(I, *NewExt);
  ++NumVecCmpBO;
  return true;
}

// Check if the memory location Loc is modified by any instruction in the
// range [Begin, End) within a single basic block. Conservatively returns true
// if more than MaxInstrsToScan instructions would need to be scanned.
static bool isMemModifiedBetween(BasicBlock::iterator Begin,
                                 BasicBlock::iterator End,
                                 const MemoryLocation &Loc, AAResults &AA) {
  unsigned NumScanned = 0;
  return std::any_of(Begin, End, [&](const Instruction &Instr) {
    return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
           ++NumScanned > MaxInstrsToScan;
  });
}

/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
/// Idx. \p Idx must access a valid vector element.
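/// A constant index is checked directly against the vector length. A variable
/// index is allowed only when its computed range (using assumptions and known
/// arithmetic facts) lies entirely within [0, NumElements).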
canScalarizeAccess(FixedVectorType * VecTy,Value * Idx,Instruction * CtxI,AssumptionCache & AC)779*5f7ddb14SDimitry Andric static bool canScalarizeAccess(FixedVectorType *VecTy, Value *Idx,
780*5f7ddb14SDimitry Andric                                Instruction *CtxI, AssumptionCache &AC) {
781*5f7ddb14SDimitry Andric   if (auto *C = dyn_cast<ConstantInt>(Idx))
782*5f7ddb14SDimitry Andric     return C->getValue().ult(VecTy->getNumElements());
783*5f7ddb14SDimitry Andric 
784*5f7ddb14SDimitry Andric   APInt Zero(Idx->getType()->getScalarSizeInBits(), 0);
785*5f7ddb14SDimitry Andric   APInt MaxElts(Idx->getType()->getScalarSizeInBits(), VecTy->getNumElements());
786*5f7ddb14SDimitry Andric   ConstantRange ValidIndices(Zero, MaxElts);
787*5f7ddb14SDimitry Andric   ConstantRange IdxRange = computeConstantRange(Idx, true, &AC, CtxI, 0);
788*5f7ddb14SDimitry Andric   return ValidIndices.contains(IdxRange);
789*5f7ddb14SDimitry Andric }
790*5f7ddb14SDimitry Andric 
791*5f7ddb14SDimitry Andric /// The memory operation on a vector of \p ScalarType had alignment of
792*5f7ddb14SDimitry Andric /// \p VectorAlignment. Compute the maximal, but conservatively correct,
793*5f7ddb14SDimitry Andric /// alignment that will be valid for the memory operation on a single scalar
794*5f7ddb14SDimitry Andric /// element of the same type with index \p Idx.
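/// For example (a worked case): scalarizing a <4 x i32> access with 'align 8',
/// element 0 (offset 0) keeps align 8, element 1 (offset 4) gets
/// commonAlignment(8, 4) = align 4, and element 2 (offset 8) keeps align 8;
/// an unknown index falls back to commonAlignment(8, sizeof(i32)) = align 4.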
795*5f7ddb14SDimitry Andric static Align computeAlignmentAfterScalarization(Align VectorAlignment,
796*5f7ddb14SDimitry Andric                                                 Type *ScalarType, Value *Idx,
797*5f7ddb14SDimitry Andric                                                 const DataLayout &DL) {
798*5f7ddb14SDimitry Andric   if (auto *C = dyn_cast<ConstantInt>(Idx))
799*5f7ddb14SDimitry Andric     return commonAlignment(VectorAlignment,
800*5f7ddb14SDimitry Andric                            C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
801*5f7ddb14SDimitry Andric   return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
802*5f7ddb14SDimitry Andric }
803*5f7ddb14SDimitry Andric 
804*5f7ddb14SDimitry Andric // Combine patterns like:
805*5f7ddb14SDimitry Andric //   %0 = load <4 x i32>, <4 x i32>* %a
806*5f7ddb14SDimitry Andric //   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
807*5f7ddb14SDimitry Andric //   store <4 x i32> %1, <4 x i32>* %a
808*5f7ddb14SDimitry Andric // to:
809*5f7ddb14SDimitry Andric //   %0 = bitcast <4 x i32>* %a to i32*
810*5f7ddb14SDimitry Andric //   %1 = getelementptr inbounds i32, i32* %0, i64 1
811*5f7ddb14SDimitry Andric //   store i32 %b, i32* %1
812*5f7ddb14SDimitry Andric bool VectorCombine::foldSingleElementStore(Instruction &I) {
813*5f7ddb14SDimitry Andric   StoreInst *SI = dyn_cast<StoreInst>(&I);
814*5f7ddb14SDimitry Andric   if (!SI || !SI->isSimple() ||
815*5f7ddb14SDimitry Andric       !isa<FixedVectorType>(SI->getValueOperand()->getType()))
816*5f7ddb14SDimitry Andric     return false;
817*5f7ddb14SDimitry Andric 
818*5f7ddb14SDimitry Andric   // TODO: Combine more complicated patterns (multiple insert) by referencing
819*5f7ddb14SDimitry Andric   // TargetTransformInfo.
820*5f7ddb14SDimitry Andric   Instruction *Source;
821*5f7ddb14SDimitry Andric   Value *NewElement;
822*5f7ddb14SDimitry Andric   Value *Idx;
823*5f7ddb14SDimitry Andric   if (!match(SI->getValueOperand(),
824*5f7ddb14SDimitry Andric              m_InsertElt(m_Instruction(Source), m_Value(NewElement),
825*5f7ddb14SDimitry Andric                          m_Value(Idx))))
826*5f7ddb14SDimitry Andric     return false;
827*5f7ddb14SDimitry Andric 
828*5f7ddb14SDimitry Andric   if (auto *Load = dyn_cast<LoadInst>(Source)) {
829*5f7ddb14SDimitry Andric     auto *VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
830*5f7ddb14SDimitry Andric     const DataLayout &DL = I.getModule()->getDataLayout();
831*5f7ddb14SDimitry Andric     Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
832*5f7ddb14SDimitry Andric     // Don't optimize for atomic/volatile loads or stores. Ensure the memory
833*5f7ddb14SDimitry Andric     // is not modified between the load and the store, both access the same
    // address, the vector type size equals its store size, and the index is
    // in bounds.
834*5f7ddb14SDimitry Andric     if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
835*5f7ddb14SDimitry Andric         !DL.typeSizeEqualsStoreSize(Load->getType()) ||
836*5f7ddb14SDimitry Andric         !canScalarizeAccess(VecTy, Idx, Load, AC) ||
837*5f7ddb14SDimitry Andric         SrcAddr != SI->getPointerOperand()->stripPointerCasts() ||
838*5f7ddb14SDimitry Andric         isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
839*5f7ddb14SDimitry Andric                              MemoryLocation::get(SI), AA))
840*5f7ddb14SDimitry Andric       return false;
841*5f7ddb14SDimitry Andric 
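    // Address the stored lane directly and replace the vector store with a
    // scalar store of the new element.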
842*5f7ddb14SDimitry Andric     Value *GEP = Builder.CreateInBoundsGEP(
843*5f7ddb14SDimitry Andric         SI->getValueOperand()->getType(), SI->getPointerOperand(),
844*5f7ddb14SDimitry Andric         {ConstantInt::get(Idx->getType(), 0), Idx});
845*5f7ddb14SDimitry Andric     StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
846*5f7ddb14SDimitry Andric     NSI->copyMetadata(*SI);
847*5f7ddb14SDimitry Andric     Align ScalarOpAlignment = computeAlignmentAfterScalarization(
848*5f7ddb14SDimitry Andric         std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
849*5f7ddb14SDimitry Andric         DL);
850*5f7ddb14SDimitry Andric     NSI->setAlignment(ScalarOpAlignment);
851*5f7ddb14SDimitry Andric     replaceValue(I, *NSI);
852*5f7ddb14SDimitry Andric     // replaceValue() does not remove the old store, so erase it manually.
853*5f7ddb14SDimitry Andric     I.eraseFromParent();
854*5f7ddb14SDimitry Andric     return true;
855*5f7ddb14SDimitry Andric   }
856*5f7ddb14SDimitry Andric 
857*5f7ddb14SDimitry Andric   return false;
858*5f7ddb14SDimitry Andric }
859*5f7ddb14SDimitry Andric 
860*5f7ddb14SDimitry Andric /// Try to scalarize vector loads feeding extractelement instructions.
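/// For example (illustrative IR), this turns:
///   %x = load <4 x i32>, <4 x i32>* %p
///   %e = extractelement <4 x i32> %x, i32 %idx
/// into:
///   %gep = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 0, i32 %idx
///   %e.scalar = load i32, i32* %gep
/// assuming %idx can be proven in bounds and no stores intervene.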
861*5f7ddb14SDimitry Andric bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
862*5f7ddb14SDimitry Andric   Value *Ptr;
863*5f7ddb14SDimitry Andric   Value *Idx;
864*5f7ddb14SDimitry Andric   if (!match(&I, m_ExtractElt(m_Load(m_Value(Ptr)), m_Value(Idx))))
865*5f7ddb14SDimitry Andric     return false;
866*5f7ddb14SDimitry Andric 
867*5f7ddb14SDimitry Andric   auto *LI = cast<LoadInst>(I.getOperand(0));
868*5f7ddb14SDimitry Andric   const DataLayout &DL = I.getModule()->getDataLayout();
869*5f7ddb14SDimitry Andric   if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(LI->getType()))
870*5f7ddb14SDimitry Andric     return false;
871*5f7ddb14SDimitry Andric 
872*5f7ddb14SDimitry Andric   auto *FixedVT = dyn_cast<FixedVectorType>(LI->getType());
873*5f7ddb14SDimitry Andric   if (!FixedVT)
874*5f7ddb14SDimitry Andric     return false;
875*5f7ddb14SDimitry Andric 
876*5f7ddb14SDimitry Andric   InstructionCost OriginalCost = TTI.getMemoryOpCost(
877*5f7ddb14SDimitry Andric       Instruction::Load, LI->getType(), LI->getAlign(),
878*5f7ddb14SDimitry Andric       LI->getPointerAddressSpace());
879*5f7ddb14SDimitry Andric   InstructionCost ScalarizedCost = 0;
880*5f7ddb14SDimitry Andric 
881*5f7ddb14SDimitry Andric   Instruction *LastCheckedInst = LI;
882*5f7ddb14SDimitry Andric   unsigned NumInstChecked = 0;
883*5f7ddb14SDimitry Andric   // Check if all users of the load are extracts with no memory modifications
884*5f7ddb14SDimitry Andric   // between the load and the extract. Compute the cost of both the original
885*5f7ddb14SDimitry Andric   // code and the scalarized version.
886*5f7ddb14SDimitry Andric   for (User *U : LI->users()) {
887*5f7ddb14SDimitry Andric     auto *UI = dyn_cast<ExtractElementInst>(U);
888*5f7ddb14SDimitry Andric     if (!UI || UI->getParent() != LI->getParent())
889*5f7ddb14SDimitry Andric       return false;
890*5f7ddb14SDimitry Andric 
891*5f7ddb14SDimitry Andric     if (!isGuaranteedNotToBePoison(UI->getOperand(1), &AC, LI, &DT))
892*5f7ddb14SDimitry Andric       return false;
893*5f7ddb14SDimitry Andric 
894*5f7ddb14SDimitry Andric     // Check if any instruction between the load and the extract may modify
895*5f7ddb14SDimitry Andric     // memory.
896*5f7ddb14SDimitry Andric     if (LastCheckedInst->comesBefore(UI)) {
897*5f7ddb14SDimitry Andric       for (Instruction &I :
898*5f7ddb14SDimitry Andric            make_range(std::next(LI->getIterator()), UI->getIterator())) {
899*5f7ddb14SDimitry Andric         // Bail out if we reached the check limit or the instruction may write
900*5f7ddb14SDimitry Andric         // to memory.
901*5f7ddb14SDimitry Andric         if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
902*5f7ddb14SDimitry Andric           return false;
903*5f7ddb14SDimitry Andric         NumInstChecked++;
904*5f7ddb14SDimitry Andric       }
905*5f7ddb14SDimitry Andric     }
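    // Note: the scan always restarts just after the load, so an instruction
    // may be counted more than once across users; that only makes the
    // MaxInstrsToScan limit more conservative.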
906*5f7ddb14SDimitry Andric 
907*5f7ddb14SDimitry Andric     if (LastCheckedInst->comesBefore(UI))
908*5f7ddb14SDimitry Andric       LastCheckedInst = UI;
911*5f7ddb14SDimitry Andric 
912*5f7ddb14SDimitry Andric     if (!canScalarizeAccess(FixedVT, UI->getOperand(1), &I, AC))
913*5f7ddb14SDimitry Andric       return false;
914*5f7ddb14SDimitry Andric 
915*5f7ddb14SDimitry Andric     auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
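    // Pass -1 for a non-constant index; TTI treats that as an unknown lane.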
916*5f7ddb14SDimitry Andric     OriginalCost +=
917*5f7ddb14SDimitry Andric         TTI.getVectorInstrCost(Instruction::ExtractElement, LI->getType(),
918*5f7ddb14SDimitry Andric                                Index ? Index->getZExtValue() : -1);
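    // Cost the scalar load with the minimal alignment (1); the actual, often
    // better alignment is recomputed when the loads are materialized below.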
919*5f7ddb14SDimitry Andric     ScalarizedCost +=
920*5f7ddb14SDimitry Andric         TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
921*5f7ddb14SDimitry Andric                             Align(1), LI->getPointerAddressSpace());
922*5f7ddb14SDimitry Andric     ScalarizedCost += TTI.getAddressComputationCost(FixedVT->getElementType());
923*5f7ddb14SDimitry Andric   }
924*5f7ddb14SDimitry Andric 
925*5f7ddb14SDimitry Andric   if (ScalarizedCost >= OriginalCost)
926*5f7ddb14SDimitry Andric     return false;
927*5f7ddb14SDimitry Andric 
928*5f7ddb14SDimitry Andric   // Replace extracts with narrow scalar loads.
929*5f7ddb14SDimitry Andric   for (User *U : LI->users()) {
930*5f7ddb14SDimitry Andric     auto *EI = cast<ExtractElementInst>(U);
931*5f7ddb14SDimitry Andric     Builder.SetInsertPoint(EI);
932*5f7ddb14SDimitry Andric 
933*5f7ddb14SDimitry Andric     Value *Idx = EI->getOperand(1);
934*5f7ddb14SDimitry Andric     Value *GEP =
935*5f7ddb14SDimitry Andric         Builder.CreateInBoundsGEP(FixedVT, Ptr, {Builder.getInt32(0), Idx});
936*5f7ddb14SDimitry Andric     auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
937*5f7ddb14SDimitry Andric         FixedVT->getElementType(), GEP, EI->getName() + ".scalar"));
938*5f7ddb14SDimitry Andric 
939*5f7ddb14SDimitry Andric     Align ScalarOpAlignment = computeAlignmentAfterScalarization(
940*5f7ddb14SDimitry Andric         LI->getAlign(), FixedVT->getElementType(), Idx, DL);
941*5f7ddb14SDimitry Andric     NewLoad->setAlignment(ScalarOpAlignment);
942*5f7ddb14SDimitry Andric 
943*5f7ddb14SDimitry Andric     replaceValue(*EI, *NewLoad);
944*5f7ddb14SDimitry Andric   }
945*5f7ddb14SDimitry Andric 
946*5f7ddb14SDimitry Andric   return true;
947*5f7ddb14SDimitry Andric }
948*5f7ddb14SDimitry Andric 
9495ffd83dbSDimitry Andric /// This is the entry point for all transforms. Pass manager differences are
9505ffd83dbSDimitry Andric /// handled in the callers of this function.
9515ffd83dbSDimitry Andric bool VectorCombine::run() {
9525ffd83dbSDimitry Andric   if (DisableVectorCombine)
9535ffd83dbSDimitry Andric     return false;
9545ffd83dbSDimitry Andric 
955af732203SDimitry Andric   // Don't attempt vectorization if the target does not support vectors.
956af732203SDimitry Andric   if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
957af732203SDimitry Andric     return false;
958af732203SDimitry Andric 
9595ffd83dbSDimitry Andric   bool MadeChange = false;
9605ffd83dbSDimitry Andric   for (BasicBlock &BB : F) {
9615ffd83dbSDimitry Andric     // Ignore unreachable basic blocks.
9625ffd83dbSDimitry Andric     if (!DT.isReachableFromEntry(&BB))
9635ffd83dbSDimitry Andric       continue;
964*5f7ddb14SDimitry Andric     // Use an early-increment range so we can erase instructions inside the loop.
965*5f7ddb14SDimitry Andric     for (Instruction &I : make_early_inc_range(BB)) {
9665ffd83dbSDimitry Andric       if (isa<DbgInfoIntrinsic>(I))
9675ffd83dbSDimitry Andric         continue;
9685ffd83dbSDimitry Andric       Builder.SetInsertPoint(&I);
969af732203SDimitry Andric       MadeChange |= vectorizeLoadInsert(I);
9705ffd83dbSDimitry Andric       MadeChange |= foldExtractExtract(I);
9715ffd83dbSDimitry Andric       MadeChange |= foldBitcastShuf(I);
9725ffd83dbSDimitry Andric       MadeChange |= scalarizeBinopOrCmp(I);
9735ffd83dbSDimitry Andric       MadeChange |= foldExtractedCmps(I);
974*5f7ddb14SDimitry Andric       MadeChange |= scalarizeLoadExtract(I);
975*5f7ddb14SDimitry Andric       MadeChange |= foldSingleElementStore(I);
9765ffd83dbSDimitry Andric     }
9775ffd83dbSDimitry Andric   }
9785ffd83dbSDimitry Andric 
9795ffd83dbSDimitry Andric   // We're done with transforms, so remove dead instructions.
9805ffd83dbSDimitry Andric   if (MadeChange)
9815ffd83dbSDimitry Andric     for (BasicBlock &BB : F)
9825ffd83dbSDimitry Andric       SimplifyInstructionsInBlock(&BB);
9835ffd83dbSDimitry Andric 
9845ffd83dbSDimitry Andric   return MadeChange;
9855ffd83dbSDimitry Andric }
9865ffd83dbSDimitry Andric 
9875ffd83dbSDimitry Andric // Pass manager boilerplate below here.
9885ffd83dbSDimitry Andric 
9895ffd83dbSDimitry Andric namespace {
9905ffd83dbSDimitry Andric class VectorCombineLegacyPass : public FunctionPass {
9915ffd83dbSDimitry Andric public:
9925ffd83dbSDimitry Andric   static char ID;
9935ffd83dbSDimitry Andric   VectorCombineLegacyPass() : FunctionPass(ID) {
9945ffd83dbSDimitry Andric     initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry());
9955ffd83dbSDimitry Andric   }
9965ffd83dbSDimitry Andric 
9975ffd83dbSDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
998*5f7ddb14SDimitry Andric     AU.addRequired<AssumptionCacheTracker>();
9995ffd83dbSDimitry Andric     AU.addRequired<DominatorTreeWrapperPass>();
10005ffd83dbSDimitry Andric     AU.addRequired<TargetTransformInfoWrapperPass>();
1001*5f7ddb14SDimitry Andric     AU.addRequired<AAResultsWrapperPass>();
10025ffd83dbSDimitry Andric     AU.setPreservesCFG();
10035ffd83dbSDimitry Andric     AU.addPreserved<DominatorTreeWrapperPass>();
10045ffd83dbSDimitry Andric     AU.addPreserved<GlobalsAAWrapperPass>();
10055ffd83dbSDimitry Andric     AU.addPreserved<AAResultsWrapperPass>();
10065ffd83dbSDimitry Andric     AU.addPreserved<BasicAAWrapperPass>();
10075ffd83dbSDimitry Andric     FunctionPass::getAnalysisUsage(AU);
10085ffd83dbSDimitry Andric   }
10095ffd83dbSDimitry Andric 
10105ffd83dbSDimitry Andric   bool runOnFunction(Function &F) override {
10115ffd83dbSDimitry Andric     if (skipFunction(F))
10125ffd83dbSDimitry Andric       return false;
1013*5f7ddb14SDimitry Andric     auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
10145ffd83dbSDimitry Andric     auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
10155ffd83dbSDimitry Andric     auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1016*5f7ddb14SDimitry Andric     auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
1017*5f7ddb14SDimitry Andric     VectorCombine Combiner(F, TTI, DT, AA, AC);
10185ffd83dbSDimitry Andric     return Combiner.run();
10195ffd83dbSDimitry Andric   }
10205ffd83dbSDimitry Andric };
10215ffd83dbSDimitry Andric } // namespace
10225ffd83dbSDimitry Andric 
10235ffd83dbSDimitry Andric char VectorCombineLegacyPass::ID = 0;
10245ffd83dbSDimitry Andric INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine",
10255ffd83dbSDimitry Andric                       "Optimize scalar/vector ops", false,
10265ffd83dbSDimitry Andric                       false)
1027*5f7ddb14SDimitry Andric INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
10285ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
10295ffd83dbSDimitry Andric INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine",
10305ffd83dbSDimitry Andric                     "Optimize scalar/vector ops", false, false)
10315ffd83dbSDimitry Andric Pass *llvm::createVectorCombinePass() {
10325ffd83dbSDimitry Andric   return new VectorCombineLegacyPass();
10335ffd83dbSDimitry Andric }
10345ffd83dbSDimitry Andric 
10355ffd83dbSDimitry Andric PreservedAnalyses VectorCombinePass::run(Function &F,
10365ffd83dbSDimitry Andric                                          FunctionAnalysisManager &FAM) {
1037*5f7ddb14SDimitry Andric   auto &AC = FAM.getResult<AssumptionAnalysis>(F);
10385ffd83dbSDimitry Andric   TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
10395ffd83dbSDimitry Andric   DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
1040*5f7ddb14SDimitry Andric   AAResults &AA = FAM.getResult<AAManager>(F);
1041*5f7ddb14SDimitry Andric   VectorCombine Combiner(F, TTI, DT, AA, AC);
10425ffd83dbSDimitry Andric   if (!Combiner.run())
10435ffd83dbSDimitry Andric     return PreservedAnalyses::all();
10445ffd83dbSDimitry Andric   PreservedAnalyses PA;
10455ffd83dbSDimitry Andric   PA.preserveSet<CFGAnalyses>();
10465ffd83dbSDimitry Andric   return PA;
10475ffd83dbSDimitry Andric }