1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
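//
// As an illustrative sketch, with a SIMD vector width of 4 a scalar loop such
// as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each iteration computes four elements of
// 'a' at once and the induction variable is incremented by 4, with any
// remaining iterations handled by a scalar epilogue loop (or by predication,
// see below).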
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Indicates that an epilogue is undesired and that predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
192              "will be determined by the smallest type in the loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with an estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
281 // VPlan-native vectorization path. It must be used in conjunction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function for converting Scalar types to vector types.
299 /// If the incoming type is void, we return void. If the VF is 1, we return
300 /// the scalar type.
301 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
302   if (Scalar->isVoidTy() || VF == 1)
303     return Scalar;
304   return VectorType::get(Scalar, VF);
305 }
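// For example, ToVectorTy(i32, 4) yields <4 x i32>, whereas ToVectorTy(i32, 1)
// and ToVectorTy(void, 4) return their first argument unchanged.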
306 
307 /// A helper function that returns the type of loaded or stored value.
308 static Type *getMemInstValueType(Value *I) {
309   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
310          "Expected Load or Store instruction");
311   if (auto *LI = dyn_cast<LoadInst>(I))
312     return LI->getType();
313   return cast<StoreInst>(I)->getValueOperand()->getType();
314 }
315 
316 /// A helper function that returns true if the given type is irregular. The
317 /// type is irregular if its allocated size doesn't equal the store size of an
318 /// element of the corresponding vector type at the given vectorization factor.
319 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
320   // Determine if an array of VF elements of type Ty is "bitcast compatible"
321   // with a <VF x Ty> vector.
322   if (VF > 1) {
323     auto *VectorTy = VectorType::get(Ty, VF);
324     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
325   }
326 
327   // If the vectorization factor is one, we just check if an array of type Ty
328   // requires padding between elements.
329   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
330 }
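// For example, under a typical data layout i32 is regular at VF = 4: the
// allocation size is 4 * 4 bytes, matching the 16-byte store size of
// <4 x i32>. In contrast, i1 is irregular at VF = 4: 4 * 1 byte of allocation
// versus a 1-byte store size for <4 x i1>, so the vector type is not bitcast
// compatible with an array of four i1 values.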
331 
332 /// A helper function that returns the reciprocal of the block probability of
333 /// predicated blocks. If we return X, we are assuming the predicated block
334 /// will execute once for every X iterations of the loop header.
335 ///
336 /// TODO: We should use actual block probability here, if available. Currently,
337 ///       we always assume predicated blocks have a 50% chance of executing.
338 static unsigned getReciprocalPredBlockProb() { return 2; }
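// For example, with the current value of 2, callers of this function charge a
// predicated block roughly half of its cost, on the assumption that it runs on
// every other iteration of the loop header.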
339 
340 /// A helper function that adds a 'fast' flag to floating-point operations.
341 static Value *addFastMathFlag(Value *V) {
342   if (isa<FPMathOperator>(V))
343     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
344   return V;
345 }
346 
347 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
348   if (isa<FPMathOperator>(V))
349     cast<Instruction>(V)->setFastMathFlags(FMF);
350   return V;
351 }
352 
353 /// A helper function that returns an integer or floating-point constant with
354 /// value C.
355 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
356   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
357                            : ConstantFP::get(Ty, C);
358 }
359 
360 /// Returns "best known" trip count for the specified loop \p L as defined by
361 /// the following procedure:
362 ///   1) Returns exact trip count if it is known.
363 ///   2) Returns expected trip count according to profile data if any.
364 ///   3) Returns upper bound estimate if it is known.
365 ///   4) Returns None if all of the above failed.
366 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
367   // Check if exact trip count is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
369     return ExpectedTC;
370 
371   // Check if there is an expected trip count available from profile data.
372   if (LoopVectorizeWithBlockFrequency)
373     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
374       return EstimatedTC;
375 
376   // Check if upper bound estimate is known.
377   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
378     return ExpectedTC;
379 
380   return None;
381 }
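// For example (illustrative), a loop whose latch branch carries profile
// metadata along the lines of
//   !prof !{!"branch_weights", i32 1, i32 99}
// (one exit for every 99 back edges) has no exact trip count, but step 2
// above would report an estimated trip count of roughly 100.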
382 
383 namespace llvm {
384 
385 /// InnerLoopVectorizer vectorizes loops which contain only one basic
386 /// block to a specified vectorization factor (VF).
387 /// This class performs the widening of scalars into vectors, or multiple
388 /// scalars. This class also implements the following features:
389 /// * It inserts an epilogue loop for handling loops that don't have iteration
390 ///   counts that are known to be a multiple of the vectorization factor.
391 /// * It handles the code generation for reduction variables.
392 /// * Scalarization (implementation using scalars) of un-vectorizable
393 ///   instructions.
394 /// InnerLoopVectorizer does not perform any vectorization-legality
395 /// checks, and relies on the caller to check for the different legality
396 /// aspects. The InnerLoopVectorizer relies on the
397 /// LoopVectorizationLegality class to provide information about the induction
398 /// and reduction variables that were found in the loop.
399 class InnerLoopVectorizer {
400 public:
401   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
402                       LoopInfo *LI, DominatorTree *DT,
403                       const TargetLibraryInfo *TLI,
404                       const TargetTransformInfo *TTI, AssumptionCache *AC,
405                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
406                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
407                       LoopVectorizationCostModel *CM)
408       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
409         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
410         Builder(PSE.getSE()->getContext()),
411         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
412   virtual ~InnerLoopVectorizer() = default;
413 
414   /// Create a new empty loop. Unlink the old loop and connect the new one.
415   /// Return the pre-header block of the new loop.
416   BasicBlock *createVectorizedLoopSkeleton();
417 
418   /// Widen a single instruction within the innermost loop.
419   void widenInstruction(Instruction &I);
420 
421   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
422   void fixVectorizedLoop();
423 
424   // Return true if any runtime check is added.
425   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
426 
427   /// A type for vectorized values in the new loop. Each value from the
428   /// original loop, when vectorized, is represented by UF vector values in the
429   /// new unrolled loop, where UF is the unroll factor.
430   using VectorParts = SmallVector<Value *, 2>;
431 
432   /// Vectorize a single GetElementPtrInst based on information gathered and
433   /// decisions taken during planning.
434   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
435                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
436 
437   /// Vectorize a single PHINode in a block. This method handles the induction
438   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
439   /// arbitrary length vectors.
440   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
441 
442   /// A helper function to scalarize a single Instruction in the innermost loop.
443   /// Generates a sequence of scalar instances for each lane between \p MinLane
444   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
445   /// inclusive.
446   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
447                             bool IfPredicateInstr);
448 
449   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
450   /// is provided, the integer induction variable will first be truncated to
451   /// the corresponding type.
452   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
453 
454   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
455   /// vector or scalar value on-demand if one is not yet available. When
456   /// vectorizing a loop, we visit the definition of an instruction before its
457   /// uses. When visiting the definition, we either vectorize or scalarize the
458   /// instruction, creating an entry for it in the corresponding map. (In some
459   /// cases, such as induction variables, we will create both vector and scalar
460   /// entries.) Then, as we encounter uses of the definition, we derive values
461   /// for each scalar or vector use unless such a value is already available.
462   /// For example, if we scalarize a definition and one of its uses is vector,
463   /// we build the required vector on-demand with an insertelement sequence
464   /// when visiting the use. Otherwise, if the use is scalar, we can use the
465   /// existing scalar definition.
466   ///
467   /// Return a value in the new loop corresponding to \p V from the original
468   /// loop at unroll index \p Part. If the value has already been vectorized,
469   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
470   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
471   /// a new vector value on-demand by inserting the scalar values into a vector
472   /// with an insertelement sequence. If the value has been neither vectorized
473   /// nor scalarized, it must be loop invariant, so we simply broadcast the
474   /// value into a vector.
475   Value *getOrCreateVectorValue(Value *V, unsigned Part);
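// For example, if a udiv was scalarized for VF = 4 (e.g. because it may trap
// behind a predicate) and a widened user later requests its vector form, the
// four scalar copies for the requested unroll part are packed into a single
// vector with an insertelement sequence at the point of use.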
476 
477   /// Return a value in the new loop corresponding to \p V from the original
478   /// loop at unroll and vector indices \p Instance. If the value has been
479   /// vectorized but not scalarized, the necessary extractelement instruction
480   /// will be generated.
481   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
482 
483   /// Construct the vector value of a scalarized value \p V one lane at a time.
484   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
485 
486   /// Try to vectorize the interleaved access group that \p Instr belongs to,
487   /// optionally masking the vector operations if \p BlockInMask is non-null.
488   void vectorizeInterleaveGroup(Instruction *Instr,
489                                 VectorParts *BlockInMask = nullptr);
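// For example (illustrative), loads of A[2*i] and A[2*i + 1] in the same
// iteration form an interleave group with factor 2; at VF = 4 they can be
// emitted as one wide load of 8 consecutive elements followed by two
// shufflevectors that de-interleave the even and odd lanes.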
490 
491   /// Vectorize Load and Store instructions, optionally masking the vector
492   /// operations if \p BlockInMask is non-null.
493   void vectorizeMemoryInstruction(Instruction *Instr,
494                                   VectorParts *BlockInMask = nullptr);
495 
496   /// Set the debug location in the builder using the debug location in
497   /// the instruction.
498   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
499 
500   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
501   void fixNonInductionPHIs();
502 
503 protected:
504   friend class LoopVectorizationPlanner;
505 
506   /// A small list of PHINodes.
507   using PhiVector = SmallVector<PHINode *, 4>;
508 
509   /// A type for scalarized values in the new loop. Each value from the
510   /// original loop, when scalarized, is represented by UF x VF scalar values
511   /// in the new unrolled loop, where UF is the unroll factor and VF is the
512   /// vectorization factor.
513   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
514 
515   /// Set up the values of the IVs correctly when exiting the vector loop.
516   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
517                     Value *CountRoundDown, Value *EndValue,
518                     BasicBlock *MiddleBlock);
519 
520   /// Create a new induction variable inside L.
521   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
522                                    Value *Step, Instruction *DL);
523 
524   /// Handle all cross-iteration phis in the header.
525   void fixCrossIterationPHIs();
526 
527   /// Fix a first-order recurrence. This is the second phase of vectorizing
528   /// this phi node.
529   void fixFirstOrderRecurrence(PHINode *Phi);
530 
531   /// Fix a reduction cross-iteration phi. This is the second phase of
532   /// vectorizing this phi node.
533   void fixReduction(PHINode *Phi);
534 
535   /// The loop exit block may have single-value PHI nodes with some
536   /// incoming value. While vectorizing, we only handle values that were
537   /// defined inside the loop, and each such PHI should have one value for
538   /// each predecessor of its parent basic block. See PR14725.
539   void fixLCSSAPHIs();
540 
541   /// Iteratively sink the scalarized operands of a predicated instruction into
542   /// the block that was created for it.
543   void sinkScalarOperands(Instruction *PredInst);
544 
545   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
546   /// represented as.
547   void truncateToMinimalBitwidths();
548 
549   /// Insert the new loop to the loop hierarchy and pass manager
550   /// and update the analysis passes.
551   void updateAnalysis();
552 
553   /// Create a broadcast instruction. This method generates a broadcast
554   /// instruction (shuffle) for loop invariant values and for the induction
555   /// value. If this is the induction variable then we extend it to N, N+1, ...
556   /// this is needed because each iteration in the loop corresponds to a SIMD
557   /// element.
558   virtual Value *getBroadcastInstrs(Value *V);
559 
560   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
561   /// to each vector element of Val. The sequence starts at StartIdx.
562   /// \p Opcode is relevant for FP induction variable.
563   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
564                                Instruction::BinaryOps Opcode =
565                                Instruction::BinaryOpsEnd);
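// For example, for an integer induction with StartIdx = 0 and Step = 1, a
// broadcast value <%x, %x, %x, %x> becomes <%x, %x+1, %x+2, %x+3>; FP
// inductions use the given Opcode (e.g. fadd) for the per-lane update instead.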
566 
567   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
568   /// variable on which to base the steps, \p Step is the size of the step, and
569   /// \p EntryVal is the value from the original loop that maps to the steps.
570   /// Note that \p EntryVal doesn't have to be an induction variable - it
571   /// can also be a truncate instruction.
572   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
573                         const InductionDescriptor &ID);
574 
575   /// Create a vector induction phi node based on an existing scalar one. \p
576   /// EntryVal is the value from the original loop that maps to the vector phi
577   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
578   /// truncate instruction, instead of widening the original IV, we widen a
579   /// version of the IV truncated to \p EntryVal's type.
580   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
581                                        Value *Step, Instruction *EntryVal);
582 
583   /// Returns true if an instruction \p I should be scalarized instead of
584   /// vectorized for the chosen vectorization factor.
585   bool shouldScalarizeInstruction(Instruction *I) const;
586 
587   /// Returns true if we should generate a scalar version of \p IV.
588   bool needsScalarInduction(Instruction *IV) const;
589 
590   /// If there is a cast involved in the induction variable \p ID, which should
591   /// be ignored in the vectorized loop body, this function records the
592   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
593   /// cast. We had already proved that the casted Phi is equal to the uncasted
594   /// Phi in the vectorized loop (under a runtime guard), and therefore
595   /// there is no need to vectorize the cast - the same value can be used in the
596   /// vector loop for both the Phi and the cast.
597   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
598   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
599   ///
600   /// \p EntryVal is the value from the original loop that maps to the vector
601   /// phi node and is used to distinguish what is the IV currently being
602   /// processed - original one (if \p EntryVal is a phi corresponding to the
603   /// original IV) or the "newly-created" one based on the proof mentioned above
604   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
605   /// latter case \p EntryVal is a TruncInst and we must not record anything for
606   /// that IV, but it's error-prone to expect callers of this routine to care
607   /// about that, hence this explicit parameter.
608   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
609                                              const Instruction *EntryVal,
610                                              Value *VectorLoopValue,
611                                              unsigned Part,
612                                              unsigned Lane = UINT_MAX);
613 
614   /// Generate a shuffle sequence that will reverse the vector Vec.
615   virtual Value *reverseVector(Value *Vec);
616 
617   /// Returns (and creates if needed) the original loop trip count.
618   Value *getOrCreateTripCount(Loop *NewLoop);
619 
620   /// Returns (and creates if needed) the trip count of the widened loop.
621   Value *getOrCreateVectorTripCount(Loop *NewLoop);
622 
623   /// Returns a bitcasted value to the requested vector type.
624   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
625   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
626                                 const DataLayout &DL);
627 
628   /// Emit a bypass check to see if the vector trip count is zero, including if
629   /// it overflows.
630   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
631 
632   /// Emit a bypass check to see if all of the SCEV assumptions we've
633   /// had to make are correct.
634   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
635 
636   /// Emit bypass checks to check any memory assumptions we may have made.
637   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
638 
639   /// Compute the transformed value of Index at offset StartValue using step
640   /// StepValue.
641   /// For integer induction, returns StartValue + Index * StepValue.
642   /// For pointer induction, returns StartValue[Index * StepValue].
643   /// FIXME: The newly created binary instructions should contain nsw/nuw
644   /// flags, which can be found from the original scalar operations.
645   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
646                               const DataLayout &DL,
647                               const InductionDescriptor &ID) const;
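// For example, for an integer induction with StartValue 10 and StepValue 3,
// an Index of 4 is transformed to 10 + 4 * 3 = 22; for a pointer induction the
// same Index produces a GEP equivalent to &StartValue[4 * 3].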
648 
649   /// Add additional metadata to \p To that was not present on \p Orig.
650   ///
651   /// Currently this is used to add the noalias annotations based on the
652   /// inserted memchecks.  Use this for instructions that are *cloned* into the
653   /// vector loop.
654   void addNewMetadata(Instruction *To, const Instruction *Orig);
655 
656   /// Add metadata from one instruction to another.
657   ///
658   /// This includes both the original MDs from \p From and additional ones (\see
659   /// addNewMetadata).  Use this for *newly created* instructions in the vector
660   /// loop.
661   void addMetadata(Instruction *To, Instruction *From);
662 
663   /// Similar to the previous function but it adds the metadata to a
664   /// vector of instructions.
665   void addMetadata(ArrayRef<Value *> To, Instruction *From);
666 
667   /// The original loop.
668   Loop *OrigLoop;
669 
670   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
671   /// dynamic knowledge to simplify SCEV expressions and converts them to a
672   /// more usable form.
673   PredicatedScalarEvolution &PSE;
674 
675   /// Loop Info.
676   LoopInfo *LI;
677 
678   /// Dominator Tree.
679   DominatorTree *DT;
680 
681   /// Alias Analysis.
682   AliasAnalysis *AA;
683 
684   /// Target Library Info.
685   const TargetLibraryInfo *TLI;
686 
687   /// Target Transform Info.
688   const TargetTransformInfo *TTI;
689 
690   /// Assumption Cache.
691   AssumptionCache *AC;
692 
693   /// Interface to emit optimization remarks.
694   OptimizationRemarkEmitter *ORE;
695 
696   /// LoopVersioning.  It's only set up (non-null) if memchecks were
697   /// used.
698   ///
699   /// This is currently only used to add no-alias metadata based on the
700   /// memchecks.  The actual versioning is performed manually.
701   std::unique_ptr<LoopVersioning> LVer;
702 
703   /// The vectorization SIMD factor to use. Each vector will have this many
704   /// vector elements.
705   unsigned VF;
706 
707   /// The vectorization unroll factor to use. Each scalar is vectorized to this
708   /// many different vector instructions.
709   unsigned UF;
710 
711   /// The builder that we use
712   IRBuilder<> Builder;
713 
714   // --- Vectorization state ---
715 
716   /// The vector-loop preheader.
717   BasicBlock *LoopVectorPreHeader;
718 
719   /// The scalar-loop preheader.
720   BasicBlock *LoopScalarPreHeader;
721 
722   /// Middle Block between the vector and the scalar.
723   BasicBlock *LoopMiddleBlock;
724 
725   /// The ExitBlock of the scalar loop.
726   BasicBlock *LoopExitBlock;
727 
728   /// The vector loop body.
729   BasicBlock *LoopVectorBody;
730 
731   /// The scalar loop body.
732   BasicBlock *LoopScalarBody;
733 
734   /// A list of all bypass blocks. The first block is the entry of the loop.
735   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
736 
737   /// The new Induction variable which was added to the new block.
738   PHINode *Induction = nullptr;
739 
740   /// The induction variable of the old basic block.
741   PHINode *OldInduction = nullptr;
742 
743   /// Maps values from the original loop to their corresponding values in the
744   /// vectorized loop. A key value can map to either vector values, scalar
745   /// values or both kinds of values, depending on whether the key was
746   /// vectorized and scalarized.
747   VectorizerValueMap VectorLoopValueMap;
748 
749   /// Store instructions that were predicated.
750   SmallVector<Instruction *, 4> PredicatedInstructions;
751 
752   /// Trip count of the original loop.
753   Value *TripCount = nullptr;
754 
755   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
756   Value *VectorTripCount = nullptr;
757 
758   /// The legality analysis.
759   LoopVectorizationLegality *Legal;
760 
761   /// The profitability analysis.
762   LoopVectorizationCostModel *Cost;
763 
764   // Record whether runtime checks are added.
765   bool AddedSafetyChecks = false;
766 
767   // Holds the end values for each induction variable. We save the end values
768   // so we can later fix-up the external users of the induction variables.
769   DenseMap<PHINode *, Value *> IVEndValues;
770 
771   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
772   // fixed up at the end of vector code generation.
773   SmallVector<PHINode *, 8> OrigPHIsToFix;
774 };
775 
776 class InnerLoopUnroller : public InnerLoopVectorizer {
777 public:
778   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
779                     LoopInfo *LI, DominatorTree *DT,
780                     const TargetLibraryInfo *TLI,
781                     const TargetTransformInfo *TTI, AssumptionCache *AC,
782                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
783                     LoopVectorizationLegality *LVL,
784                     LoopVectorizationCostModel *CM)
785       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
786                             UnrollFactor, LVL, CM) {}
787 
788 private:
789   Value *getBroadcastInstrs(Value *V) override;
790   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
791                        Instruction::BinaryOps Opcode =
792                        Instruction::BinaryOpsEnd) override;
793   Value *reverseVector(Value *Vec) override;
794 };
795 
796 } // end namespace llvm
797 
798 /// Look for a meaningful debug location on the instruction or its
799 /// operands.
800 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
801   if (!I)
802     return I;
803 
804   DebugLoc Empty;
805   if (I->getDebugLoc() != Empty)
806     return I;
807 
808   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
809     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
810       if (OpInst->getDebugLoc() != Empty)
811         return OpInst;
812   }
813 
814   return I;
815 }
816 
817 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
818   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
819     const DILocation *DIL = Inst->getDebugLoc();
820     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
821         !isa<DbgInfoIntrinsic>(Inst)) {
822       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
823       if (NewDIL)
824         B.SetCurrentDebugLocation(NewDIL.getValue());
825       else
826         LLVM_DEBUG(dbgs()
827                    << "Failed to create new discriminator: "
828                    << DIL->getFilename() << " Line: " << DIL->getLine());
829     }
830     else
831       B.SetCurrentDebugLocation(DIL);
832   } else
833     B.SetCurrentDebugLocation(DebugLoc());
834 }
835 
836 /// Write a record \p DebugMsg about vectorization failure to the debug
837 /// output stream. If \p I is passed, it is an instruction that prevents
838 /// vectorization.
839 #ifndef NDEBUG
840 static void debugVectorizationFailure(const StringRef DebugMsg,
841     Instruction *I) {
842   dbgs() << "LV: Not vectorizing: " << DebugMsg;
843   if (I != nullptr)
844     dbgs() << " " << *I;
845   else
846     dbgs() << '.';
847   dbgs() << '\n';
848 }
849 #endif
850 
851 /// Create an analysis remark that explains why vectorization failed
852 ///
853 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
854 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
855 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
856 /// the location of the remark.  \return the remark object that can be
857 /// streamed to.
858 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
859     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
860   Value *CodeRegion = TheLoop->getHeader();
861   DebugLoc DL = TheLoop->getStartLoc();
862 
863   if (I) {
864     CodeRegion = I->getParent();
865     // If there is no debug location attached to the instruction, fall back to
866     // using the loop's.
867     if (I->getDebugLoc())
868       DL = I->getDebugLoc();
869   }
870 
871   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
872   R << "loop not vectorized: ";
873   return R;
874 }
875 
876 namespace llvm {
877 
878 void reportVectorizationFailure(const StringRef DebugMsg,
879     const StringRef OREMsg, const StringRef ORETag,
880     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
881   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
882   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
883   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
884                 ORETag, TheLoop, I) << OREMsg);
885 }
886 
887 } // end namespace llvm
888 
889 #ifndef NDEBUG
890 /// \return string containing a file name and a line # for the given loop.
891 static std::string getDebugLocString(const Loop *L) {
892   std::string Result;
893   if (L) {
894     raw_string_ostream OS(Result);
895     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
896       LoopDbgLoc.print(OS);
897     else
898       // Just print the module name.
899       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
900     OS.flush();
901   }
902   return Result;
903 }
904 #endif
905 
906 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
907                                          const Instruction *Orig) {
908   // If the loop was versioned with memchecks, add the corresponding no-alias
909   // metadata.
910   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
911     LVer->annotateInstWithNoAlias(To, Orig);
912 }
913 
914 void InnerLoopVectorizer::addMetadata(Instruction *To,
915                                       Instruction *From) {
916   propagateMetadata(To, From);
917   addNewMetadata(To, From);
918 }
919 
920 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
921                                       Instruction *From) {
922   for (Value *V : To) {
923     if (Instruction *I = dyn_cast<Instruction>(V))
924       addMetadata(I, From);
925   }
926 }
927 
928 namespace llvm {
929 
930 // Loop vectorization cost-model hints about how the scalar epilogue loop
931 // should be lowered.
932 enum ScalarEpilogueLowering {
933 
934   // The default: allowing scalar epilogues.
935   CM_ScalarEpilogueAllowed,
936 
937   // Vectorization with OptForSize: don't allow epilogues.
938   CM_ScalarEpilogueNotAllowedOptSize,
939 
940   // A special case of vectorization with OptForSize: loops with a very small
941   // trip count are considered for vectorization under OptForSize, thereby
942   // making sure the cost of their loop body is dominant, free of runtime
943   // guards and scalar iteration overheads.
944   CM_ScalarEpilogueNotAllowedLowTripLoop,
945 
946   // Loop hint predicate indicating an epilogue is undesired.
947   CM_ScalarEpilogueNotNeededUsePredicate
948 };
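// CM_ScalarEpilogueNotNeededUsePredicate is typically the result of a loop
// hint such as Clang's "#pragma clang loop vectorize_predicate(enable)" or of
// the -prefer-predicate-over-epilog flag defined above.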
949 
950 /// LoopVectorizationCostModel - estimates the expected speedups due to
951 /// vectorization.
952 /// In many cases vectorization is not profitable. This can happen for
953 /// a number of reasons. In this class we mainly attempt to predict the
954 /// expected speedup/slowdowns due to the supported instruction set. We use the
955 /// TargetTransformInfo to query the different backends for the cost of
956 /// different operations.
957 class LoopVectorizationCostModel {
958 public:
959   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
960                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
961                              LoopVectorizationLegality *Legal,
962                              const TargetTransformInfo &TTI,
963                              const TargetLibraryInfo *TLI, DemandedBits *DB,
964                              AssumptionCache *AC,
965                              OptimizationRemarkEmitter *ORE, const Function *F,
966                              const LoopVectorizeHints *Hints,
967                              InterleavedAccessInfo &IAI)
968       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
969         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
970         Hints(Hints), InterleaveInfo(IAI) {}
971 
972   /// \return An upper bound for the vectorization factor, or None if
973   /// vectorization and interleaving should be avoided up front.
974   Optional<unsigned> computeMaxVF();
975 
976   /// \return True if runtime checks are required for vectorization, and false
977   /// otherwise.
978   bool runtimeChecksRequired();
979 
980   /// \return The most profitable vectorization factor and the cost of that VF.
981   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
982   /// then this vectorization factor will be selected if vectorization is
983   /// possible.
984   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
985 
986   /// Setup cost-based decisions for user vectorization factor.
987   void selectUserVectorizationFactor(unsigned UserVF) {
988     collectUniformsAndScalars(UserVF);
989     collectInstsToScalarize(UserVF);
990   }
991 
992   /// \return The size (in bits) of the smallest and widest types in the code
993   /// that needs to be vectorized. We ignore values that remain scalar such as
994   /// 64 bit loop indices.
995   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
996 
997   /// \return The desired interleave count.
998   /// If interleave count has been specified by metadata it will be returned.
999   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1000   /// are the selected vectorization factor and the cost of the selected VF.
1001   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1002 
1003   /// A memory access instruction may be vectorized in more than one way.
1004   /// The form of the instruction after vectorization depends on its cost.
1005   /// This function takes cost-based decisions for Load/Store instructions
1006   /// and collects them in a map. This decision map is used for building
1007   /// the lists of loop-uniform and loop-scalar instructions.
1008   /// The calculated cost is saved with widening decision in order to
1009   /// avoid redundant calculations.
1010   void setCostBasedWideningDecision(unsigned VF);
1011 
1012   /// A struct that represents some properties of the register usage
1013   /// of a loop.
1014   struct RegisterUsage {
1015     /// Holds the number of loop invariant values that are used in the loop.
1016     /// The key is ClassID of target-provided register class.
1017     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1018     /// Holds the maximum number of concurrent live intervals in the loop.
1019     /// The key is ClassID of target-provided register class.
1020     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1021   };
1022 
1023   /// \return Returns information about the register usages of the loop for the
1024   /// given vectorization factors.
1025   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1026 
1027   /// Collect values we want to ignore in the cost model.
1028   void collectValuesToIgnore();
1029 
1030   /// \returns The smallest bitwidth each instruction can be represented with.
1031   /// The vector equivalents of these instructions should be truncated to this
1032   /// type.
1033   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1034     return MinBWs;
1035   }
1036 
1037   /// \returns True if it is more profitable to scalarize instruction \p I for
1038   /// vectorization factor \p VF.
1039   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1040     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1041 
1042     // Cost model is not run in the VPlan-native path - return conservative
1043     // result until this changes.
1044     if (EnableVPlanNativePath)
1045       return false;
1046 
1047     auto Scalars = InstsToScalarize.find(VF);
1048     assert(Scalars != InstsToScalarize.end() &&
1049            "VF not yet analyzed for scalarization profitability");
1050     return Scalars->second.find(I) != Scalars->second.end();
1051   }
1052 
1053   /// Returns true if \p I is known to be uniform after vectorization.
1054   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1055     if (VF == 1)
1056       return true;
1057 
1058     // Cost model is not run in the VPlan-native path - return conservative
1059     // result until this changes.
1060     if (EnableVPlanNativePath)
1061       return false;
1062 
1063     auto UniformsPerVF = Uniforms.find(VF);
1064     assert(UniformsPerVF != Uniforms.end() &&
1065            "VF not yet analyzed for uniformity");
1066     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1067   }
1068 
1069   /// Returns true if \p I is known to be scalar after vectorization.
1070   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1071     if (VF == 1)
1072       return true;
1073 
1074     // Cost model is not run in the VPlan-native path - return conservative
1075     // result until this changes.
1076     if (EnableVPlanNativePath)
1077       return false;
1078 
1079     auto ScalarsPerVF = Scalars.find(VF);
1080     assert(ScalarsPerVF != Scalars.end() &&
1081            "Scalar values are not calculated for VF");
1082     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1083   }
1084 
1085   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1086   /// for vectorization factor \p VF.
1087   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1088     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1089            !isProfitableToScalarize(I, VF) &&
1090            !isScalarAfterVectorization(I, VF);
1091   }
1092 
1093   /// Decision that was taken during cost calculation for memory instruction.
1094   enum InstWidening {
1095     CM_Unknown,
1096     CM_Widen,         // For consecutive accesses with stride +1.
1097     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1098     CM_Interleave,
1099     CM_GatherScatter,
1100     CM_Scalarize
1101   };
1102 
1103   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1104   /// instruction \p I and vector width \p VF.
1105   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1106                            unsigned Cost) {
1107     assert(VF >= 2 && "Expected VF >=2");
1108     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1109   }
1110 
1111   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1112   /// interleaving group \p Grp and vector width \p VF.
1113   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1114                            InstWidening W, unsigned Cost) {
1115     assert(VF >= 2 && "Expected VF >=2");
1116     /// Broadcast this decision to all instructions inside the group.
1117     /// But the cost will be assigned to one instruction only.
1118     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1119       if (auto *I = Grp->getMember(i)) {
1120         if (Grp->getInsertPos() == I)
1121           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1122         else
1123           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1124       }
1125     }
1126   }
1127 
1128   /// Return the cost model decision for the given instruction \p I and vector
1129   /// width \p VF. Return CM_Unknown if this instruction did not pass
1130   /// through the cost modeling.
1131   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1132     assert(VF >= 2 && "Expected VF >=2");
1133 
1134     // Cost model is not run in the VPlan-native path - return conservative
1135     // result until this changes.
1136     if (EnableVPlanNativePath)
1137       return CM_GatherScatter;
1138 
1139     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1140     auto Itr = WideningDecisions.find(InstOnVF);
1141     if (Itr == WideningDecisions.end())
1142       return CM_Unknown;
1143     return Itr->second.first;
1144   }
1145 
1146   /// Return the vectorization cost for the given instruction \p I and vector
1147   /// width \p VF.
1148   unsigned getWideningCost(Instruction *I, unsigned VF) {
1149     assert(VF >= 2 && "Expected VF >=2");
1150     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1151     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1152            "The cost is not calculated");
1153     return WideningDecisions[InstOnVF].second;
1154   }
1155 
1156   /// Return True if instruction \p I is an optimizable truncate whose operand
1157   /// is an induction variable. Such a truncate will be removed by adding a new
1158   /// induction variable with the destination type.
1159   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1160     // If the instruction is not a truncate, return false.
1161     auto *Trunc = dyn_cast<TruncInst>(I);
1162     if (!Trunc)
1163       return false;
1164 
1165     // Get the source and destination types of the truncate.
1166     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1167     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1168 
1169     // If the truncate is free for the given types, return false. Replacing a
1170     // free truncate with an induction variable would add an induction variable
1171     // update instruction to each iteration of the loop. We exclude from this
1172     // check the primary induction variable since it will need an update
1173     // instruction regardless.
1174     Value *Op = Trunc->getOperand(0);
1175     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1176       return false;
1177 
1178     // If the truncated value is not an induction variable, return false.
1179     return Legal->isInductionPhi(Op);
1180   }
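// For example (illustrative), given a primary i64 induction %iv, a
// "trunc i64 %iv to i32" feeding the vector body can be replaced by a new i32
// induction variable with the same start and step, so the truncate itself
// never needs to be widened.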
1181 
1182   /// Collects the instructions to scalarize for each predicated instruction in
1183   /// the loop.
1184   void collectInstsToScalarize(unsigned VF);
1185 
1186   /// Collect Uniform and Scalar values for the given \p VF.
1187   /// The sets depend on CM decision for Load/Store instructions
1188   /// that may be vectorized as interleave, gather-scatter or scalarized.
1189   void collectUniformsAndScalars(unsigned VF) {
1190     // Do the analysis once.
1191     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1192       return;
1193     setCostBasedWideningDecision(VF);
1194     collectLoopUniforms(VF);
1195     collectLoopScalars(VF);
1196   }
1197 
1198   /// Returns true if the target machine supports masked store operation
1199   /// for the given \p DataType and kind of access to \p Ptr.
1200   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1201     return Legal->isConsecutivePtr(Ptr) &&
1202            TTI.isLegalMaskedStore(DataType, Alignment);
1203   }
1204 
1205   /// Returns true if the target machine supports masked load operation
1206   /// for the given \p DataType and kind of access to \p Ptr.
1207   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1208     return Legal->isConsecutivePtr(Ptr) &&
1209            TTI.isLegalMaskedLoad(DataType, Alignment);
1210   }
1211 
1212   /// Returns true if the target machine supports masked scatter operation
1213   /// for the given \p DataType.
1214   bool isLegalMaskedScatter(Type *DataType) {
1215     return TTI.isLegalMaskedScatter(DataType);
1216   }
1217 
1218   /// Returns true if the target machine supports masked gather operation
1219   /// for the given \p DataType.
1220   bool isLegalMaskedGather(Type *DataType) {
1221     return TTI.isLegalMaskedGather(DataType);
1222   }
1223 
1224   /// Returns true if the target machine can represent \p V as a masked gather
1225   /// or scatter operation.
1226   bool isLegalGatherOrScatter(Value *V) {
1227     bool LI = isa<LoadInst>(V);
1228     bool SI = isa<StoreInst>(V);
1229     if (!LI && !SI)
1230       return false;
1231     auto *Ty = getMemInstValueType(V);
1232     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1233   }
1234 
1235   /// Returns true if \p I is an instruction that will be scalarized with
1236   /// predication. Such instructions include conditional stores and
1237   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1240   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1241 
1242   // Returns true if \p I is an instruction that will be predicated either
1243   // through scalar predication or masked load/store or masked gather/scatter.
1244   // Superset of instructions that return true for isScalarWithPredication.
1245   bool isPredicatedInst(Instruction *I) {
1246     if (!blockNeedsPredication(I->getParent()))
1247       return false;
1248     // Loads and stores that need some form of masked operation are predicated
1249     // instructions.
1250     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1251       return Legal->isMaskRequired(I);
1252     return isScalarWithPredication(I);
1253   }
1254 
1255   /// Returns true if \p I is a memory instruction with consecutive memory
1256   /// access that can be widened.
1257   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1258 
1259   /// Returns true if \p I is a memory instruction in an interleaved-group
1260   /// of memory accesses that can be vectorized with wide vector loads/stores
1261   /// and shuffles.
1262   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1263 
1264   /// Check if \p Instr belongs to any interleaved access group.
1265   bool isAccessInterleaved(Instruction *Instr) {
1266     return InterleaveInfo.isInterleaved(Instr);
1267   }
1268 
1269   /// Get the interleaved access group that \p Instr belongs to.
1270   const InterleaveGroup<Instruction> *
1271   getInterleavedAccessGroup(Instruction *Instr) {
1272     return InterleaveInfo.getInterleaveGroup(Instr);
1273   }
1274 
1275   /// Returns true if an interleaved group requires a scalar iteration
1276   /// to handle accesses with gaps, and there is nothing preventing us from
1277   /// creating a scalar epilogue.
1278   bool requiresScalarEpilogue() const {
1279     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1280   }
1281 
1282   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1283   /// loop hint annotation.
1284   bool isScalarEpilogueAllowed() const {
1285     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1286   }
1287 
  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
1289   bool foldTailByMasking() const { return FoldTailByMasking; }
1290 
1291   bool blockNeedsPredication(BasicBlock *BB) {
1292     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1293   }
1294 
1295   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1296   /// with factor VF.  Return the cost of the instruction, including
1297   /// scalarization overhead if it's needed.
1298   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1299 
1300   /// Estimate cost of a call instruction CI if it were vectorized with factor
1301   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1305   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1306 
1307 private:
1308   unsigned NumPredStores = 0;
1309 
1310   /// \return An upper bound for the vectorization factor, larger than zero.
1311   /// One is returned if vectorization should best be avoided due to cost.
1312   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1313 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1321   using VectorizationCostTy = std::pair<unsigned, bool>;
1322 
1323   /// Returns the expected execution cost. The unit of the cost does
1324   /// not matter because we use the 'cost' units to compare different
1325   /// vector widths. The cost that is returned is *not* normalized by
1326   /// the factor width.
1327   VectorizationCostTy expectedCost(unsigned VF);
1328 
1329   /// Returns the execution time cost of an instruction for a given vector
1330   /// width. Vector width of one means scalar.
1331   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1332 
1333   /// The cost-computation logic from getInstructionCost which provides
1334   /// the vector type as an output parameter.
1335   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1336 
1337   /// Calculate vectorization cost of memory instruction \p I.
1338   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1339 
  /// The cost computation for a scalarized memory instruction.
1341   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1342 
  /// The cost computation for an interleaving group of memory instructions.
1344   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1345 
  /// The cost computation for a Gather/Scatter instruction.
1347   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1348 
1349   /// The cost computation for widening instruction \p I with consecutive
1350   /// memory access.
1351   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1352 
1353   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1354   /// Load: scalar load + broadcast.
1355   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1356   /// element)
1357   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1358 
1359   /// Estimate the overhead of scalarizing an instruction. This is a
1360   /// convenience wrapper for the type-based getScalarizationOverhead API.
1361   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1362 
  /// Returns whether the instruction is a load or store and will be emitted
1364   /// as a vector operation.
1365   bool isConsecutiveLoadOrStore(Instruction *I);
1366 
1367   /// Returns true if an artificially high cost for emulated masked memrefs
1368   /// should be used.
1369   bool useEmulatedMaskMemRefHack(Instruction *I);
1370 
1371   /// Map of scalar integer values to the smallest bitwidth they can be legally
1372   /// represented as. The vector equivalents of these values should be truncated
1373   /// to this type.
1374   MapVector<Instruction *, uint64_t> MinBWs;
1375 
1376   /// A type representing the costs for instructions if they were to be
1377   /// scalarized rather than vectorized. The entries are Instruction-Cost
1378   /// pairs.
1379   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1380 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1383   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1384 
1385   /// Records whether it is allowed to have the original scalar loop execute at
1386   /// least once. This may be needed as a fallback loop in case runtime
1387   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1389   /// or as a peel-loop to handle gaps in interleave-groups.
1390   /// Under optsize and when the trip count is very small we don't allow any
1391   /// iterations to execute in the scalar loop.
1392   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1393 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1395   bool FoldTailByMasking = false;
1396 
1397   /// A map holding scalar costs for different vectorization factors. The
1398   /// presence of a cost for an instruction in the mapping indicates that the
1399   /// instruction will be scalarized when vectorizing with the associated
1400   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1401   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1402 
1403   /// Holds the instructions known to be uniform after vectorization.
1404   /// The data is collected per VF.
1405   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1406 
1407   /// Holds the instructions known to be scalar after vectorization.
1408   /// The data is collected per VF.
1409   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1410 
1411   /// Holds the instructions (address computations) that are forced to be
1412   /// scalarized.
1413   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1414 
1415   /// Returns the expected difference in cost from scalarizing the expression
1416   /// feeding a predicated instruction \p PredInst. The instructions to
1417   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1418   /// non-negative return value implies the expression will be scalarized.
1419   /// Currently, only single-use chains are considered for scalarization.
1420   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1421                               unsigned VF);
1422 
1423   /// Collect the instructions that are uniform after vectorization. An
1424   /// instruction is uniform if we represent it with a single scalar value in
1425   /// the vectorized loop corresponding to each vector iteration. Examples of
1426   /// uniform instructions include pointer operands of consecutive or
1427   /// interleaved memory accesses. Note that although uniformity implies an
1428   /// instruction will be scalar, the reverse is not true. In general, a
1429   /// scalarized instruction will be represented by VF scalar values in the
1430   /// vectorized loop, each corresponding to an iteration of the original
1431   /// scalar loop.
1432   void collectLoopUniforms(unsigned VF);
1433 
1434   /// Collect the instructions that are scalar after vectorization. An
1435   /// instruction is scalar if it is known to be uniform or will be scalarized
1436   /// during vectorization. Non-uniform scalarized instructions will be
1437   /// represented by VF values in the vectorized loop, each corresponding to an
1438   /// iteration of the original scalar loop.
1439   void collectLoopScalars(unsigned VF);
1440 
1441   /// Keeps cost model vectorization decision and cost for instructions.
1442   /// Right now it is used for memory instructions only.
1443   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1444                                 std::pair<InstWidening, unsigned>>;
1445 
1446   DecisionList WideningDecisions;
1447 
1448   /// Returns true if \p V is expected to be vectorized and it needs to be
1449   /// extracted.
1450   bool needsExtract(Value *V, unsigned VF) const {
1451     Instruction *I = dyn_cast<Instruction>(V);
1452     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1453       return false;
1454 
1455     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1457     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1458     // the scalars are collected. That should be a safe assumption in most
1459     // cases, because we check if the operands have vectorizable types
1460     // beforehand in LoopVectorizationLegality.
1461     return Scalars.find(VF) == Scalars.end() ||
1462            !isScalarAfterVectorization(I, VF);
  }
1464 
1465   /// Returns a range containing only operands needing to be extracted.
1466   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1467                                                    unsigned VF) {
1468     return SmallVector<Value *, 4>(make_filter_range(
1469         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1470   }
1471 
1472 public:
1473   /// The loop that we evaluate.
1474   Loop *TheLoop;
1475 
1476   /// Predicated scalar evolution analysis.
1477   PredicatedScalarEvolution &PSE;
1478 
1479   /// Loop Info analysis.
1480   LoopInfo *LI;
1481 
1482   /// Vectorization legality.
1483   LoopVectorizationLegality *Legal;
1484 
1485   /// Vector target information.
1486   const TargetTransformInfo &TTI;
1487 
1488   /// Target Library Info.
1489   const TargetLibraryInfo *TLI;
1490 
1491   /// Demanded bits analysis.
1492   DemandedBits *DB;
1493 
1494   /// Assumption cache.
1495   AssumptionCache *AC;
1496 
1497   /// Interface to emit optimization remarks.
1498   OptimizationRemarkEmitter *ORE;
1499 
1500   const Function *TheFunction;
1501 
1502   /// Loop Vectorize Hint.
1503   const LoopVectorizeHints *Hints;
1504 
1505   /// The interleave access information contains groups of interleaved accesses
1506   /// with the same stride and close to each other.
1507   InterleavedAccessInfo &InterleaveInfo;
1508 
1509   /// Values to ignore in the cost model.
1510   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1511 
1512   /// Values to ignore in the cost model when VF > 1.
1513   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1514 };
1515 
1516 } // end namespace llvm
1517 
1518 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1519 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1525 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1526 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1527 // provides *explicit vectorization hints* (LV can bypass legal checks and
1528 // assume that vectorization is legal). However, both hints are implemented
1529 // using the same metadata (llvm.loop.vectorize, processed by
1530 // LoopVectorizeHints). This will be fixed in the future when the native IR
1531 // representation for pragma 'omp simd' is introduced.
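//
// As an illustrative example (the loop and names are made up), an outer loop
// written as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// carries an explicit vector length of 4 and is therefore a candidate here,
// whereas the same loop without the vectorize_width clause is not considered
// explicitly annotated.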
1532 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1533                                    OptimizationRemarkEmitter *ORE) {
1534   assert(!OuterLp->empty() && "This is not an outer loop");
1535   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1536 
1537   // Only outer loops with an explicit vectorization hint are supported.
1538   // Unannotated outer loops are ignored.
1539   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1540     return false;
1541 
1542   Function *Fn = OuterLp->getHeader()->getParent();
1543   if (!Hints.allowVectorization(Fn, OuterLp,
1544                                 true /*VectorizeOnlyWhenForced*/)) {
1545     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1546     return false;
1547   }
1548 
1549   if (Hints.getInterleave() > 1) {
1550     // TODO: Interleave support is future work.
1551     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1552                          "outer loops.\n");
1553     Hints.emitRemarkWithHints();
1554     return false;
1555   }
1556 
1557   return true;
1558 }
1559 
1560 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1561                                   OptimizationRemarkEmitter *ORE,
1562                                   SmallVectorImpl<Loop *> &V) {
1563   // Collect inner loops and outer loops without irreducible control flow. For
1564   // now, only collect outer loops that have explicit vectorization hints. If we
1565   // are stress testing the VPlan H-CFG construction, we collect the outermost
1566   // loop of every loop nest.
1567   if (L.empty() || VPlanBuildStressTest ||
1568       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1569     LoopBlocksRPO RPOT(&L);
1570     RPOT.perform(LI);
1571     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1572       V.push_back(&L);
1573       // TODO: Collect inner loops inside marked outer loops in case
1574       // vectorization fails for the outer loop. Do not invoke
1575       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1576       // already known to be reducible. We can use an inherited attribute for
1577       // that.
1578       return;
1579     }
1580   }
1581   for (Loop *InnerL : L)
1582     collectSupportedLoops(*InnerL, LI, ORE, V);
1583 }
1584 
1585 namespace {
1586 
1587 /// The LoopVectorize Pass.
1588 struct LoopVectorize : public FunctionPass {
1589   /// Pass identification, replacement for typeid
1590   static char ID;
1591 
1592   LoopVectorizePass Impl;
1593 
1594   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1595                          bool VectorizeOnlyWhenForced = false)
1596       : FunctionPass(ID) {
1597     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1598     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1599     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1600   }
1601 
1602   bool runOnFunction(Function &F) override {
1603     if (skipFunction(F))
1604       return false;
1605 
1606     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1607     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1608     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1609     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1610     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1611     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1612     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1613     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1614     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1615     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1616     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1617     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1618     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1619 
1620     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1621         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1622 
1623     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1624                         GetLAA, *ORE, PSI);
1625   }
1626 
1627   void getAnalysisUsage(AnalysisUsage &AU) const override {
1628     AU.addRequired<AssumptionCacheTracker>();
1629     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1630     AU.addRequired<DominatorTreeWrapperPass>();
1631     AU.addRequired<LoopInfoWrapperPass>();
1632     AU.addRequired<ScalarEvolutionWrapperPass>();
1633     AU.addRequired<TargetTransformInfoWrapperPass>();
1634     AU.addRequired<AAResultsWrapperPass>();
1635     AU.addRequired<LoopAccessLegacyAnalysis>();
1636     AU.addRequired<DemandedBitsWrapperPass>();
1637     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1638     AU.addRequired<InjectTLIMappingsLegacy>();
1639 
1640     // We currently do not preserve loopinfo/dominator analyses with outer loop
1641     // vectorization. Until this is addressed, mark these analyses as preserved
1642     // only for non-VPlan-native path.
1643     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1644     if (!EnableVPlanNativePath) {
1645       AU.addPreserved<LoopInfoWrapperPass>();
1646       AU.addPreserved<DominatorTreeWrapperPass>();
1647     }
1648 
1649     AU.addPreserved<BasicAAWrapperPass>();
1650     AU.addPreserved<GlobalsAAWrapperPass>();
1651     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1652   }
1653 };
1654 
1655 } // end anonymous namespace
1656 
1657 //===----------------------------------------------------------------------===//
1658 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1659 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1660 //===----------------------------------------------------------------------===//
1661 
1662 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1663   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // inside the vector loop body.
1666   Instruction *Instr = dyn_cast<Instruction>(V);
1667   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1668                      (!Instr ||
1669                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1670   // Place the code for broadcasting invariant variables in the new preheader.
1671   IRBuilder<>::InsertPointGuard Guard(Builder);
1672   if (SafeToHoist)
1673     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1674 
1675   // Broadcast the scalar into all locations in the vector.
1676   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1677 
1678   return Shuf;
1679 }
1680 
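// Illustrative sketch of what this routine builds: for an integer IV with
// start value X and step S, VF = 4 and UF = 2, the vector phi starts at
// <X, X+S, X+2*S, X+3*S> in the preheader; each unroll part is formed by
// adding the splat <4*S, 4*S, 4*S, 4*S> to the previous part, and the last
// such "step.add" (the start of the next vector iteration) feeds back into
// the phi from the latch.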
1681 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1682     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1683   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1684          "Expected either an induction phi-node or a truncate of it!");
1685   Value *Start = II.getStartValue();
1686 
1687   // Construct the initial value of the vector IV in the vector loop preheader
1688   auto CurrIP = Builder.saveIP();
1689   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1690   if (isa<TruncInst>(EntryVal)) {
1691     assert(Start->getType()->isIntegerTy() &&
1692            "Truncation requires an integer type");
1693     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1694     Step = Builder.CreateTrunc(Step, TruncType);
1695     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1696   }
1697   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1698   Value *SteppedStart =
1699       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1700 
1701   // We create vector phi nodes for both integer and floating-point induction
1702   // variables. Here, we determine the kind of arithmetic we will perform.
1703   Instruction::BinaryOps AddOp;
1704   Instruction::BinaryOps MulOp;
1705   if (Step->getType()->isIntegerTy()) {
1706     AddOp = Instruction::Add;
1707     MulOp = Instruction::Mul;
1708   } else {
1709     AddOp = II.getInductionOpcode();
1710     MulOp = Instruction::FMul;
1711   }
1712 
1713   // Multiply the vectorization factor by the step using integer or
1714   // floating-point arithmetic as appropriate.
1715   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1716   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1717 
1718   // Create a vector splat to use in the induction update.
1719   //
1720   // FIXME: If the step is non-constant, we create the vector splat with
1721   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1722   //        handle a constant vector splat.
1723   Value *SplatVF = isa<Constant>(Mul)
1724                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1725                        : Builder.CreateVectorSplat(VF, Mul);
1726   Builder.restoreIP(CurrIP);
1727 
1728   // We may need to add the step a number of times, depending on the unroll
1729   // factor. The last of those goes into the PHI.
1730   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1731                                     &*LoopVectorBody->getFirstInsertionPt());
1732   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1733   Instruction *LastInduction = VecInd;
1734   for (unsigned Part = 0; Part < UF; ++Part) {
1735     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1736 
1737     if (isa<TruncInst>(EntryVal))
1738       addMetadata(LastInduction, EntryVal);
1739     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1740 
1741     LastInduction = cast<Instruction>(addFastMathFlag(
1742         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1743     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1744   }
1745 
1746   // Move the last step to the end of the latch block. This ensures consistent
1747   // placement of all induction updates.
1748   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1749   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1750   auto *ICmp = cast<Instruction>(Br->getCondition());
1751   LastInduction->moveBefore(ICmp);
1752   LastInduction->setName("vec.ind.next");
1753 
1754   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1755   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1756 }
1757 
1758 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1759   return Cost->isScalarAfterVectorization(I, VF) ||
1760          Cost->isProfitableToScalarize(I, VF);
1761 }
1762 
1763 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1764   if (shouldScalarizeInstruction(IV))
1765     return true;
1766   auto isScalarInst = [&](User *U) -> bool {
1767     auto *I = cast<Instruction>(U);
1768     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1769   };
1770   return llvm::any_of(IV->users(), isScalarInst);
1771 }
1772 
1773 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1774     const InductionDescriptor &ID, const Instruction *EntryVal,
1775     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1776   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1777          "Expected either an induction phi-node or a truncate of it!");
1778 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't
  // have to do any recording in this case - that is done when the original IV
  // is processed.
1785   if (isa<TruncInst>(EntryVal))
1786     return;
1787 
1788   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1789   if (Casts.empty())
1790     return;
1791   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any) have no uses outside the
1793   // induction update chain itself.
1794   Instruction *CastInst = *Casts.begin();
1795   if (Lane < UINT_MAX)
1796     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1797   else
1798     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1799 }
1800 
1801 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1802   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1803          "Primary induction variable must have an integer type");
1804 
1805   auto II = Legal->getInductionVars()->find(IV);
1806   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1807 
1808   auto ID = II->second;
1809   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1810 
1811   // The scalar value to broadcast. This will be derived from the canonical
1812   // induction variable.
1813   Value *ScalarIV = nullptr;
1814 
1815   // The value from the original loop to which we are mapping the new induction
1816   // variable.
1817   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1818 
1819   // True if we have vectorized the induction variable.
1820   auto VectorizedIV = false;
1821 
1822   // Determine if we want a scalar version of the induction variable. This is
1823   // true if the induction variable itself is not widened, or if it has at
1824   // least one user in the loop that is not widened.
1825   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1826 
1827   // Generate code for the induction step. Note that induction steps are
1828   // required to be loop-invariant
1829   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1830          "Induction step should be loop invariant");
1831   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1832   Value *Step = nullptr;
1833   if (PSE.getSE()->isSCEVable(IV->getType())) {
1834     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1835     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1836                              LoopVectorPreHeader->getTerminator());
1837   } else {
1838     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1839   }
1840 
1841   // Try to create a new independent vector induction variable. If we can't
1842   // create the phi node, we will splat the scalar induction variable in each
1843   // loop iteration.
1844   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1845     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1846     VectorizedIV = true;
1847   }
1848 
1849   // If we haven't yet vectorized the induction variable, or if we will create
1850   // a scalar one, we need to define the scalar induction variable and step
1851   // values. If we were given a truncation type, truncate the canonical
1852   // induction variable and step. Otherwise, derive these values from the
1853   // induction descriptor.
1854   if (!VectorizedIV || NeedsScalarIV) {
1855     ScalarIV = Induction;
1856     if (IV != OldInduction) {
1857       ScalarIV = IV->getType()->isIntegerTy()
1858                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1859                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1860                                           IV->getType());
1861       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1862       ScalarIV->setName("offset.idx");
1863     }
1864     if (Trunc) {
1865       auto *TruncType = cast<IntegerType>(Trunc->getType());
1866       assert(Step->getType()->isIntegerTy() &&
1867              "Truncation requires an integer step");
1868       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1869       Step = Builder.CreateTrunc(Step, TruncType);
1870     }
1871   }
1872 
1873   // If we haven't yet vectorized the induction variable, splat the scalar
1874   // induction variable, and build the necessary step vectors.
1875   // TODO: Don't do it unless the vectorized IV is really required.
1876   if (!VectorizedIV) {
1877     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1878     for (unsigned Part = 0; Part < UF; ++Part) {
1879       Value *EntryPart =
1880           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1881       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1882       if (Trunc)
1883         addMetadata(EntryPart, Trunc);
1884       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1885     }
1886   }
1887 
1888   // If an induction variable is only used for counting loop iterations or
1889   // calculating addresses, it doesn't need to be widened. Create scalar steps
1890   // that can be used by instructions we will later scalarize. Note that the
1891   // addition of the scalar steps will not increase the number of instructions
1892   // in the loop in the common case prior to InstCombine. We will be trading
1893   // one vector extract for each scalar step.
1894   if (NeedsScalarIV)
1895     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1896 }
1897 
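// Illustrative example: with VF = 4, Val = <X, X, X, X> (a broadcast of the
// scalar IV), StartIdx = 0 and Step = S, this returns
// <X + 0*S, X + 1*S, X + 2*S, X + 3*S>, i.e. the IV values of four
// consecutive scalar iterations; a non-zero StartIdx shifts the sequence by
// that many steps.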
1898 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1899                                           Instruction::BinaryOps BinOp) {
1900   // Create and check the types.
1901   assert(Val->getType()->isVectorTy() && "Must be a vector");
1902   int VLen = Val->getType()->getVectorNumElements();
1903 
1904   Type *STy = Val->getType()->getScalarType();
1905   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1906          "Induction Step must be an integer or FP");
1907   assert(Step->getType() == STy && "Step has wrong type");
1908 
1909   SmallVector<Constant *, 8> Indices;
1910 
1911   if (STy->isIntegerTy()) {
    // Create a vector of consecutive indices starting at StartIdx.
1913     for (int i = 0; i < VLen; ++i)
1914       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1915 
1916     // Add the consecutive indices to the vector value.
1917     Constant *Cv = ConstantVector::get(Indices);
1918     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1919     Step = Builder.CreateVectorSplat(VLen, Step);
1920     assert(Step->getType() == Val->getType() && "Invalid step vec");
1921     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1922     // which can be found from the original scalar operations.
1923     Step = Builder.CreateMul(Cv, Step);
1924     return Builder.CreateAdd(Val, Step, "induction");
1925   }
1926 
1927   // Floating point induction.
1928   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1929          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive indices starting at StartIdx.
1931   for (int i = 0; i < VLen; ++i)
1932     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1933 
1934   // Add the consecutive indices to the vector value.
1935   Constant *Cv = ConstantVector::get(Indices);
1936 
1937   Step = Builder.CreateVectorSplat(VLen, Step);
1938 
1939   // Floating point operations had to be 'fast' to enable the induction.
1940   FastMathFlags Flags;
1941   Flags.setFast();
1942 
1943   Value *MulOp = Builder.CreateFMul(Cv, Step);
1944   if (isa<Instruction>(MulOp))
    // We have to check because MulOp may have been folded to a constant.
1946     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1947 
1948   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1949   if (isa<Instruction>(BOp))
1950     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1951   return BOp;
1952 }
1953 
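// Illustrative example: for a non-uniform EntryVal with VF = 4 and UF = 2,
// this emits the eight scalar values ScalarIV + (Part * 4 + Lane) * Step for
// Part in {0, 1} and Lane in {0, 1, 2, 3}; if EntryVal is uniform after
// vectorization, only the lane-0 value of each unroll part is created.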
1954 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1955                                            Instruction *EntryVal,
1956                                            const InductionDescriptor &ID) {
1957   // We shouldn't have to build scalar steps if we aren't vectorizing.
1958   assert(VF > 1 && "VF should be greater than one");
1959 
  // Get the value type and ensure it and the step have the same type.
1961   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1962   assert(ScalarIVTy == Step->getType() &&
1963          "Val and Step should have the same type");
1964 
1965   // We build scalar steps for both integer and floating-point induction
1966   // variables. Here, we determine the kind of arithmetic we will perform.
1967   Instruction::BinaryOps AddOp;
1968   Instruction::BinaryOps MulOp;
1969   if (ScalarIVTy->isIntegerTy()) {
1970     AddOp = Instruction::Add;
1971     MulOp = Instruction::Mul;
1972   } else {
1973     AddOp = ID.getInductionOpcode();
1974     MulOp = Instruction::FMul;
1975   }
1976 
1977   // Determine the number of scalars we need to generate for each unroll
1978   // iteration. If EntryVal is uniform, we only need to generate the first
1979   // lane. Otherwise, we generate all VF values.
1980   unsigned Lanes =
1981       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1982                                                                          : VF;
1983   // Compute the scalar steps and save the results in VectorLoopValueMap.
1984   for (unsigned Part = 0; Part < UF; ++Part) {
1985     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1986       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1987       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1988       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1989       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1990       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1991     }
1992   }
1993 }
1994 
1995 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1996   assert(V != Induction && "The new induction variable should not be used.");
1997   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1998   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1999 
2000   // If we have a stride that is replaced by one, do it here. Defer this for
2001   // the VPlan-native path until we start running Legal checks in that path.
2002   if (!EnableVPlanNativePath && Legal->hasStride(V))
2003     V = ConstantInt::get(V->getType(), 1);
2004 
2005   // If we have a vector mapped to this value, return it.
2006   if (VectorLoopValueMap.hasVectorValue(V, Part))
2007     return VectorLoopValueMap.getVectorValue(V, Part);
2008 
2009   // If the value has not been vectorized, check if it has been scalarized
2010   // instead. If it has been scalarized, and we actually need the value in
2011   // vector form, we will construct the vector values on demand.
2012   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2013     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2014 
2015     // If we've scalarized a value, that value should be an instruction.
2016     auto *I = cast<Instruction>(V);
2017 
2018     // If we aren't vectorizing, we can just copy the scalar map values over to
2019     // the vector map.
2020     if (VF == 1) {
2021       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2022       return ScalarValue;
2023     }
2024 
2025     // Get the last scalar instruction we generated for V and Part. If the value
2026     // is known to be uniform after vectorization, this corresponds to lane zero
2027     // of the Part unroll iteration. Otherwise, the last instruction is the one
2028     // we created for the last vector lane of the Part unroll iteration.
2029     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2030     auto *LastInst = cast<Instruction>(
2031         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2032 
2033     // Set the insert point after the last scalarized instruction. This ensures
2034     // the insertelement sequence will directly follow the scalar definitions.
2035     auto OldIP = Builder.saveIP();
2036     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2037     Builder.SetInsertPoint(&*NewIP);
2038 
2039     // However, if we are vectorizing, we need to construct the vector values.
2040     // If the value is known to be uniform after vectorization, we can just
2041     // broadcast the scalar value corresponding to lane zero for each unroll
2042     // iteration. Otherwise, we construct the vector values using insertelement
2043     // instructions. Since the resulting vectors are stored in
2044     // VectorLoopValueMap, we will only generate the insertelements once.
2045     Value *VectorValue = nullptr;
2046     if (Cost->isUniformAfterVectorization(I, VF)) {
2047       VectorValue = getBroadcastInstrs(ScalarValue);
2048       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2049     } else {
2050       // Initialize packing with insertelements to start from undef.
2051       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2052       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2053       for (unsigned Lane = 0; Lane < VF; ++Lane)
2054         packScalarIntoVectorValue(V, {Part, Lane});
2055       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2056     }
2057     Builder.restoreIP(OldIP);
2058     return VectorValue;
2059   }
2060 
2061   // If this scalar is unknown, assume that it is a constant or that it is
2062   // loop invariant. Broadcast V and save the value for future uses.
2063   Value *B = getBroadcastInstrs(V);
2064   VectorLoopValueMap.setVectorValue(V, Part, B);
2065   return B;
2066 }
2067 
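// Illustrative example: with UF = 2 and VF = 4, a vectorized value V is held
// as two <4 x ...> vectors; a request for, say, {Part = 1, Lane = 2} either
// returns an already-recorded scalar for that lane or extracts element 2 of
// the part-1 vector.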
2068 Value *
2069 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2070                                             const VPIteration &Instance) {
2071   // If the value is not an instruction contained in the loop, it should
2072   // already be scalar.
2073   if (OrigLoop->isLoopInvariant(V))
2074     return V;
2075 
2076   assert(Instance.Lane > 0
2077              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2078              : true && "Uniform values only have lane zero");
2079 
2080   // If the value from the original loop has not been vectorized, it is
2081   // represented by UF x VF scalar values in the new loop. Return the requested
2082   // scalar value.
2083   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2084     return VectorLoopValueMap.getScalarValue(V, Instance);
2085 
2086   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2087   // for the given unroll part. If this entry is not a vector type (i.e., the
2088   // vectorization factor is one), there is no need to generate an
2089   // extractelement instruction.
2090   auto *U = getOrCreateVectorValue(V, Instance.Part);
2091   if (!U->getType()->isVectorTy()) {
2092     assert(VF == 1 && "Value not scalarized has non-vector type");
2093     return U;
2094   }
2095 
2096   // Otherwise, the value from the original loop has been vectorized and is
2097   // represented by UF vector values. Extract and return the requested scalar
2098   // value from the appropriate vector lane.
2099   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2100 }
2101 
2102 void InnerLoopVectorizer::packScalarIntoVectorValue(
2103     Value *V, const VPIteration &Instance) {
2104   assert(V != Induction && "The new induction variable should not be used.");
2105   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2106   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2107 
2108   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2109   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2110   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2111                                             Builder.getInt32(Instance.Lane));
2112   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2113 }
2114 
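// Illustrative example: with VF = 4 this emits a shufflevector with mask
// <3, 2, 1, 0>, turning <a, b, c, d> into <d, c, b, a>.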
2115 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2116   assert(Vec->getType()->isVectorTy() && "Invalid type");
2117   SmallVector<Constant *, 8> ShuffleMask;
2118   for (unsigned i = 0; i < VF; ++i)
2119     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2120 
2121   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2122                                      ConstantVector::get(ShuffleMask),
2123                                      "reverse");
2124 }
2125 
2126 // Return whether we allow using masked interleave-groups (for dealing with
2127 // strided loads/stores that reside in predicated blocks, or for dealing
2128 // with gaps).
2129 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2130   // If an override option has been passed in for interleaved accesses, use it.
2131   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2132     return EnableMaskedInterleavedMemAccesses;
2133 
2134   return TTI.enableMaskedInterleavedAccessVectorization();
2135 }
2136 
2137 // Try to vectorize the interleave group that \p Instr belongs to.
2138 //
// E.g. Translate the following interleaved load group (factor = 3):
2140 //   for (i = 0; i < N; i+=3) {
2141 //     R = Pic[i];             // Member of index 0
2142 //     G = Pic[i+1];           // Member of index 1
2143 //     B = Pic[i+2];           // Member of index 2
2144 //     ... // do something to R, G, B
2145 //   }
2146 // To:
2147 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2148 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2149 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2150 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2151 //
// Or translate the following interleaved store group (factor = 3):
2153 //   for (i = 0; i < N; i+=3) {
2154 //     ... do something to R, G, B
2155 //     Pic[i]   = R;           // Member of index 0
2156 //     Pic[i+1] = G;           // Member of index 1
2157 //     Pic[i+2] = B;           // Member of index 2
2158 //   }
2159 // To:
2160 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2161 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2162 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2163 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2164 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2165 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2166                                                    VectorParts *BlockInMask) {
2167   const InterleaveGroup<Instruction> *Group =
2168       Cost->getInterleavedAccessGroup(Instr);
2169   assert(Group && "Fail to get an interleaved access group.");
2170 
2171   // Skip if current instruction is not the insert position.
2172   if (Instr != Group->getInsertPos())
2173     return;
2174 
2175   const DataLayout &DL = Instr->getModule()->getDataLayout();
2176   Value *Ptr = getLoadStorePointerOperand(Instr);
2177 
2178   // Prepare for the vector type of the interleaved load/store.
2179   Type *ScalarTy = getMemInstValueType(Instr);
2180   unsigned InterleaveFactor = Group->getFactor();
2181   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2182   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2183 
2184   // Prepare for the new pointers.
2185   setDebugLocFromInst(Builder, Ptr);
2186   SmallVector<Value *, 2> NewPtrs;
2187   unsigned Index = Group->getIndex(Instr);
2188 
2189   VectorParts Mask;
2190   bool IsMaskForCondRequired = BlockInMask;
2191   if (IsMaskForCondRequired) {
2192     Mask = *BlockInMask;
2193     // TODO: extend the masked interleaved-group support to reversed access.
2194     assert(!Group->isReverse() && "Reversed masked interleave-group "
2195                                   "not supported.");
2196   }
2197 
2198   // If the group is reverse, adjust the index to refer to the last vector lane
2199   // instead of the first. We adjust the index from the first vector lane,
2200   // rather than directly getting the pointer for lane VF - 1, because the
2201   // pointer operand of the interleaved access is supposed to be uniform. For
2202   // uniform instructions, we're only required to generate a value for the
2203   // first vector lane in each unroll iteration.
2204   if (Group->isReverse())
2205     Index += (VF - 1) * Group->getFactor();
2206 
2207   bool InBounds = false;
2208   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2209     InBounds = gep->isInBounds();
2210 
2211   for (unsigned Part = 0; Part < UF; Part++) {
2212     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2213 
    // Note that the current instruction could be at any index. We need to
    // adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2225     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2226     if (InBounds)
2227       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2228 
2229     // Cast to the vector pointer type.
2230     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2231   }
2232 
2233   setDebugLocFromInst(Builder, Instr);
2234   Value *UndefVec = UndefValue::get(VecTy);
2235 
2236   Value *MaskForGaps = nullptr;
2237   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2238     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2239     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2240   }
2241 
2242   // Vectorize the interleaved load group.
2243   if (isa<LoadInst>(Instr)) {
2244     // For each unroll part, create a wide load for the group.
2245     SmallVector<Value *, 2> NewLoads;
2246     for (unsigned Part = 0; Part < UF; Part++) {
2247       Instruction *NewLoad;
2248       if (IsMaskForCondRequired || MaskForGaps) {
2249         assert(useMaskedInterleavedAccesses(*TTI) &&
2250                "masked interleaved groups are not allowed.");
2251         Value *GroupMask = MaskForGaps;
2252         if (IsMaskForCondRequired) {
2253           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2254           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2255           Value *ShuffledMask = Builder.CreateShuffleVector(
2256               Mask[Part], Undefs, RepMask, "interleaved.mask");
2257           GroupMask = MaskForGaps
2258                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2259                                                 MaskForGaps)
2260                           : ShuffledMask;
2261         }
2262         NewLoad =
2263             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2264                                      GroupMask, UndefVec, "wide.masked.vec");
2265       }
2266       else
2267         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2268                                             Group->getAlignment(), "wide.vec");
2269       Group->addMetadata(NewLoad);
2270       NewLoads.push_back(NewLoad);
2271     }
2272 
2273     // For each member in the group, shuffle out the appropriate data from the
2274     // wide loads.
2275     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2276       Instruction *Member = Group->getMember(I);
2277 
2278       // Skip the gaps in the group.
2279       if (!Member)
2280         continue;
2281 
2282       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2283       for (unsigned Part = 0; Part < UF; Part++) {
2284         Value *StridedVec = Builder.CreateShuffleVector(
2285             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2286 
        // If this member has a different type, cast the result to that type.
2288         if (Member->getType() != ScalarTy) {
2289           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2290           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2291         }
2292 
2293         if (Group->isReverse())
2294           StridedVec = reverseVector(StridedVec);
2295 
2296         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2297       }
2298     }
2299     return;
2300   }
2301 
  // The sub-vector type for the current instruction.
2303   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2304 
2305   // Vectorize the interleaved store group.
2306   for (unsigned Part = 0; Part < UF; Part++) {
2307     // Collect the stored vector from each member.
2308     SmallVector<Value *, 4> StoredVecs;
2309     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a member.
2311       Instruction *Member = Group->getMember(i);
2312       assert(Member && "Fail to get a member from an interleaved store group");
2313 
2314       Value *StoredVec = getOrCreateVectorValue(
2315           cast<StoreInst>(Member)->getValueOperand(), Part);
2316       if (Group->isReverse())
2317         StoredVec = reverseVector(StoredVec);
2318 
      // If this member has a different type, cast it to a unified type.
2321       if (StoredVec->getType() != SubVT)
2322         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2323 
2324       StoredVecs.push_back(StoredVec);
2325     }
2326 
2327     // Concatenate all vectors into a wide vector.
2328     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2329 
2330     // Interleave the elements in the wide vector.
2331     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2332     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2333                                               "interleaved.vec");
2334 
2335     Instruction *NewStoreInstr;
2336     if (IsMaskForCondRequired) {
2337       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2338       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2339       Value *ShuffledMask = Builder.CreateShuffleVector(
2340           Mask[Part], Undefs, RepMask, "interleaved.mask");
2341       NewStoreInstr = Builder.CreateMaskedStore(
2342           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2343     }
2344     else
2345       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2346         Group->getAlignment());
2347 
2348     Group->addMetadata(NewStoreInstr);
2349   }
2350 }
2351 
2352 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2353                                                      VectorParts *BlockInMask) {
2354   // Attempt to issue a wide load.
2355   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2356   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2357 
2358   assert((LI || SI) && "Invalid Load/Store instruction");
2359 
2360   LoopVectorizationCostModel::InstWidening Decision =
2361       Cost->getWideningDecision(Instr, VF);
2362   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2363          "CM decision should be taken at this point");
2364   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2365     return vectorizeInterleaveGroup(Instr);
2366 
2367   Type *ScalarDataTy = getMemInstValueType(Instr);
2368   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2369   Value *Ptr = getLoadStorePointerOperand(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2372   const DataLayout &DL = Instr->getModule()->getDataLayout();
2373   const Align Alignment =
2374       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2375   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2376 
2377   // Determine if the pointer operand of the access is either consecutive or
2378   // reverse consecutive.
2379   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2380   bool ConsecutiveStride =
2381       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2382   bool CreateGatherScatter =
2383       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2384 
2385   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2386   // gather/scatter. Otherwise Decision should have been to Scalarize.
2387   assert((ConsecutiveStride || CreateGatherScatter) &&
2388          "The instruction should be scalarized");
2389 
2390   // Handle consecutive loads/stores.
2391   if (ConsecutiveStride)
2392     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2393 
2394   VectorParts Mask;
2395   bool isMaskRequired = BlockInMask;
2396   if (isMaskRequired)
2397     Mask = *BlockInMask;
2398 
2399   bool InBounds = false;
2400   if (auto *gep = dyn_cast<GetElementPtrInst>(
2401           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2402     InBounds = gep->isInBounds();
2403 
2404   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2405     // Calculate the pointer for the specific unroll-part.
2406     GetElementPtrInst *PartPtr = nullptr;
2407 
2408     if (Reverse) {
2409       // If the address is consecutive but reversed, then the
2410       // wide store needs to start at the last vector element.
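      // Illustrative example: with VF = 4 and Part = 1, the two GEPs below
      // offset Ptr by -1*4 elements and then by (1 - 4) more, so the wide
      // access covers the elements Ptr[-7 .. -4]; the accompanying reverse
      // shuffle of the data (and of the mask, below) restores the scalar
      // loop's order Ptr[-4], Ptr[-5], Ptr[-6], Ptr[-7].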
2411       PartPtr = cast<GetElementPtrInst>(
2412           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2413       PartPtr->setIsInBounds(InBounds);
2414       PartPtr = cast<GetElementPtrInst>(
2415           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2416       PartPtr->setIsInBounds(InBounds);
2417       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2418         Mask[Part] = reverseVector(Mask[Part]);
2419     } else {
2420       PartPtr = cast<GetElementPtrInst>(
2421           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2422       PartPtr->setIsInBounds(InBounds);
2423     }
2424 
2425     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2426   };
2427 
2428   // Handle Stores:
2429   if (SI) {
2430     setDebugLocFromInst(Builder, SI);
2431 
2432     for (unsigned Part = 0; Part < UF; ++Part) {
2433       Instruction *NewSI = nullptr;
2434       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2435       if (CreateGatherScatter) {
2436         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2437         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2438         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2439                                             Alignment.value(), MaskPart);
2440       } else {
2441         if (Reverse) {
2442           // If we store to reverse consecutive memory locations, then we need
2443           // to reverse the order of elements in the stored value.
2444           StoredVal = reverseVector(StoredVal);
2445           // We don't want to update the value in the map as it might be used in
2446           // another expression. So don't call resetVectorValue(StoredVal).
2447         }
2448         auto *VecPtr = CreateVecPtr(Part, Ptr);
2449         if (isMaskRequired)
2450           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
2451                                             Alignment.value(), Mask[Part]);
2452         else
2453           NewSI =
2454               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2455       }
2456       addMetadata(NewSI, SI);
2457     }
2458     return;
2459   }
2460 
2461   // Handle loads.
2462   assert(LI && "Must have a load instruction");
2463   setDebugLocFromInst(Builder, LI);
2464   for (unsigned Part = 0; Part < UF; ++Part) {
2465     Value *NewLI;
2466     if (CreateGatherScatter) {
2467       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2468       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2469       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2470                                          nullptr, "wide.masked.gather");
2471       addMetadata(NewLI, LI);
2472     } else {
2473       auto *VecPtr = CreateVecPtr(Part, Ptr);
2474       if (isMaskRequired)
2475         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
2476                                          UndefValue::get(DataTy),
2477                                          "wide.masked.load");
2478       else
2479         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2480                                           "wide.load");
2481 
2482       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2483       addMetadata(NewLI, LI);
2484       if (Reverse)
2485         NewLI = reverseVector(NewLI);
2486     }
2487     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2488   }
2489 }
2490 
2491 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2492                                                const VPIteration &Instance,
2493                                                bool IfPredicateInstr) {
2494   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2495 
2496   setDebugLocFromInst(Builder, Instr);
2497 
  // Does this instruction return a value?
2499   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2500 
2501   Instruction *Cloned = Instr->clone();
2502   if (!IsVoidRetTy)
2503     Cloned->setName(Instr->getName() + ".cloned");
2504 
2505   // Replace the operands of the cloned instructions with their scalar
2506   // equivalents in the new loop.
2507   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2508     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2509     Cloned->setOperand(op, NewOp);
2510   }
2511   addNewMetadata(Cloned, Instr);
2512 
2513   // Place the cloned scalar in the new loop.
2514   Builder.Insert(Cloned);
2515 
2516   // Add the cloned scalar to the scalar map entry.
2517   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2518 
  // If we just cloned a new assumption, add it to the assumption cache.
2520   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2521     if (II->getIntrinsicID() == Intrinsic::assume)
2522       AC->registerAssumption(II);
2523 
2524   // End if-block.
2525   if (IfPredicateInstr)
2526     PredicatedInstructions.push_back(Cloned);
2527 }
2528 
2529 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2530                                                       Value *End, Value *Step,
2531                                                       Instruction *DL) {
2532   BasicBlock *Header = L->getHeader();
2533   BasicBlock *Latch = L->getLoopLatch();
2534   // As we're just creating this loop, it's possible no latch exists
2535   // yet. If so, use the header as this will be a single block loop.
2536   if (!Latch)
2537     Latch = Header;
2538 
2539   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2540   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2541   setDebugLocFromInst(Builder, OldInst);
2542   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2543 
2544   Builder.SetInsertPoint(Latch->getTerminator());
2545   setDebugLocFromInst(Builder, OldInst);
2546 
2547   // Create i+1 and fill the PHINode.
2548   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2549   Induction->addIncoming(Start, L->getLoopPreheader());
2550   Induction->addIncoming(Next, Latch);
2551   // Create the compare.
2552   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2553   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2554 
2555   // Now we have two terminators. Remove the old one from the block.
2556   Latch->getTerminator()->eraseFromParent();
2557 
2558   return Induction;
2559 }
2560 
2561 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2562   if (TripCount)
2563     return TripCount;
2564 
2565   assert(L && "Create Trip Count for null loop.");
2566   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2567   // Find the loop boundaries.
2568   ScalarEvolution *SE = PSE.getSE();
2569   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2570   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2571          "Invalid loop count");
2572 
2573   Type *IdxTy = Legal->getWidestInductionType();
2574   assert(IdxTy && "No type for induction");
2575 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign-extended before the compare.
  // The only way we can get a backedge-taken count in that case is if the
  // induction variable was signed, and as such it will not overflow, so the
  // truncation is legal.
2581   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2582       IdxTy->getPrimitiveSizeInBits())
2583     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2584   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2585 
2586   // Get the total trip count from the count by adding 1.
2587   const SCEV *ExitCount = SE->getAddExpr(
2588       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2589 
2590   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2591 
2592   // Expand the trip count and place the new instructions in the preheader.
2593   // Notice that the pre-header does not change, only the loop body.
2594   SCEVExpander Exp(*SE, DL, "induction");
2595 
2596   // Count holds the overall loop count (N).
2597   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2598                                 L->getLoopPreheader()->getTerminator());
2599 
2600   if (TripCount->getType()->isPointerTy())
2601     TripCount =
2602         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2603                                     L->getLoopPreheader()->getTerminator());
2604 
2605   return TripCount;
2606 }
2607 
2608 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2609   if (VectorTripCount)
2610     return VectorTripCount;
2611 
2612   Value *TC = getOrCreateTripCount(L);
2613   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2614 
2615   Type *Ty = TC->getType();
2616   Constant *Step = ConstantInt::get(Ty, VF * UF);
2617 
2618   // If the tail is to be folded by masking, round the number of iterations N
2619   // up to a multiple of Step instead of rounding down. This is done by first
2620   // adding Step-1 and then rounding down. Note that it's ok if this addition
2621   // overflows: the vector induction variable will eventually wrap to zero given
2622   // that it starts at zero and its Step is a power of two; the loop will then
2623   // exit, with the last early-exit vector comparison also producing all-true.
2624   if (Cost->foldTailByMasking()) {
2625     assert(isPowerOf2_32(VF * UF) &&
2626            "VF*UF must be a power of 2 when folding tail by masking");
2627     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2628   }
2629 
2630   // Now we need to generate the expression for the part of the loop that the
2631   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2632   // iterations are not required for correctness, or N - Step, otherwise. Step
2633   // is equal to the vectorization factor (number of SIMD elements) times the
2634   // unroll factor (number of SIMD instructions).
2635   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2636 
2637   // If there is a non-reversed interleaved group that may speculatively access
2638   // memory out-of-bounds, we need to ensure that there will be at least one
2639   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2640   // the trip count, we set the remainder to be equal to the step. If the step
2641   // does not evenly divide the trip count, no adjustment is necessary since
2642   // there will already be scalar iterations. Note that the minimum iterations
2643   // check ensures that N >= Step.
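  // For example, if Step is 4 and the trip count is 8, the remainder computed
  // above would be 0; forcing it to 4 leaves a vector trip count of 4, so the
  // final four iterations run in the scalar epilogue.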
2644   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2645     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2646     R = Builder.CreateSelect(IsZero, Step, R);
2647   }
2648 
2649   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2650 
2651   return VectorTripCount;
2652 }
2653 
2654 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2655                                                    const DataLayout &DL) {
2656   // Verify that V is a vector type with same number of elements as DstVTy.
2657   unsigned VF = DstVTy->getNumElements();
2658   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2660   Type *SrcElemTy = SrcVecTy->getElementType();
2661   Type *DstElemTy = DstVTy->getElementType();
2662   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2663          "Vector elements must have same size");
2664 
2665   // Do a direct cast if element types are castable.
2666   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2667     return Builder.CreateBitOrPointerCast(V, DstVTy);
2668   }
2669   // V cannot be directly casted to desired vector type.
2670   // May happen when V is a floating point vector but DstVTy is a vector of
2671   // pointers or vice-versa. Handle this using a two-step bitcast using an
2672   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
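  // For example, on a target with 64-bit pointers, a <4 x double> vector is
  // converted to <4 x i64*> via an intermediate <4 x i64> vector.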
2673   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2674          "Only one type should be a pointer type");
2675   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2676          "Only one type should be a floating point type");
2677   Type *IntTy =
2678       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2679   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2680   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2681   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2682 }
2683 
2684 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2685                                                          BasicBlock *Bypass) {
2686   Value *Count = getOrCreateTripCount(L);
2687   BasicBlock *BB = L->getLoopPreheader();
2688   IRBuilder<> Builder(BB->getTerminator());
2689 
2690   // Generate code to check if the loop's trip count is less than VF * UF, or
2691   // equal to it in case a scalar epilogue is required; this implies that the
2692   // vector trip count is zero. This check also covers the case where adding one
2693   // to the backedge-taken count overflowed leading to an incorrect trip count
2694   // of zero. In this case we will also jump to the scalar loop.
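  // For example, with VF * UF = 4, a trip count of 3 (or of exactly 4 when a
  // scalar epilogue is required) takes the bypass edge directly to the scalar
  // loop.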
2695   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2696                                           : ICmpInst::ICMP_ULT;
2697 
2698   // If tail is to be folded, vector loop takes care of all iterations.
2699   Value *CheckMinIters = Builder.getFalse();
2700   if (!Cost->foldTailByMasking())
2701     CheckMinIters = Builder.CreateICmp(
2702         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2703         "min.iters.check");
2704 
2705   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2706   // Update dominator tree immediately if the generated block is a
2707   // LoopBypassBlock because SCEV expansions to generate loop bypass
2708   // checks may query it before the current function is finished.
2709   DT->addNewBlock(NewBB, BB);
2710   if (L->getParentLoop())
2711     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2712   ReplaceInstWithInst(BB->getTerminator(),
2713                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2714   LoopBypassBlocks.push_back(BB);
2715 }
2716 
2717 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2718   BasicBlock *BB = L->getLoopPreheader();
2719 
  // Generate the code to check the SCEV assumptions that we made.
2721   // We want the new basic block to start at the first instruction in a
2722   // sequence of instructions that form a check.
2723   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2724                    "scev.check");
2725   Value *SCEVCheck =
2726       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2727 
2728   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2729     if (C->isZero())
2730       return;
2731 
2732   assert(!BB->getParent()->hasOptSize() &&
2733          "Cannot SCEV check stride or overflow when optimizing for size");
2734 
2735   // Create a new block containing the stride check.
2736   BB->setName("vector.scevcheck");
2737   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2738   // Update dominator tree immediately if the generated block is a
2739   // LoopBypassBlock because SCEV expansions to generate loop bypass
2740   // checks may query it before the current function is finished.
2741   DT->addNewBlock(NewBB, BB);
2742   if (L->getParentLoop())
2743     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2744   ReplaceInstWithInst(BB->getTerminator(),
2745                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2746   LoopBypassBlocks.push_back(BB);
2747   AddedSafetyChecks = true;
2748 }
2749 
2750 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2751   // VPlan-native path does not do any analysis for runtime checks currently.
2752   if (EnableVPlanNativePath)
2753     return;
2754 
2755   BasicBlock *BB = L->getLoopPreheader();
2756 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2760   Instruction *FirstCheckInst;
2761   Instruction *MemRuntimeCheck;
2762   std::tie(FirstCheckInst, MemRuntimeCheck) =
2763       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2764   if (!MemRuntimeCheck)
2765     return;
2766 
2767   if (BB->getParent()->hasOptSize()) {
2768     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2769            "Cannot emit memory checks when optimizing for size, unless forced "
2770            "to vectorize.");
2771     ORE->emit([&]() {
2772       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2773                                         L->getStartLoc(), L->getHeader())
2774              << "Code-size may be reduced by not forcing "
2775                 "vectorization, or by source-code modifications "
2776                 "eliminating the need for runtime checks "
2777                 "(e.g., adding 'restrict').";
2778     });
2779   }
2780 
2781   // Create a new block containing the memory check.
2782   BB->setName("vector.memcheck");
2783   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2784   // Update dominator tree immediately if the generated block is a
2785   // LoopBypassBlock because SCEV expansions to generate loop bypass
2786   // checks may query it before the current function is finished.
2787   DT->addNewBlock(NewBB, BB);
2788   if (L->getParentLoop())
2789     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2790   ReplaceInstWithInst(BB->getTerminator(),
2791                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2792   LoopBypassBlocks.push_back(BB);
2793   AddedSafetyChecks = true;
2794 
2795   // We currently don't use LoopVersioning for the actual loop cloning but we
2796   // still use it to add the noalias metadata.
2797   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2798                                            PSE.getSE());
2799   LVer->prepareNoAliasMetadata();
2800 }
2801 
2802 Value *InnerLoopVectorizer::emitTransformedIndex(
2803     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2804     const InductionDescriptor &ID) const {
2805 
2806   SCEVExpander Exp(*SE, DL, "induction");
2807   auto Step = ID.getStep();
2808   auto StartValue = ID.getStartValue();
2809   assert(Index->getType() == Step->getType() &&
2810          "Index type does not match StepValue type");
2811 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
2818   auto CreateAdd = [&B](Value *X, Value *Y) {
2819     assert(X->getType() == Y->getType() && "Types don't match!");
2820     if (auto *CX = dyn_cast<ConstantInt>(X))
2821       if (CX->isZero())
2822         return Y;
2823     if (auto *CY = dyn_cast<ConstantInt>(Y))
2824       if (CY->isZero())
2825         return X;
2826     return B.CreateAdd(X, Y);
2827   };
2828 
2829   auto CreateMul = [&B](Value *X, Value *Y) {
2830     assert(X->getType() == Y->getType() && "Types don't match!");
2831     if (auto *CX = dyn_cast<ConstantInt>(X))
2832       if (CX->isOne())
2833         return Y;
2834     if (auto *CY = dyn_cast<ConstantInt>(Y))
2835       if (CY->isOne())
2836         return X;
2837     return B.CreateMul(X, Y);
2838   };
2839 
2840   switch (ID.getKind()) {
2841   case InductionDescriptor::IK_IntInduction: {
2842     assert(Index->getType() == StartValue->getType() &&
2843            "Index type does not match StartValue type");
2844     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2845       return B.CreateSub(StartValue, Index);
2846     auto *Offset = CreateMul(
2847         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2848     return CreateAdd(StartValue, Offset);
2849   }
2850   case InductionDescriptor::IK_PtrInduction: {
2851     assert(isa<SCEVConstant>(Step) &&
2852            "Expected constant step for pointer induction");
2853     return B.CreateGEP(
2854         StartValue->getType()->getPointerElementType(), StartValue,
2855         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2856                                            &*B.GetInsertPoint())));
2857   }
2858   case InductionDescriptor::IK_FpInduction: {
2859     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2860     auto InductionBinOp = ID.getInductionBinOp();
2861     assert(InductionBinOp &&
2862            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2863             InductionBinOp->getOpcode() == Instruction::FSub) &&
2864            "Original bin op should be defined for FP induction");
2865 
2866     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2867 
2868     // Floating point operations had to be 'fast' to enable the induction.
2869     FastMathFlags Flags;
2870     Flags.setFast();
2871 
2872     Value *MulExp = B.CreateFMul(StepValue, Index);
2873     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2875       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2876 
2877     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2878                                "induction");
2879     if (isa<Instruction>(BOp))
2880       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2881 
2882     return BOp;
2883   }
2884   case InductionDescriptor::IK_NoInduction:
2885     return nullptr;
2886   }
2887   llvm_unreachable("invalid enum");
2888 }
2889 
2890 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2891   /*
2892    In this function we generate a new loop. The new loop will contain
2893    the vectorized instructions while the old loop will continue to run the
2894    scalar remainder.
2895 
2896        [ ] <-- loop iteration number check.
2897     /   |
2898    /    v
2899   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2900   |  /  |
2901   | /   v
2902   ||   [ ]     <-- vector pre header.
2903   |/    |
2904   |     v
2905   |    [  ] \
2906   |    [  ]_|   <-- vector loop.
2907   |     |
2908   |     v
2909   |   -[ ]   <--- middle-block.
2910   |  /  |
2911   | /   v
2912   -|- >[ ]     <--- new preheader.
2913    |    |
2914    |    v
2915    |   [ ] \
2916    |   [ ]_|   <-- old scalar loop to handle remainder.
2917     \   |
2918      \  v
2919       >[ ]     <-- exit block.
2920    ...
2921    */
2922 
2923   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2924   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2925   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2926   MDNode *OrigLoopID = OrigLoop->getLoopID();
2927   assert(VectorPH && "Invalid loop structure");
2928   assert(ExitBlock && "Must have an exit block");
2929 
2930   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
2932   // induction variables. In the code below we also support a case where we
2933   // don't have a single induction variable.
2934   //
2935   // We try to obtain an induction variable from the original loop as hard
2936   // as possible. However if we don't find one that:
2937   //   - is an integer
2938   //   - counts from zero, stepping by one
2939   //   - is the size of the widest induction variable type
2940   // then we create a new one.
2941   OldInduction = Legal->getPrimaryInduction();
2942   Type *IdxTy = Legal->getWidestInductionType();
2943 
2944   // Split the single block loop into the two loop structure described above.
2945   BasicBlock *VecBody =
2946       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2947   BasicBlock *MiddleBlock =
2948       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2949   BasicBlock *ScalarPH =
2950       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2951 
2952   // Create and register the new vector loop.
2953   Loop *Lp = LI->AllocateLoop();
2954   Loop *ParentLoop = OrigLoop->getParentLoop();
2955 
2956   // Insert the new loop into the loop nest and register the new basic blocks
2957   // before calling any utilities such as SCEV that require valid LoopInfo.
2958   if (ParentLoop) {
2959     ParentLoop->addChildLoop(Lp);
2960     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2961     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2962   } else {
2963     LI->addTopLevelLoop(Lp);
2964   }
2965   Lp->addBasicBlockToLoop(VecBody, *LI);
2966 
2967   // Find the loop boundaries.
2968   Value *Count = getOrCreateTripCount(Lp);
2969 
2970   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2971 
2972   // Now, compare the new count to zero. If it is zero skip the vector loop and
2973   // jump to the scalar loop. This check also covers the case where the
2974   // backedge-taken count is uint##_max: adding one to it will overflow leading
2975   // to an incorrect trip count of zero. In this (rare) case we will also jump
2976   // to the scalar loop.
2977   emitMinimumIterationCountCheck(Lp, ScalarPH);
2978 
2979   // Generate the code to check any assumptions that we've made for SCEV
2980   // expressions.
2981   emitSCEVChecks(Lp, ScalarPH);
2982 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2986   emitMemRuntimeChecks(Lp, ScalarPH);
2987 
2988   // Generate the induction variable.
2989   // The loop step is equal to the vectorization factor (num of SIMD elements)
2990   // times the unroll factor (num of SIMD instructions).
2991   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2992   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2993   Induction =
2994       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2995                               getDebugLocFromInstOrOperands(OldInduction));
2996 
2997   // We are going to resume the execution of the scalar loop.
2998   // Go over all of the induction variables that we found and fix the
2999   // PHIs that are left in the scalar version of the loop.
3000   // The starting values of PHI nodes depend on the counter of the last
3001   // iteration in the vectorized loop.
3002   // If we come from a bypass edge then we need to start from the original
3003   // start value.
3004 
3005   // This variable saves the new starting index for the scalar loop. It is used
3006   // to test if there are any tail iterations left once the vector loop has
3007   // completed.
3008   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3009   for (auto &InductionEntry : *List) {
3010     PHINode *OrigPhi = InductionEntry.first;
3011     InductionDescriptor II = InductionEntry.second;
3012 
    // Create phi nodes to merge from the backedge-taken check block.
3014     PHINode *BCResumeVal = PHINode::Create(
3015         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3016     // Copy original phi DL over to the new one.
3017     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3018     Value *&EndValue = IVEndValues[OrigPhi];
3019     if (OrigPhi == OldInduction) {
3020       // We know what the end value is.
3021       EndValue = CountRoundDown;
3022     } else {
3023       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3024       Type *StepType = II.getStep()->getType();
3025       Instruction::CastOps CastOp =
3026         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3027       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
3029       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3030       EndValue->setName("ind.end");
3031     }
3032 
3033     // The new PHI merges the original incoming value, in case of a bypass,
3034     // or the value at the end of the vectorized loop.
3035     BCResumeVal->addIncoming(EndValue, MiddleBlock);
3036 
3037     // Fix the scalar body counter (PHI node).
3038     // The old induction's phi node in the scalar body needs the truncated
3039     // value.
3040     for (BasicBlock *BB : LoopBypassBlocks)
3041       BCResumeVal->addIncoming(II.getStartValue(), BB);
3042     OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
3043   }
3044 
3045   // We need the OrigLoop (scalar loop part) latch terminator to help
3046   // produce correct debug info for the middle block BB instructions.
3047   // The legality check stage guarantees that the loop will have a single
3048   // latch.
3049   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3050          "Scalar loop latch terminator isn't a branch");
3051   BranchInst *ScalarLatchBr =
3052       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3053 
3054   // Add a check in the middle block to see if we have completed
3055   // all of the iterations in the first vector loop.
3056   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3057   // If tail is to be folded, we know we don't need to run the remainder.
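  // For example, with N = 10 and VF * UF = 4, N - N%VF is 8, which differs
  // from N, so the remaining two iterations run in the scalar loop.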
3058   Value *CmpN = Builder.getTrue();
3059   if (!Cost->foldTailByMasking()) {
3060     CmpN =
3061         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3062                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3063 
3064     // Here we use the same DebugLoc as the scalar loop latch branch instead
3065     // of the corresponding compare because they may have ended up with
3066     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3068     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3069   }
3070 
3071   BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3072   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3073   ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3074 
3075   // Get ready to start creating new instructions into the vectorized body.
3076   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3077 
3078   // Save the state.
3079   LoopVectorPreHeader = Lp->getLoopPreheader();
3080   LoopScalarPreHeader = ScalarPH;
3081   LoopMiddleBlock = MiddleBlock;
3082   LoopExitBlock = ExitBlock;
3083   LoopVectorBody = VecBody;
3084   LoopScalarBody = OldBasicBlock;
3085 
3086   Optional<MDNode *> VectorizedLoopID =
3087       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3088                                       LLVMLoopVectorizeFollowupVectorized});
3089   if (VectorizedLoopID.hasValue()) {
3090     Lp->setLoopID(VectorizedLoopID.getValue());
3091 
3092     // Do not setAlreadyVectorized if loop attributes have been defined
3093     // explicitly.
3094     return LoopVectorPreHeader;
3095   }
3096 
3097   // Keep all loop hints from the original loop on the vector loop (we'll
3098   // replace the vectorizer-specific hints below).
3099   if (MDNode *LID = OrigLoop->getLoopID())
3100     Lp->setLoopID(LID);
3101 
3102   LoopVectorizeHints Hints(Lp, true, *ORE);
3103   Hints.setAlreadyVectorized();
3104 
3105   return LoopVectorPreHeader;
3106 }
3107 
3108 // Fix up external users of the induction variable. At this point, we are
3109 // in LCSSA form, with all external PHIs that use the IV having one input value,
3110 // coming from the remainder loop. We need those PHIs to also have a correct
3111 // value for the IV when arriving directly from the middle block.
3112 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3113                                        const InductionDescriptor &II,
3114                                        Value *CountRoundDown, Value *EndValue,
3115                                        BasicBlock *MiddleBlock) {
3116   // There are two kinds of external IV usages - those that use the value
3117   // computed in the last iteration (the PHI) and those that use the penultimate
3118   // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.
3120 
3121   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3122 
3123   DenseMap<Value *, Value *> MissingVals;
3124 
3125   // An external user of the last iteration's value should see the value that
3126   // the remainder loop uses to initialize its own IV.
3127   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3128   for (User *U : PostInc->users()) {
3129     Instruction *UI = cast<Instruction>(U);
3130     if (!OrigLoop->contains(UI)) {
3131       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3132       MissingVals[UI] = EndValue;
3133     }
3134   }
3135 
  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
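  // For example, for an induction starting at 0 with step 1 and a vector trip
  // count of 8, a user of the phi outside the loop sees 0 + 1 * (8 - 1) = 7,
  // while a user of the post-increment value sees 8.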
3139   for (User *U : OrigPhi->users()) {
3140     auto *UI = cast<Instruction>(U);
3141     if (!OrigLoop->contains(UI)) {
3142       const DataLayout &DL =
3143           OrigLoop->getHeader()->getModule()->getDataLayout();
3144       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3145 
3146       IRBuilder<> B(MiddleBlock->getTerminator());
3147       Value *CountMinusOne = B.CreateSub(
3148           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3149       Value *CMO =
3150           !II.getStep()->getType()->isIntegerTy()
3151               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3152                              II.getStep()->getType())
3153               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3154       CMO->setName("cast.cmo");
3155       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3156       Escape->setName("ind.escape");
3157       MissingVals[UI] = Escape;
3158     }
3159   }
3160 
3161   for (auto &I : MissingVals) {
3162     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3164     // that is %IV2 = phi [...], [ %IV1, %latch ]
3165     // In this case, if IV1 has an external use, we need to avoid adding both
3166     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3167     // don't already have an incoming value for the middle block.
3168     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3169       PHI->addIncoming(I.second, MiddleBlock);
3170   }
3171 }
3172 
3173 namespace {
3174 
3175 struct CSEDenseMapInfo {
3176   static bool canHandle(const Instruction *I) {
3177     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3178            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3179   }
3180 
3181   static inline Instruction *getEmptyKey() {
3182     return DenseMapInfo<Instruction *>::getEmptyKey();
3183   }
3184 
3185   static inline Instruction *getTombstoneKey() {
3186     return DenseMapInfo<Instruction *>::getTombstoneKey();
3187   }
3188 
3189   static unsigned getHashValue(const Instruction *I) {
3190     assert(canHandle(I) && "Unknown instruction!");
3191     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3192                                                            I->value_op_end()));
3193   }
3194 
3195   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3196     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3197         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3198       return LHS == RHS;
3199     return LHS->isIdenticalTo(RHS);
3200   }
3201 };
3202 
3203 } // end anonymous namespace
3204 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3208   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3209   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3210     Instruction *In = &*I++;
3211 
3212     if (!CSEDenseMapInfo::canHandle(In))
3213       continue;
3214 
3215     // Check if we can replace this instruction with any of the
3216     // visited instructions.
3217     if (Instruction *V = CSEMap.lookup(In)) {
3218       In->replaceAllUsesWith(V);
3219       In->eraseFromParent();
3220       continue;
3221     }
3222 
3223     CSEMap[In] = In;
3224   }
3225 }
3226 
3227 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3228                                                        unsigned VF,
3229                                                        bool &NeedToScalarize) {
3230   Function *F = CI->getCalledFunction();
3231   Type *ScalarRetTy = CI->getType();
3232   SmallVector<Type *, 4> Tys, ScalarTys;
3233   for (auto &ArgOp : CI->arg_operands())
3234     ScalarTys.push_back(ArgOp->getType());
3235 
3236   // Estimate cost of scalarized vector call. The source operands are assumed
3237   // to be vectors, so we need to extract individual elements from there,
3238   // execute VF scalar calls, and then gather the result into the vector return
3239   // value.
3240   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3241   if (VF == 1)
3242     return ScalarCallCost;
3243 
3244   // Compute corresponding vector type for return value and arguments.
3245   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3246   for (Type *ScalarTy : ScalarTys)
3247     Tys.push_back(ToVectorTy(ScalarTy, VF));
3248 
3249   // Compute costs of unpacking argument values for the scalar calls and
3250   // packing the return values to a vector.
3251   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3252 
3253   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3254 
3255   // If we can't emit a vector call for this function, then the currently found
3256   // cost is the cost we need to return.
3257   NeedToScalarize = true;
3258   if (!TLI || CI->isNoBuiltin() ||
3259       !VFDatabase(*CI).isFunctionVectorizable(
3260           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/)))
3261     return Cost;
3262 
3263   // If the corresponding vector cost is cheaper, return its cost.
3264   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3265   if (VectorCallCost < Cost) {
3266     NeedToScalarize = false;
3267     return VectorCallCost;
3268   }
3269   return Cost;
3270 }
3271 
3272 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3273                                                             unsigned VF) {
3274   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3275   assert(ID && "Expected intrinsic call!");
3276 
3277   FastMathFlags FMF;
3278   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3279     FMF = FPMO->getFastMathFlags();
3280 
3281   SmallVector<Value *, 4> Operands(CI->arg_operands());
3282   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3283 }
3284 
3285 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3286   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3287   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3288   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3289 }
3290 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3291   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3292   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3293   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3294 }
3295 
3296 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3297   // For every instruction `I` in MinBWs, truncate the operands, create a
3298   // truncated version of `I` and reextend its result. InstCombine runs
3299   // later and will remove any ext/trunc pairs.
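  // For example, if an i32 add is known to need only 8 bits, its operands
  // are truncated to <VF x i8>, the add is recreated at that width, and the
  // result is zero-extended back to <VF x i32>.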
3300   SmallPtrSet<Value *, 4> Erased;
3301   for (const auto &KV : Cost->getMinimalBitwidths()) {
3302     // If the value wasn't vectorized, we must maintain the original scalar
3303     // type. The absence of the value from VectorLoopValueMap indicates that it
3304     // wasn't vectorized.
3305     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3306       continue;
3307     for (unsigned Part = 0; Part < UF; ++Part) {
3308       Value *I = getOrCreateVectorValue(KV.first, Part);
3309       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3310           !isa<Instruction>(I))
3311         continue;
3312       Type *OriginalTy = I->getType();
3313       Type *ScalarTruncatedTy =
3314           IntegerType::get(OriginalTy->getContext(), KV.second);
3315       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3316                                           OriginalTy->getVectorNumElements());
3317       if (TruncatedTy == OriginalTy)
3318         continue;
3319 
3320       IRBuilder<> B(cast<Instruction>(I));
3321       auto ShrinkOperand = [&](Value *V) -> Value * {
3322         if (auto *ZI = dyn_cast<ZExtInst>(V))
3323           if (ZI->getSrcTy() == TruncatedTy)
3324             return ZI->getOperand(0);
3325         return B.CreateZExtOrTrunc(V, TruncatedTy);
3326       };
3327 
3328       // The actual instruction modification depends on the instruction type,
3329       // unfortunately.
3330       Value *NewI = nullptr;
3331       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3332         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3333                              ShrinkOperand(BO->getOperand(1)));
3334 
3335         // Any wrapping introduced by shrinking this operation shouldn't be
3336         // considered undefined behavior. So, we can't unconditionally copy
3337         // arithmetic wrapping flags to NewI.
3338         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3339       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3340         NewI =
3341             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3342                          ShrinkOperand(CI->getOperand(1)));
3343       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3344         NewI = B.CreateSelect(SI->getCondition(),
3345                               ShrinkOperand(SI->getTrueValue()),
3346                               ShrinkOperand(SI->getFalseValue()));
3347       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3348         switch (CI->getOpcode()) {
3349         default:
3350           llvm_unreachable("Unhandled cast!");
3351         case Instruction::Trunc:
3352           NewI = ShrinkOperand(CI->getOperand(0));
3353           break;
3354         case Instruction::SExt:
3355           NewI = B.CreateSExtOrTrunc(
3356               CI->getOperand(0),
3357               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3358           break;
3359         case Instruction::ZExt:
3360           NewI = B.CreateZExtOrTrunc(
3361               CI->getOperand(0),
3362               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3363           break;
3364         }
3365       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3366         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3367         auto *O0 = B.CreateZExtOrTrunc(
3368             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3369         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3370         auto *O1 = B.CreateZExtOrTrunc(
3371             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3372 
3373         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3374       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3375         // Don't do anything with the operands, just extend the result.
3376         continue;
3377       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3378         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3379         auto *O0 = B.CreateZExtOrTrunc(
3380             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3381         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3382         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3383       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3384         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3385         auto *O0 = B.CreateZExtOrTrunc(
3386             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3387         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3388       } else {
3389         // If we don't know what to do, be conservative and don't do anything.
3390         continue;
3391       }
3392 
3393       // Lastly, extend the result.
3394       NewI->takeName(cast<Instruction>(I));
3395       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3396       I->replaceAllUsesWith(Res);
3397       cast<Instruction>(I)->eraseFromParent();
3398       Erased.insert(I);
3399       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3400     }
3401   }
3402 
3403   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3404   for (const auto &KV : Cost->getMinimalBitwidths()) {
3405     // If the value wasn't vectorized, we must maintain the original scalar
3406     // type. The absence of the value from VectorLoopValueMap indicates that it
3407     // wasn't vectorized.
3408     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3409       continue;
3410     for (unsigned Part = 0; Part < UF; ++Part) {
3411       Value *I = getOrCreateVectorValue(KV.first, Part);
3412       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3413       if (Inst && Inst->use_empty()) {
3414         Value *NewI = Inst->getOperand(0);
3415         Inst->eraseFromParent();
3416         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3417       }
3418     }
3419   }
3420 }
3421 
3422 void InnerLoopVectorizer::fixVectorizedLoop() {
3423   // Insert truncates and extends for any truncated instructions as hints to
3424   // InstCombine.
3425   if (VF > 1)
3426     truncateToMinimalBitwidths();
3427 
3428   // Fix widened non-induction PHIs by setting up the PHI operands.
3429   if (OrigPHIsToFix.size()) {
3430     assert(EnableVPlanNativePath &&
3431            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3432     fixNonInductionPHIs();
3433   }
3434 
3435   // At this point every instruction in the original loop is widened to a
3436   // vector form. Now we need to fix the recurrences in the loop. These PHI
3437   // nodes are currently empty because we did not want to introduce cycles.
3438   // This is the second stage of vectorizing recurrences.
3439   fixCrossIterationPHIs();
3440 
3441   // Update the dominator tree.
3442   //
3443   // FIXME: After creating the structure of the new loop, the dominator tree is
3444   //        no longer up-to-date, and it remains that way until we update it
3445   //        here. An out-of-date dominator tree is problematic for SCEV,
3446   //        because SCEVExpander uses it to guide code generation. The
3447   //        vectorizer use SCEVExpanders in several places. Instead, we should
3448   //        keep the dominator tree up-to-date as we go.
3449   updateAnalysis();
3450 
3451   // Fix-up external users of the induction variables.
3452   for (auto &Entry : *Legal->getInductionVars())
3453     fixupIVUsers(Entry.first, Entry.second,
3454                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3455                  IVEndValues[Entry.first], LoopMiddleBlock);
3456 
3457   fixLCSSAPHIs();
3458   for (Instruction *PI : PredicatedInstructions)
3459     sinkScalarOperands(&*PI);
3460 
3461   // Remove redundant induction instructions.
3462   cse(LoopVectorBody);
3463 }
3464 
3465 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3466   // In order to support recurrences we need to be able to vectorize Phi nodes.
3467   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3468   // stage #2: We now need to fix the recurrences by adding incoming edges to
3469   // the currently empty PHI nodes. At this point every instruction in the
3470   // original loop is widened to a vector form so we can use them to construct
3471   // the incoming edges.
3472   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3473     // Handle first-order recurrences and reductions that need to be fixed.
3474     if (Legal->isFirstOrderRecurrence(&Phi))
3475       fixFirstOrderRecurrence(&Phi);
3476     else if (Legal->isReductionVariable(&Phi))
3477       fixReduction(&Phi);
3478   }
3479 }
3480 
3481 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3482   // This is the second phase of vectorizing first-order recurrences. An
3483   // overview of the transformation is described below. Suppose we have the
3484   // following loop.
3485   //
3486   //   for (int i = 0; i < n; ++i)
3487   //     b[i] = a[i] - a[i - 1];
3488   //
3489   // There is a first-order recurrence on "a". For this loop, the shorthand
3490   // scalar IR looks like:
3491   //
3492   //   scalar.ph:
3493   //     s_init = a[-1]
3494   //     br scalar.body
3495   //
3496   //   scalar.body:
3497   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3498   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3499   //     s2 = a[i]
3500   //     b[i] = s2 - s1
3501   //     br cond, scalar.body, ...
3502   //
  // In this example, s1 is a recurrence because its value depends on the
3504   // previous iteration. In the first phase of vectorization, we created a
3505   // temporary value for s1. We now complete the vectorization and produce the
3506   // shorthand vector IR shown below (for VF = 4, UF = 1).
3507   //
3508   //   vector.ph:
3509   //     v_init = vector(..., ..., ..., a[-1])
3510   //     br vector.body
3511   //
3512   //   vector.body
3513   //     i = phi [0, vector.ph], [i+4, vector.body]
3514   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3515   //     v2 = a[i, i+1, i+2, i+3];
3516   //     v3 = vector(v1(3), v2(0, 1, 2))
3517   //     b[i, i+1, i+2, i+3] = v2 - v3
3518   //     br cond, vector.body, middle.block
3519   //
3520   //   middle.block:
3521   //     x = v2(3)
3522   //     br scalar.ph
3523   //
3524   //   scalar.ph:
3525   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3526   //     br scalar.body
3527   //
3528   // After execution completes the vector loop, we extract the next value of
3529   // the recurrence (x) to use as the initial value in the scalar loop.
3530 
3531   // Get the original loop preheader and single loop latch.
3532   auto *Preheader = OrigLoop->getLoopPreheader();
3533   auto *Latch = OrigLoop->getLoopLatch();
3534 
3535   // Get the initial and previous values of the scalar recurrence.
3536   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3537   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3538 
3539   // Create a vector from the initial value.
3540   auto *VectorInit = ScalarInit;
3541   if (VF > 1) {
3542     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3543     VectorInit = Builder.CreateInsertElement(
3544         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3545         Builder.getInt32(VF - 1), "vector.recur.init");
3546   }
3547 
3548   // We constructed a temporary phi node in the first phase of vectorization.
3549   // This phi node will eventually be deleted.
3550   Builder.SetInsertPoint(
3551       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3552 
3553   // Create a phi node for the new recurrence. The current value will either be
3554   // the initial value inserted into a vector or loop-varying vector value.
3555   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3556   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3557 
3558   // Get the vectorized previous value of the last part UF - 1. It appears last
3559   // among all unrolled iterations, due to the order of their construction.
3560   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3561 
3562   // Find and set the insertion point after the previous value if it is an
3563   // instruction.
3564   BasicBlock::iterator InsertPt;
3565   // Note that the previous value may have been constant-folded so it is not
3566   // guaranteed to be an instruction in the vector loop.
3567   // FIXME: Loop invariant values do not form recurrences. We should deal with
3568   //        them earlier.
3569   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3570     InsertPt = LoopVectorBody->getFirstInsertionPt();
3571   else {
3572     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3573     if (isa<PHINode>(PreviousLastPart))
3574       // If the previous value is a phi node, we should insert after all the phi
3575       // nodes in the block containing the PHI to avoid breaking basic block
3576       // verification. Note that the basic block may be different to
3577       // LoopVectorBody, in case we predicate the loop.
3578       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3579     else
3580       InsertPt = ++PreviousInst->getIterator();
3581   }
3582   Builder.SetInsertPoint(&*InsertPt);
3583 
3584   // We will construct a vector for the recurrence by combining the values for
3585   // the current and previous iterations. This is the required shuffle mask.
3586   SmallVector<Constant *, 8> ShuffleMask(VF);
3587   ShuffleMask[0] = Builder.getInt32(VF - 1);
3588   for (unsigned I = 1; I < VF; ++I)
3589     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
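  // For example, with VF = 4 the mask is <3, 4, 5, 6>, selecting the last
  // element of the first shuffle operand and the first three elements of the
  // second, i.e. v3 = vector(v1(3), v2(0, 1, 2)) in the example above.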
3590 
3591   // The vector from which to take the initial value for the current iteration
3592   // (actual or unrolled). Initially, this is the vector phi node.
3593   Value *Incoming = VecPhi;
3594 
3595   // Shuffle the current and previous vector and update the vector parts.
3596   for (unsigned Part = 0; Part < UF; ++Part) {
3597     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3598     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3599     auto *Shuffle =
3600         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3601                                              ConstantVector::get(ShuffleMask))
3602                : Incoming;
3603     PhiPart->replaceAllUsesWith(Shuffle);
3604     cast<Instruction>(PhiPart)->eraseFromParent();
3605     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3606     Incoming = PreviousPart;
3607   }
3608 
3609   // Fix the latch value of the new recurrence in the vector loop.
3610   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3611 
3612   // Extract the last vector element in the middle block. This will be the
3613   // initial value for the recurrence when jumping to the scalar loop.
3614   auto *ExtractForScalar = Incoming;
3615   if (VF > 1) {
3616     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3617     ExtractForScalar = Builder.CreateExtractElement(
3618         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3619   }
3620   // Extract the second last element in the middle block if the
3621   // Phi is used outside the loop. We need to extract the phi itself
3622   // and not the last element (the phi update in the current iteration). This
3623   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3624   // when the scalar loop is not run at all.
3625   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3626   if (VF > 1)
3627     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3628         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second last element when VF > 1.
3633   else if (UF > 1)
3634     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
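       // For example (a sketch, assuming VF == 4 and UF == 1): the scalar loop
       // resumes with lane 3 of the final 'Incoming' vector, while a use of the
       // phi outside the loop receives lane 2, i.e. the value of the recurrence
       // phi itself rather than its update.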
3635 
3636   // Fix the initial value of the original recurrence in the scalar loop.
3637   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3638   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3639   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3640     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3641     Start->addIncoming(Incoming, BB);
3642   }
3643 
3644   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3645   Phi->setName("scalar.recur");
3646 
3647   // Finally, fix users of the recurrence outside the loop. The users will need
3648   // either the last value of the scalar recurrence or the last value of the
3649   // vector recurrence we extracted in the middle block. Since the loop is in
3650   // LCSSA form, we just need to find all the phi nodes for the original scalar
3651   // recurrence in the exit block, and then add an edge for the middle block.
3652   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3653     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3654       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3655     }
3656   }
3657 }
3658 
3659 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3660   Constant *Zero = Builder.getInt32(0);
3661 
3662   // Get its reduction variable descriptor.
3663   assert(Legal->isReductionVariable(Phi) &&
3664          "Unable to find the reduction variable");
3665   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3666 
3667   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3668   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3669   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3670   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3671     RdxDesc.getMinMaxRecurrenceKind();
3672   setDebugLocFromInst(Builder, ReductionStartValue);
3673 
3674   // We need to generate a reduction vector from the incoming scalar.
3675   // To do so, we need to generate the 'identity' vector and override
3676   // one of the elements with the incoming scalar reduction. We need
3677   // to do it in the vector-loop preheader.
3678   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3679 
3680   // This is the vector-clone of the value that leaves the loop.
3681   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3682 
3683   // Find the reduction identity value: zero for addition, or and xor;
3684   // one for multiplication; -1 (all ones) for and.
3685   Value *Identity;
3686   Value *VectorStart;
3687   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3688       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3689     // MinMax reductions have the start value as their identity.
3690     if (VF == 1) {
3691       VectorStart = Identity = ReductionStartValue;
3692     } else {
3693       VectorStart = Identity =
3694         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3695     }
3696   } else {
3697     // Handle other reduction kinds:
3698     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3699         RK, VecTy->getScalarType());
3700     if (VF == 1) {
3701       Identity = Iden;
3702       // This vector is the Identity vector where the first element is the
3703       // incoming scalar reduction.
3704       VectorStart = ReductionStartValue;
3705     } else {
3706       Identity = ConstantVector::getSplat(VF, Iden);
3707 
3708       // This vector is the Identity vector where the first element is the
3709       // incoming scalar reduction.
3710       VectorStart =
3711         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3712     }
3713   }
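       // For example (a sketch, assuming an integer add reduction and VF == 4):
       // Identity is <0, 0, 0, 0> and VectorStart is <StartVal, 0, 0, 0>, so the
       // scalar start value is folded into lane 0 of the first unroll part only.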
3714 
3715   // Fix the vector-loop phi.
3716 
3717   // Reductions do not have to start at zero. They can start with
3718   // any loop invariant values.
3719   BasicBlock *Latch = OrigLoop->getLoopLatch();
3720   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3721   for (unsigned Part = 0; Part < UF; ++Part) {
3722     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3723     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3724     // Make sure to add the reduction start value only to the
3725     // first unroll part.
3726     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3727     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3728     cast<PHINode>(VecRdxPhi)
3729       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3730   }
3731 
3732   // Before each round, move the insertion point right between
3733   // the PHIs and the values we are going to write.
3734   // This allows us to write both PHINodes and the extractelement
3735   // instructions.
3736   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3737 
3738   setDebugLocFromInst(Builder, LoopExitInst);
3739 
3740   // If the tail is folded by masking, the vector value that leaves the loop
3741   // should be a select choosing between the vectorized LoopExitInst and the
3742   // vectorized Phi, instead of the LoopExitInst alone.
3743   if (Cost->foldTailByMasking()) {
3744     for (unsigned Part = 0; Part < UF; ++Part) {
3745       Value *VecLoopExitInst =
3746           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3747       Value *Sel = nullptr;
3748       for (User *U : VecLoopExitInst->users()) {
3749         if (isa<SelectInst>(U)) {
3750           assert(!Sel && "Reduction exit feeding two selects");
3751           Sel = U;
3752         } else
3753           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3754       }
3755       assert(Sel && "Reduction exit feeds no select");
3756       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3757     }
3758   }
3759 
3760   // If the vector reduction can be performed in a smaller type, we truncate
3761   // then extend the loop exit value to enable InstCombine to evaluate the
3762   // entire expression in the smaller type.
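       // For example (a sketch): an add reduction carried in an i32 phi but known
       // to need only 8 bits is truncated to <VF x i8> and immediately extended
       // back to the original vector type; InstCombine can then shrink the whole
       // chain, and the final extend below restores the phi's i32 type.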
3763   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3764     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3765     Builder.SetInsertPoint(
3766         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3767     VectorParts RdxParts(UF);
3768     for (unsigned Part = 0; Part < UF; ++Part) {
3769       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3770       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3771       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3772                                         : Builder.CreateZExt(Trunc, VecTy);
3773       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3774            UI != RdxParts[Part]->user_end();)
3775         if (*UI != Trunc) {
3776           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3777           RdxParts[Part] = Extnd;
3778         } else {
3779           ++UI;
3780         }
3781     }
3782     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3783     for (unsigned Part = 0; Part < UF; ++Part) {
3784       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3785       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3786     }
3787   }
3788 
3789   // Reduce all of the unrolled parts into a single vector.
3790   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3791   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3792 
3793   // The middle block terminator has already been assigned a DebugLoc here (the
3794   // OrigLoop's single latch terminator). We want the whole middle block to
3795   // appear to execute on this line because: (a) it is all compiler generated,
3796   // (b) these instructions are always executed after evaluating the latch
3797   // conditional branch, and (c) other passes may add new predecessors which
3798   // terminate on this line. This is the easiest way to ensure we don't
3799   // accidentally cause an extra step back into the loop while debugging.
3800   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3801   for (unsigned Part = 1; Part < UF; ++Part) {
3802     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3803     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3804       // Floating point operations had to be 'fast' to enable the reduction.
3805       ReducedPartRdx = addFastMathFlag(
3806           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3807                               ReducedPartRdx, "bin.rdx"),
3808           RdxDesc.getFastMathFlags());
3809     else
3810       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3811                                       RdxPart);
3812   }
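       // For example (a sketch, assuming UF == 2 and an integer add reduction):
       // the loop above emits a single 'bin.rdx' add that combines part 1 with
       // part 0; the horizontal reduction of the resulting vector is then done
       // below via createTargetReduction when VF > 1.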
3813 
3814   if (VF > 1) {
3815     bool NoNaN = Legal->hasFunNoNaNAttr();
3816     ReducedPartRdx =
3817         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3818     // If the reduction can be performed in a smaller type, we need to extend
3819     // the reduction to the wider type before we branch to the original loop.
3820     if (Phi->getType() != RdxDesc.getRecurrenceType())
3821       ReducedPartRdx =
3822         RdxDesc.isSigned()
3823         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3824         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3825   }
3826 
3827   // Create a phi node that merges control-flow from the backedge-taken check
3828   // block and the middle block.
3829   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3830                                         LoopScalarPreHeader->getTerminator());
3831   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3832     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3833   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3834 
3835   // Now, we need to fix the users of the reduction variable
3836   // inside and outside of the scalar remainder loop.
3837   // We know that the loop is in LCSSA form. We need to update the
3838   // PHI nodes in the exit blocks.
3839   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3840     // All PHINodes need to have a single entry edge, or two if
3841     // we already fixed them.
3842     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3843 
3844     // We found a reduction value exit-PHI. Update it with the
3845     // incoming bypass edge.
3846     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3847       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3848   } // end of the LCSSA phi scan.
3849 
3850   // Fix the scalar loop reduction variable with the incoming reduction sum
3851   // from the vector body and from the backedge value.
3852   int IncomingEdgeBlockIdx =
3853     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3854   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3855   // Pick the other block.
3856   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3857   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3858   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3859 }
3860 
3861 void InnerLoopVectorizer::fixLCSSAPHIs() {
3862   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3863     if (LCSSAPhi.getNumIncomingValues() == 1) {
3864       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3865       // Non-instruction incoming values have a single value, so lane 0 suffices.
3866       unsigned LastLane = 0;
3867       if (isa<Instruction>(IncomingValue))
3868           LastLane = Cost->isUniformAfterVectorization(
3869                          cast<Instruction>(IncomingValue), VF)
3870                          ? 0
3871                          : VF - 1;
3872       // Can be a loop invariant incoming value or the last scalar value to be
3873       // extracted from the vectorized loop.
3874       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3875       Value *lastIncomingValue =
3876           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3877       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3878     }
3879   }
3880 }
3881 
3882 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3883   // The basic block and loop containing the predicated instruction.
3884   auto *PredBB = PredInst->getParent();
3885   auto *VectorLoop = LI->getLoopFor(PredBB);
3886 
3887   // Initialize a worklist with the operands of the predicated instruction.
3888   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3889 
3890   // Holds instructions that we need to analyze again. An instruction may be
3891   // reanalyzed if we don't yet know if we can sink it or not.
3892   SmallVector<Instruction *, 8> InstsToReanalyze;
3893 
3894   // Returns true if a given use occurs in the predicated block. Phi nodes use
3895   // their operands in their corresponding predecessor blocks.
3896   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3897     auto *I = cast<Instruction>(U.getUser());
3898     BasicBlock *BB = I->getParent();
3899     if (auto *Phi = dyn_cast<PHINode>(I))
3900       BB = Phi->getIncomingBlock(
3901           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3902     return BB == PredBB;
3903   };
3904 
3905   // Iteratively sink the scalarized operands of the predicated instruction
3906   // into the block we created for it. When an instruction is sunk, its
3907   // operands are then added to the worklist. The algorithm ends when one pass
3908   // through the worklist doesn't sink a single instruction.
3909   bool Changed;
3910   do {
3911     // Add the instructions that need to be reanalyzed to the worklist, and
3912     // reset the changed indicator.
3913     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3914     InstsToReanalyze.clear();
3915     Changed = false;
3916 
3917     while (!Worklist.empty()) {
3918       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3919 
3920       // We can't sink an instruction if it is a phi node, is already in the
3921       // predicated block, is not in the loop, or may have side effects.
3922       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3923           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3924         continue;
3925 
3926       // It's legal to sink the instruction if all its uses occur in the
3927       // predicated block. Otherwise, there's nothing to do yet, and we may
3928       // need to reanalyze the instruction.
3929       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3930         InstsToReanalyze.push_back(I);
3931         continue;
3932       }
3933 
3934       // Move the instruction to the beginning of the predicated block, and add
3935       // its operands to the worklist.
3936       I->moveBefore(&*PredBB->getFirstInsertionPt());
3937       Worklist.insert(I->op_begin(), I->op_end());
3938 
3939       // The sinking may have enabled other instructions to be sunk, so we will
3940       // need to iterate.
3941       Changed = true;
3942     }
3943   } while (Changed);
3944 }
3945 
3946 void InnerLoopVectorizer::fixNonInductionPHIs() {
3947   for (PHINode *OrigPhi : OrigPHIsToFix) {
3948     PHINode *NewPhi =
3949         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3950     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3951 
3952     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3953         predecessors(OrigPhi->getParent()));
3954     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3955         predecessors(NewPhi->getParent()));
3956     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3957            "Scalar and Vector BB should have the same number of predecessors");
3958 
3959     // The insertion point in Builder may be invalidated by the time we get
3960     // here. Force the Builder insertion point to something valid so that we do
3961     // not run into issues during insertion point restore in
3962     // getOrCreateVectorValue calls below.
3963     Builder.SetInsertPoint(NewPhi);
3964 
3965     // The predecessor order is preserved and we can rely on mapping between
3966     // scalar and vector block predecessors.
3967     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3968       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3969 
3970       // When looking up the new scalar/vector values to fix up, use incoming
3971       // values from original phi.
3972       Value *ScIncV =
3973           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3974 
3975       // The scalar incoming value may need a broadcast.
3976       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3977       NewPhi->addIncoming(NewIncV, NewPredBB);
3978     }
3979   }
3980 }
3981 
3982 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
3983                                    unsigned VF, bool IsPtrLoopInvariant,
3984                                    SmallBitVector &IsIndexLoopInvariant) {
3985   // Construct a vector GEP by widening the operands of the scalar GEP as
3986   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3987   // results in a vector of pointers when at least one operand of the GEP
3988   // is vector-typed. Thus, to keep the representation compact, we only use
3989   // vector-typed operands for loop-varying values.
3990 
3991   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
3992     // If we are vectorizing, but the GEP has only loop-invariant operands,
3993     // the GEP we build (by only using vector-typed operands for
3994     // loop-varying values) would be a scalar pointer. Thus, to ensure we
3995     // produce a vector of pointers, we need to either arbitrarily pick an
3996     // operand to broadcast, or broadcast a clone of the original GEP.
3997     // Here, we broadcast a clone of the original.
3998     //
3999     // TODO: If at some point we decide to scalarize instructions having
4000     //       loop-invariant operands, this special case will no longer be
4001     //       required. We would add the scalarization decision to
4002     //       collectLoopScalars() and teach getVectorValue() to broadcast
4003     //       the lane-zero scalar value.
4004     auto *Clone = Builder.Insert(GEP->clone());
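         // For example (a sketch, assuming VF == 4): each unroll part gets a
         // vector of pointers whose four lanes are all copies of the cloned
         // scalar GEP.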
4005     for (unsigned Part = 0; Part < UF; ++Part) {
4006       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4007       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4008       addMetadata(EntryPart, GEP);
4009     }
4010   } else {
4011     // If the GEP has at least one loop-varying operand, we are sure to
4012     // produce a vector of pointers. But if we are only unrolling, we want
4013     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4014     // produce with the code below will be scalar (if VF == 1) or vector
4015     // (otherwise). Note that for the unroll-only case, we still maintain
4016     // values in the vector mapping with initVector, as we do for other
4017     // instructions.
4018     for (unsigned Part = 0; Part < UF; ++Part) {
4019       // The pointer operand of the new GEP. If it's loop-invariant, we
4020       // won't broadcast it.
4021       auto *Ptr = IsPtrLoopInvariant
4022                       ? GEP->getPointerOperand()
4023                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4024 
4025       // Collect all the indices for the new GEP. If any index is
4026       // loop-invariant, we won't broadcast it.
4027       SmallVector<Value *, 4> Indices;
4028       for (auto Index : enumerate(GEP->indices())) {
4029         Value *User = Index.value().get();
4030         if (IsIndexLoopInvariant[Index.index()])
4031           Indices.push_back(User);
4032         else
4033           Indices.push_back(getOrCreateVectorValue(User, Part));
4034       }
4035 
4036       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4037       // but it should be a vector, otherwise.
4038       auto *NewGEP =
4039           GEP->isInBounds()
4040               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4041                                           Indices)
4042               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4043       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4044              "NewGEP is not a pointer vector");
4045       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4046       addMetadata(NewGEP, GEP);
4047     }
4048   }
4049 }
4050 
4051 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4052                                               unsigned VF) {
4053   PHINode *P = cast<PHINode>(PN);
4054   if (EnableVPlanNativePath) {
4055     // Currently we enter here in the VPlan-native path for non-induction
4056     // PHIs where all control flow is uniform. We simply widen these PHIs.
4057     // Create a vector phi with no operands - the vector phi operands will be
4058     // set at the end of vector code generation.
4059     Type *VecTy =
4060         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4061     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4062     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4063     OrigPHIsToFix.push_back(P);
4064 
4065     return;
4066   }
4067 
4068   assert(PN->getParent() == OrigLoop->getHeader() &&
4069          "Non-header phis should have been handled elsewhere");
4070 
4071   // In order to support recurrences we need to be able to vectorize Phi nodes.
4072   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4073   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4074   // this value when we vectorize all of the instructions that use the PHI.
4075   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4076     for (unsigned Part = 0; Part < UF; ++Part) {
4077       // This is phase one of vectorizing PHIs.
4078       Type *VecTy =
4079           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4080       Value *EntryPart = PHINode::Create(
4081           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4082       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4083     }
4084     return;
4085   }
4086 
4087   setDebugLocFromInst(Builder, P);
4088 
4089   // This PHINode must be an induction variable.
4090   // Make sure that we know about it.
4091   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4092 
4093   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4094   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4095 
4096   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4097   // which can be found from the original scalar operations.
4098   switch (II.getKind()) {
4099   case InductionDescriptor::IK_NoInduction:
4100     llvm_unreachable("Unknown induction");
4101   case InductionDescriptor::IK_IntInduction:
4102   case InductionDescriptor::IK_FpInduction:
4103     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4104   case InductionDescriptor::IK_PtrInduction: {
4105     // Handle the pointer induction variable case.
4106     assert(P->getType()->isPointerTy() && "Unexpected type.");
4107     // This is the normalized GEP that starts counting at zero.
4108     Value *PtrInd = Induction;
4109     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4110     // Determine the number of scalars we need to generate for each unroll
4111     // iteration. If the instruction is uniform, we only need to generate the
4112     // first lane. Otherwise, we generate all VF values.
4113     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4114     // These are the scalar results. Notice that we don't generate vector GEPs
4115     // because scalar GEPs result in better code.
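         // For example (a sketch, assuming VF == 4, UF == 2, and a non-uniform
         // pointer induction): we emit eight scalar 'next.gep' addresses, one for
         // each logical index PtrInd + 0 .. PtrInd + 7 (each transformed by the
         // induction descriptor).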
4116     for (unsigned Part = 0; Part < UF; ++Part) {
4117       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4118         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4119         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4120         Value *SclrGep =
4121             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4122         SclrGep->setName("next.gep");
4123         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4124       }
4125     }
4126     return;
4127   }
4128   }
4129 }
4130 
4131 /// A helper function for checking whether an integer division-related
4132 /// instruction may divide by zero (in which case it must be predicated if
4133 /// executed conditionally in the scalar code).
4134 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4135 /// Non-zero divisors that are not compile-time constants will not be
4136 /// converted into multiplication, so we will still end up scalarizing
4137 /// the division, but can do so w/o predication.
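     /// For example, 'udiv %a, 7' has a known non-zero divisor and need not be
     /// predicated, whereas 'udiv %a, %b' may divide by zero and must be.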
4138 static bool mayDivideByZero(Instruction &I) {
4139   assert((I.getOpcode() == Instruction::UDiv ||
4140           I.getOpcode() == Instruction::SDiv ||
4141           I.getOpcode() == Instruction::URem ||
4142           I.getOpcode() == Instruction::SRem) &&
4143          "Unexpected instruction");
4144   Value *Divisor = I.getOperand(1);
4145   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4146   return !CInt || CInt->isZero();
4147 }
4148 
4149 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4150   switch (I.getOpcode()) {
4151   case Instruction::Br:
4152   case Instruction::PHI:
4153   case Instruction::GetElementPtr:
4154     llvm_unreachable("This instruction is handled by a different recipe.");
4155   case Instruction::UDiv:
4156   case Instruction::SDiv:
4157   case Instruction::SRem:
4158   case Instruction::URem:
4159   case Instruction::Add:
4160   case Instruction::FAdd:
4161   case Instruction::Sub:
4162   case Instruction::FSub:
4163   case Instruction::FNeg:
4164   case Instruction::Mul:
4165   case Instruction::FMul:
4166   case Instruction::FDiv:
4167   case Instruction::FRem:
4168   case Instruction::Shl:
4169   case Instruction::LShr:
4170   case Instruction::AShr:
4171   case Instruction::And:
4172   case Instruction::Or:
4173   case Instruction::Xor: {
4174     // Just widen unops and binops.
4175     setDebugLocFromInst(Builder, &I);
4176 
4177     for (unsigned Part = 0; Part < UF; ++Part) {
4178       SmallVector<Value *, 2> Ops;
4179       for (Value *Op : I.operands())
4180         Ops.push_back(getOrCreateVectorValue(Op, Part));
4181 
4182       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4183 
4184       if (auto *VecOp = dyn_cast<Instruction>(V))
4185         VecOp->copyIRFlags(&I);
4186 
4187       // Use this vector value for all users of the original instruction.
4188       VectorLoopValueMap.setVectorValue(&I, Part, V);
4189       addMetadata(V, &I);
4190     }
4191 
4192     break;
4193   }
4194   case Instruction::Select: {
4195     // Widen selects.
4196     // If the selector is loop invariant we can create a select
4197     // instruction with a scalar condition. Otherwise, use vector-select.
4198     auto *SE = PSE.getSE();
4199     bool InvariantCond =
4200         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4201     setDebugLocFromInst(Builder, &I);
4202 
4203     // The condition can be loop invariant but still defined inside the
4204     // loop. This means that we can't just use the original 'cond' value.
4205     // We have to take the 'vectorized' value and pick the first lane.
4206     // InstCombine will make this a no-op.
4207 
4208     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4209 
4210     for (unsigned Part = 0; Part < UF; ++Part) {
4211       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4212       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4213       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4214       Value *Sel =
4215           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4216       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4217       addMetadata(Sel, &I);
4218     }
4219 
4220     break;
4221   }
4222 
4223   case Instruction::ICmp:
4224   case Instruction::FCmp: {
4225     // Widen compares. Generate vector compares.
4226     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4227     auto *Cmp = cast<CmpInst>(&I);
4228     setDebugLocFromInst(Builder, Cmp);
4229     for (unsigned Part = 0; Part < UF; ++Part) {
4230       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4231       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4232       Value *C = nullptr;
4233       if (FCmp) {
4234         // Propagate fast math flags.
4235         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4236         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4237         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4238       } else {
4239         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4240       }
4241       VectorLoopValueMap.setVectorValue(&I, Part, C);
4242       addMetadata(C, &I);
4243     }
4244 
4245     break;
4246   }
4247 
4248   case Instruction::ZExt:
4249   case Instruction::SExt:
4250   case Instruction::FPToUI:
4251   case Instruction::FPToSI:
4252   case Instruction::FPExt:
4253   case Instruction::PtrToInt:
4254   case Instruction::IntToPtr:
4255   case Instruction::SIToFP:
4256   case Instruction::UIToFP:
4257   case Instruction::Trunc:
4258   case Instruction::FPTrunc:
4259   case Instruction::BitCast: {
4260     auto *CI = cast<CastInst>(&I);
4261     setDebugLocFromInst(Builder, CI);
4262 
4263     /// Vectorize casts.
4264     Type *DestTy =
4265         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4266 
4267     for (unsigned Part = 0; Part < UF; ++Part) {
4268       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4269       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4270       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4271       addMetadata(Cast, &I);
4272     }
4273     break;
4274   }
4275 
4276   case Instruction::Call: {
4277     // Ignore dbg intrinsics.
4278     if (isa<DbgInfoIntrinsic>(I))
4279       break;
4280     setDebugLocFromInst(Builder, &I);
4281 
4282     Module *M = I.getParent()->getParent()->getParent();
4283     auto *CI = cast<CallInst>(&I);
4284 
4285     SmallVector<Type *, 4> Tys;
4286     for (Value *ArgOperand : CI->arg_operands())
4287       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4288 
4289     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4290 
4291     // This flag shows whether we use an intrinsic or an ordinary call for the
4292     // vectorized version of the instruction, i.e. whether calling the
4293     // intrinsic is more beneficial than calling a library function.
4294     bool NeedToScalarize;
4295     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4296     bool UseVectorIntrinsic =
4297         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4298     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4299            "Instruction should be scalarized elsewhere.");
4300 
4301     for (unsigned Part = 0; Part < UF; ++Part) {
4302       SmallVector<Value *, 4> Args;
4303       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4304         Value *Arg = CI->getArgOperand(i);
4305         // Some intrinsics have a scalar argument - don't replace it with a
4306         // vector.
4307         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4308           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4309         Args.push_back(Arg);
4310       }
4311 
4312       Function *VectorF;
4313       if (UseVectorIntrinsic) {
4314         // Use vector version of the intrinsic.
4315         Type *TysForDecl[] = {CI->getType()};
4316         if (VF > 1)
4317           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4318         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4319       } else {
4320         // Use vector version of the function call.
4321         const VFShape Shape =
4322             VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4323 #ifndef NDEBUG
4324         const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
4325         assert(std::find_if(Infos.begin(), Infos.end(),
4326                             [&Shape](const VFInfo &Info) {
4327                               return Info.Shape == Shape;
4328                             }) != Infos.end() &&
4329                "Vector function shape is missing from the database.");
4330 #endif
4331         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4332       }
4333       assert(VectorF && "Can't create vector function.");
4334 
4335       SmallVector<OperandBundleDef, 1> OpBundles;
4336       CI->getOperandBundlesAsDefs(OpBundles);
4337       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4338 
4339       if (isa<FPMathOperator>(V))
4340         V->copyFastMathFlags(CI);
4341 
4342       VectorLoopValueMap.setVectorValue(&I, Part, V);
4343       addMetadata(V, &I);
4344     }
4345 
4346     break;
4347   }
4348 
4349   default:
4350     // This instruction is not vectorized by simple widening.
4351     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4352     llvm_unreachable("Unhandled instruction!");
4353   } // end of switch.
4354 }
4355 
4356 void InnerLoopVectorizer::updateAnalysis() {
4357   // Forget the original basic block.
4358   PSE.getSE()->forgetLoop(OrigLoop);
4359 
4360   // DT is not kept up-to-date for outer loop vectorization
4361   if (EnableVPlanNativePath)
4362     return;
4363 
4364   // Update the dominator tree information.
4365   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4366          "Entry does not dominate exit.");
4367 
4368   DT->addNewBlock(LoopMiddleBlock,
4369                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4370   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4371   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4372   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4373   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4374 }
4375 
4376 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4377   // We should not collect Scalars more than once per VF. Right now, this
4378   // function is called from collectUniformsAndScalars(), which already does
4379   // this check. Collecting Scalars for VF=1 does not make any sense.
4380   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4381          "This function should not be visited twice for the same VF");
4382 
4383   SmallSetVector<Instruction *, 8> Worklist;
4384 
4385   // These sets are used to seed the analysis with pointers used by memory
4386   // accesses that will remain scalar.
4387   SmallSetVector<Instruction *, 8> ScalarPtrs;
4388   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4389 
4390   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4391   // The pointer operands of loads and stores will be scalar as long as the
4392   // memory access is not a gather or scatter operation. The value operand of a
4393   // store will remain scalar if the store is scalarized.
4394   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4395     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4396     assert(WideningDecision != CM_Unknown &&
4397            "Widening decision should be ready at this moment");
4398     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4399       if (Ptr == Store->getValueOperand())
4400         return WideningDecision == CM_Scalarize;
4401     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4402            "Ptr is neither a value or pointer operand");
4403     return WideningDecision != CM_GatherScatter;
4404   };
4405 
4406   // A helper that returns true if the given value is a bitcast or
4407   // getelementptr instruction contained in the loop.
4408   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4409     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4410             isa<GetElementPtrInst>(V)) &&
4411            !TheLoop->isLoopInvariant(V);
4412   };
4413 
4414   // A helper that evaluates a memory access's use of a pointer. If the use
4415   // will be a scalar use, and the pointer is only used by memory accesses, we
4416   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4417   // PossibleNonScalarPtrs.
4418   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4419     // We only care about bitcast and getelementptr instructions contained in
4420     // the loop.
4421     if (!isLoopVaryingBitCastOrGEP(Ptr))
4422       return;
4423 
4424     // If the pointer has already been identified as scalar (e.g., if it was
4425     // also identified as uniform), there's nothing to do.
4426     auto *I = cast<Instruction>(Ptr);
4427     if (Worklist.count(I))
4428       return;
4429 
4430     // If the use of the pointer will be a scalar use, and all users of the
4431     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4432     // place the pointer in PossibleNonScalarPtrs.
4433     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4434           return isa<LoadInst>(U) || isa<StoreInst>(U);
4435         }))
4436       ScalarPtrs.insert(I);
4437     else
4438       PossibleNonScalarPtrs.insert(I);
4439   };
4440 
4441   // We seed the scalars analysis with three classes of instructions: (1)
4442   // instructions marked uniform-after-vectorization, (2) bitcast and
4443   // getelementptr instructions used by memory accesses requiring a scalar use,
4444   // and (3) pointer induction variables and their update instructions (we
4445   // currently only scalarize these).
4446   //
4447   // (1) Add to the worklist all instructions that have been identified as
4448   // uniform-after-vectorization.
4449   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4450 
4451   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4452   // memory accesses requiring a scalar use. The pointer operands of loads and
4453   // stores will be scalar as long as the memory access is not a gather or
4454   // scatter operation. The value operand of a store will remain scalar if the
4455   // store is scalarized.
4456   for (auto *BB : TheLoop->blocks())
4457     for (auto &I : *BB) {
4458       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4459         evaluatePtrUse(Load, Load->getPointerOperand());
4460       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4461         evaluatePtrUse(Store, Store->getPointerOperand());
4462         evaluatePtrUse(Store, Store->getValueOperand());
4463       }
4464     }
4465   for (auto *I : ScalarPtrs)
4466     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4467       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4468       Worklist.insert(I);
4469     }
4470 
4471   // (3) Add to the worklist all pointer induction variables and their update
4472   // instructions.
4473   //
4474   // TODO: Once we are able to vectorize pointer induction variables we should
4475   //       no longer insert them into the worklist here.
4476   auto *Latch = TheLoop->getLoopLatch();
4477   for (auto &Induction : *Legal->getInductionVars()) {
4478     auto *Ind = Induction.first;
4479     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4480     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4481       continue;
4482     Worklist.insert(Ind);
4483     Worklist.insert(IndUpdate);
4484     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4485     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4486                       << "\n");
4487   }
4488 
4489   // Insert the forced scalars.
4490   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4491   // induction variable when the PHI user is scalarized.
4492   auto ForcedScalar = ForcedScalars.find(VF);
4493   if (ForcedScalar != ForcedScalars.end())
4494     for (auto *I : ForcedScalar->second)
4495       Worklist.insert(I);
4496 
4497   // Expand the worklist by looking through any bitcasts and getelementptr
4498   // instructions we've already identified as scalar. This is similar to the
4499   // expansion step in collectLoopUniforms(); however, here we're only
4500   // expanding to include additional bitcasts and getelementptr instructions.
4501   unsigned Idx = 0;
4502   while (Idx != Worklist.size()) {
4503     Instruction *Dst = Worklist[Idx++];
4504     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4505       continue;
4506     auto *Src = cast<Instruction>(Dst->getOperand(0));
4507     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4508           auto *J = cast<Instruction>(U);
4509           return !TheLoop->contains(J) || Worklist.count(J) ||
4510                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4511                   isScalarUse(J, Src));
4512         })) {
4513       Worklist.insert(Src);
4514       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4515     }
4516   }
4517 
4518   // An induction variable will remain scalar if all users of the induction
4519   // variable and induction variable update remain scalar.
4520   for (auto &Induction : *Legal->getInductionVars()) {
4521     auto *Ind = Induction.first;
4522     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4523 
4524     // We already considered pointer induction variables, so there's no reason
4525     // to look at their users again.
4526     //
4527     // TODO: Once we are able to vectorize pointer induction variables we
4528     //       should no longer skip over them here.
4529     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4530       continue;
4531 
4532     // Determine if all users of the induction variable are scalar after
4533     // vectorization.
4534     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4535       auto *I = cast<Instruction>(U);
4536       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4537     });
4538     if (!ScalarInd)
4539       continue;
4540 
4541     // Determine if all users of the induction variable update instruction are
4542     // scalar after vectorization.
4543     auto ScalarIndUpdate =
4544         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4545           auto *I = cast<Instruction>(U);
4546           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4547         });
4548     if (!ScalarIndUpdate)
4549       continue;
4550 
4551     // The induction variable and its update instruction will remain scalar.
4552     Worklist.insert(Ind);
4553     Worklist.insert(IndUpdate);
4554     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4555     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4556                       << "\n");
4557   }
4558 
4559   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4560 }
4561 
4562 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4563   if (!blockNeedsPredication(I->getParent()))
4564     return false;
4565   switch(I->getOpcode()) {
4566   default:
4567     break;
4568   case Instruction::Load:
4569   case Instruction::Store: {
4570     if (!Legal->isMaskRequired(I))
4571       return false;
4572     auto *Ptr = getLoadStorePointerOperand(I);
4573     auto *Ty = getMemInstValueType(I);
4574     // We have already decided how to vectorize this instruction, get that
4575     // result.
4576     if (VF > 1) {
4577       InstWidening WideningDecision = getWideningDecision(I, VF);
4578       assert(WideningDecision != CM_Unknown &&
4579              "Widening decision should be ready at this moment");
4580       return WideningDecision == CM_Scalarize;
4581     }
4582     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4583     return isa<LoadInst>(I) ?
4584         !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
4585       : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
4586   }
4587   case Instruction::UDiv:
4588   case Instruction::SDiv:
4589   case Instruction::SRem:
4590   case Instruction::URem:
4591     return mayDivideByZero(*I);
4592   }
4593   return false;
4594 }
4595 
4596 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4597                                                                unsigned VF) {
4598   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4599   assert(getWideningDecision(I, VF) == CM_Unknown &&
4600          "Decision should not be set yet.");
4601   auto *Group = getInterleavedAccessGroup(I);
4602   assert(Group && "Must have a group.");
4603 
4604   // If the instruction's allocated size doesn't equal its type size, it
4605   // requires padding and will be scalarized.
4606   auto &DL = I->getModule()->getDataLayout();
4607   auto *ScalarTy = getMemInstValueType(I);
4608   if (hasIrregularType(ScalarTy, DL, VF))
4609     return false;
4610 
4611   // Check if masking is required.
4612   // A Group may need masking for one of two reasons: it resides in a block that
4613   // needs predication, or it was decided to use masking to deal with gaps.
4614   bool PredicatedAccessRequiresMasking =
4615       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4616   bool AccessWithGapsRequiresMasking =
4617       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4618   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4619     return true;
4620 
4621   // If masked interleaving is required, we expect that the user/target had
4622   // enabled it, because otherwise it either wouldn't have been created or
4623   // it should have been invalidated by the CostModel.
4624   assert(useMaskedInterleavedAccesses(TTI) &&
4625          "Masked interleave-groups for predicated accesses are not enabled.");
4626 
4627   auto *Ty = getMemInstValueType(I);
4628   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4629   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4630                           : TTI.isLegalMaskedStore(Ty, Alignment);
4631 }
4632 
4633 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4634                                                                unsigned VF) {
4635   // Get and ensure we have a valid memory instruction.
4636   LoadInst *LI = dyn_cast<LoadInst>(I);
4637   StoreInst *SI = dyn_cast<StoreInst>(I);
4638   assert((LI || SI) && "Invalid memory instruction");
4639 
4640   auto *Ptr = getLoadStorePointerOperand(I);
4641 
4642   // In order to be widened, the pointer should be consecutive, first of all.
4643   if (!Legal->isConsecutivePtr(Ptr))
4644     return false;
4645 
4646   // If the instruction is a store located in a predicated block, it will be
4647   // scalarized.
4648   if (isScalarWithPredication(I))
4649     return false;
4650 
4651   // If the instruction's allocated size doesn't equal its type size, it
4652   // requires padding and will be scalarized.
4653   auto &DL = I->getModule()->getDataLayout();
4654   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4655   if (hasIrregularType(ScalarTy, DL, VF))
4656     return false;
4657 
4658   return true;
4659 }
4660 
4661 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4662   // We should not collect Uniforms more than once per VF. Right now,
4663   // this function is called from collectUniformsAndScalars(), which
4664   // already does this check. Collecting Uniforms for VF=1 does not make any
4665   // sense.
4666 
4667   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4668          "This function should not be visited twice for the same VF");
4669 
4670   // Initialize the entry for this VF. Even if no uniform value is found,
4671   // Uniforms.count(VF) will return 1, so we do not analyze this VF again.
4672   Uniforms[VF].clear();
4673 
4674   // We now know that the loop is vectorizable!
4675   // Collect instructions inside the loop that will remain uniform after
4676   // vectorization.
4677 
4678   // Global values, parameters, and instructions outside of the current loop
4679   // are out of scope.
4680   auto isOutOfScope = [&](Value *V) -> bool {
4681     Instruction *I = dyn_cast<Instruction>(V);
4682     return (!I || !TheLoop->contains(I));
4683   };
4684 
4685   SetVector<Instruction *> Worklist;
4686   BasicBlock *Latch = TheLoop->getLoopLatch();
4687 
4688   // Instructions that are scalar with predication must not be considered
4689   // uniform after vectorization, because that would create an erroneous
4690   // replicating region where only a single instance out of VF should be formed.
4691   // TODO: optimize such seldom cases if found important, see PR40816.
4692   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4693     if (isScalarWithPredication(I, VF)) {
4694       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4695                         << *I << "\n");
4696       return;
4697     }
4698     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4699     Worklist.insert(I);
4700   };
4701 
4702   // Start with the conditional branch. If the branch condition is an
4703   // instruction contained in the loop that is only used by the branch, it is
4704   // uniform.
4705   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4706   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4707     addToWorklistIfAllowed(Cmp);
4708 
4709   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4710   // are pointers that are treated like consecutive pointers during
4711   // vectorization. The pointer operands of interleaved accesses are an
4712   // example.
4713   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4714 
4715   // Holds pointer operands of instructions that are possibly non-uniform.
4716   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4717 
4718   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4719     InstWidening WideningDecision = getWideningDecision(I, VF);
4720     assert(WideningDecision != CM_Unknown &&
4721            "Widening decision should be ready at this moment");
4722 
4723     return (WideningDecision == CM_Widen ||
4724             WideningDecision == CM_Widen_Reverse ||
4725             WideningDecision == CM_Interleave);
4726   };
4727   // Iterate over the instructions in the loop, and collect all
4728   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4729   // that a consecutive-like pointer operand will be scalarized, we collect it
4730   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4731   // getelementptr instruction can be used by both vectorized and scalarized
4732   // memory instructions. For example, if a loop loads and stores from the same
4733   // location, but the store is conditional, the store will be scalarized, and
4734   // the getelementptr won't remain uniform.
4735   for (auto *BB : TheLoop->blocks())
4736     for (auto &I : *BB) {
4737       // If there's no pointer operand, there's nothing to do.
4738       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4739       if (!Ptr)
4740         continue;
4741 
4742       // True if all users of Ptr are memory accesses that have Ptr as their
4743       // pointer operand.
4744       auto UsersAreMemAccesses =
4745           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4746             return getLoadStorePointerOperand(U) == Ptr;
4747           });
4748 
4749       // Ensure the memory instruction will not be scalarized or used by
4750       // gather/scatter, making its pointer operand non-uniform. If the pointer
4751       // operand is used by any instruction other than a memory access, we
4752       // conservatively assume the pointer operand may be non-uniform.
4753       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4754         PossibleNonUniformPtrs.insert(Ptr);
4755 
4756       // If the memory instruction will be vectorized and its pointer operand
4757       // is consecutive-like, or interleaving - the pointer operand should
4758       // remain uniform.
4759       else
4760         ConsecutiveLikePtrs.insert(Ptr);
4761     }
4762 
4763   // Add to the Worklist all consecutive and consecutive-like pointers that
4764   // aren't also identified as possibly non-uniform.
4765   for (auto *V : ConsecutiveLikePtrs)
4766     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4767       addToWorklistIfAllowed(V);
4768 
4769   // Expand Worklist in topological order: whenever a new instruction
4770   // is added, its users should already be inside Worklist. This ensures
4771   // that a uniform instruction will only be used by uniform instructions.
4772   unsigned idx = 0;
4773   while (idx != Worklist.size()) {
4774     Instruction *I = Worklist[idx++];
4775 
4776     for (auto OV : I->operand_values()) {
4777       // isOutOfScope operands cannot be uniform instructions.
4778       if (isOutOfScope(OV))
4779         continue;
4780       // First order recurrence Phi's should typically be considered
4781       // non-uniform.
4782       auto *OP = dyn_cast<PHINode>(OV);
4783       if (OP && Legal->isFirstOrderRecurrence(OP))
4784         continue;
4785       // If all the users of the operand are uniform, then add the
4786       // operand into the uniform worklist.
4787       auto *OI = cast<Instruction>(OV);
4788       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4789             auto *J = cast<Instruction>(U);
4790             return Worklist.count(J) ||
4791                    (OI == getLoadStorePointerOperand(J) &&
4792                     isUniformDecision(J, VF));
4793           }))
4794         addToWorklistIfAllowed(OI);
4795     }
4796   }
4797 
4798   // Returns true if Ptr is the pointer operand of a memory access instruction
4799   // I, and I is known to not require scalarization.
4800   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4801     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4802   };
4803 
4804   // For an instruction to be added into Worklist above, all its users inside
4805   // the loop should also be in Worklist. However, this condition cannot be
4806   // true for phi nodes that form a cyclic dependence. We must process phi
4807   // nodes separately. An induction variable will remain uniform if all users
4808   // of the induction variable and induction variable update remain uniform.
4809   // The code below handles both pointer and non-pointer induction variables.
4810   for (auto &Induction : *Legal->getInductionVars()) {
4811     auto *Ind = Induction.first;
4812     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4813 
4814     // Determine if all users of the induction variable are uniform after
4815     // vectorization.
4816     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4817       auto *I = cast<Instruction>(U);
4818       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4819              isVectorizedMemAccessUse(I, Ind);
4820     });
4821     if (!UniformInd)
4822       continue;
4823 
4824     // Determine if all users of the induction variable update instruction are
4825     // uniform after vectorization.
4826     auto UniformIndUpdate =
4827         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4828           auto *I = cast<Instruction>(U);
4829           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4830                  isVectorizedMemAccessUse(I, IndUpdate);
4831         });
4832     if (!UniformIndUpdate)
4833       continue;
4834 
4835     // The induction variable and its update instruction will remain uniform.
4836     addToWorklistIfAllowed(Ind);
4837     addToWorklistIfAllowed(IndUpdate);
4838   }
4839 
4840   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4841 }
4842 
4843 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4844   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4845 
4846   if (Legal->getRuntimePointerChecking()->Need) {
4847     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4848         "runtime pointer checks needed. Enable vectorization of this "
4849         "loop with '#pragma clang loop vectorize(enable)' when "
4850         "compiling with -Os/-Oz",
4851         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4852     return true;
4853   }
4854 
4855   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4856     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4857         "runtime SCEV checks needed. Enable vectorization of this "
4858         "loop with '#pragma clang loop vectorize(enable)' when "
4859         "compiling with -Os/-Oz",
4860         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4861     return true;
4862   }
4863 
4864   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4865   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4866     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4867         "runtime stride == 1 checks needed. Enable vectorization of "
4868         "this loop with '#pragma clang loop vectorize(enable)' when "
4869         "compiling with -Os/-Oz",
4870         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4871     return true;
4872   }
4873 
4874   return false;
4875 }
4876 
4877 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4878   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
4881     reportVectorizationFailure(
4882         "Not inserting runtime ptr check for divergent target",
4883         "runtime pointer checks needed. Not enabled for divergent target",
4884         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4885     return None;
4886   }
4887 
4888   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4889   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4890   if (TC == 1) {
4891     reportVectorizationFailure("Single iteration (non) loop",
4892         "loop trip count is one, irrelevant for vectorization",
4893         "SingleIterationLoop", ORE, TheLoop);
4894     return None;
4895   }
4896 
4897   switch (ScalarEpilogueStatus) {
4898   case CM_ScalarEpilogueAllowed:
4899     return computeFeasibleMaxVF(TC);
4900   case CM_ScalarEpilogueNotNeededUsePredicate:
4901     LLVM_DEBUG(
4902         dbgs() << "LV: vector predicate hint/switch found.\n"
4903                << "LV: Not allowing scalar epilogue, creating predicated "
4904                << "vector loop.\n");
4905     break;
4906   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4907     // fallthrough as a special case of OptForSize
4908   case CM_ScalarEpilogueNotAllowedOptSize:
4909     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4910       LLVM_DEBUG(
4911           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4912     else
4913       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4914                         << "count.\n");
4915 
    // Bail out if runtime checks are required; they are undesirable when
    // optimizing for size.
4918     if (runtimeChecksRequired())
4919       return None;
4920     break;
4921   }
4922 
  // Now try to fold the tail by masking.
4924 
4925   // Invalidate interleave groups that require an epilogue if we can't mask
4926   // the interleave-group.
4927   if (!useMaskedInterleavedAccesses(TTI))
4928     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4929 
4930   unsigned MaxVF = computeFeasibleMaxVF(TC);
4931   if (TC > 0 && TC % MaxVF == 0) {
4932     // Accept MaxVF if we do not have a tail.
4933     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4934     return MaxVF;
4935   }
4936 
4937   // If we don't know the precise trip count, or if the trip count that we
4938   // found modulo the vectorization factor is not zero, try to fold the tail
4939   // by masking.
4940   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
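  // For example (illustrative numbers only): a known trip count of 12 with
  // MaxVF = 8 leaves a tail of 4 iterations; folding that tail by masking
  // lets the vector loop run ceil(12 / 8) = 2 iterations, with the inactive
  // lanes of the last iteration masked off.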
4941   if (Legal->prepareToFoldTailByMasking()) {
4942     FoldTailByMasking = true;
4943     return MaxVF;
4944   }
4945 
4946   if (TC == 0) {
4947     reportVectorizationFailure(
4948         "Unable to calculate the loop count due to complex control flow",
4949         "unable to calculate the loop count due to complex control flow",
4950         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4951     return None;
4952   }
4953 
4954   reportVectorizationFailure(
4955       "Cannot optimize for size and vectorize at the same time.",
4956       "cannot optimize for size and vectorize at the same time. "
4957       "Enable vectorization of this loop with '#pragma clang loop "
4958       "vectorize(enable)' when compiling with -Os/-Oz",
4959       "NoTailLoopWithOptForSize", ORE, TheLoop);
4960   return None;
4961 }
4962 
4963 unsigned
4964 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4965   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4966   unsigned SmallestType, WidestType;
4967   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4968   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4969 
4970   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
4973   // dependence distance).
4974   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4975 
4976   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4977 
4978   unsigned MaxVectorSize = WidestRegister / WidestType;
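  // For example (illustrative): a 256-bit widest register and a widest type
  // of 32 bits give a MaxVectorSize of 256 / 32 = 8 elements.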
4979 
4980   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4981                     << " / " << WidestType << " bits.\n");
4982   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4983                     << WidestRegister << " bits.\n");
4984 
4985   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4986                                  " into one vector!");
4987   if (MaxVectorSize == 0) {
4988     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4989     MaxVectorSize = 1;
4990     return MaxVectorSize;
4991   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4992              isPowerOf2_32(ConstTripCount)) {
4993     // We need to clamp the VF to be the ConstTripCount. There is no point in
4994     // choosing a higher viable VF as done in the loop below.
4995     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4996                       << ConstTripCount << "\n");
4997     MaxVectorSize = ConstTripCount;
4998     return MaxVectorSize;
4999   }
5000 
5001   unsigned MaxVF = MaxVectorSize;
5002   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5003       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5004     // Collect all viable vectorization factors larger than the default MaxVF
5005     // (i.e. MaxVectorSize).
5006     SmallVector<unsigned, 8> VFs;
5007     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5008     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5009       VFs.push_back(VS);
5010 
5011     // For each VF calculate its register usage.
5012     auto RUs = calculateRegisterUsage(VFs);
5013 
5014     // Select the largest VF which doesn't require more registers than existing
5015     // ones.
5016     for (int i = RUs.size() - 1; i >= 0; --i) {
5017       bool Selected = true;
5018       for (auto& pair : RUs[i].MaxLocalUsers) {
5019         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5020         if (pair.second > TargetNumRegisters)
5021           Selected = false;
5022       }
5023       if (Selected) {
5024         MaxVF = VFs[i];
5025         break;
5026       }
5027     }
5028     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5029       if (MaxVF < MinVF) {
5030         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5031                           << ") with target's minimum: " << MinVF << '\n');
5032         MaxVF = MinVF;
5033       }
5034     }
5035   }
5036   return MaxVF;
5037 }
5038 
5039 VectorizationFactor
5040 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5041   float Cost = expectedCost(1).first;
5042   const float ScalarCost = Cost;
5043   unsigned Width = 1;
5044   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5045 
5046   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5047   if (ForceVectorization && MaxVF > 1) {
5048     // Ignore scalar width, because the user explicitly wants vectorization.
5049     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5050     // evaluation.
5051     Cost = std::numeric_limits<float>::max();
5052   }
5053 
5054   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
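    // For example (illustrative): an expected vector-loop cost of 24 at a
    // width of 4 gives a per-lane cost of 6, which is what gets compared
    // against the running best cost.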
5058     VectorizationCostTy C = expectedCost(i);
5059     float VectorCost = C.first / (float)i;
5060     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5061                       << " costs: " << (int)VectorCost << ".\n");
5062     if (!C.second && !ForceVectorization) {
5063       LLVM_DEBUG(
5064           dbgs() << "LV: Not considering vector loop of width " << i
5065                  << " because it will not generate any vector instructions.\n");
5066       continue;
5067     }
5068     if (VectorCost < Cost) {
5069       Cost = VectorCost;
5070       Width = i;
5071     }
5072   }
5073 
5074   if (!EnableCondStoresVectorization && NumPredStores) {
5075     reportVectorizationFailure("There are conditional stores.",
5076         "store that is conditionally executed prevents vectorization",
5077         "ConditionalStore", ORE, TheLoop);
5078     Width = 1;
5079     Cost = ScalarCost;
5080   }
5081 
5082   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5083              << "LV: Vectorization seems to be not beneficial, "
5084              << "but was forced by a user.\n");
5085   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5086   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5087   return Factor;
5088 }
5089 
5090 std::pair<unsigned, unsigned>
5091 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5092   unsigned MinWidth = -1U;
5093   unsigned MaxWidth = 8;
5094   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5095 
5096   // For each block.
5097   for (BasicBlock *BB : TheLoop->blocks()) {
5098     // For each instruction in the loop.
5099     for (Instruction &I : BB->instructionsWithoutDebug()) {
5100       Type *T = I.getType();
5101 
5102       // Skip ignored values.
5103       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5104         continue;
5105 
5106       // Only examine Loads, Stores and PHINodes.
5107       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5108         continue;
5109 
5110       // Examine PHI nodes that are reduction variables. Update the type to
5111       // account for the recurrence type.
5112       if (auto *PN = dyn_cast<PHINode>(&I)) {
5113         if (!Legal->isReductionVariable(PN))
5114           continue;
5115         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5116         T = RdxDesc.getRecurrenceType();
5117       }
5118 
5119       // Examine the stored values.
5120       if (auto *ST = dyn_cast<StoreInst>(&I))
5121         T = ST->getValueOperand()->getType();
5122 
5123       // Ignore loaded pointer types and stored pointer types that are not
5124       // vectorizable.
5125       //
5126       // FIXME: The check here attempts to predict whether a load or store will
5127       //        be vectorized. We only know this for certain after a VF has
5128       //        been selected. Here, we assume that if an access can be
5129       //        vectorized, it will be. We should also look at extending this
5130       //        optimization to non-pointer types.
5131       //
5132       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5133           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5134         continue;
5135 
5136       MinWidth = std::min(MinWidth,
5137                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5138       MaxWidth = std::max(MaxWidth,
5139                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5140     }
5141   }
5142 
5143   return {MinWidth, MaxWidth};
5144 }
5145 
5146 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5147                                                            unsigned LoopCost) {
5148   // -- The interleave heuristics --
5149   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5150   // There are many micro-architectural considerations that we can't predict
5151   // at this level. For example, frontend pressure (on decode or fetch) due to
5152   // code size, or the number and capabilities of the execution ports.
5153   //
5154   // We use the following heuristics to select the interleave count:
5155   // 1. If the code has reductions, then we interleave to break the cross
5156   // iteration dependency.
5157   // 2. If the loop is really small, then we interleave to reduce the loop
5158   // overhead.
5159   // 3. We don't interleave if we think that we will spill registers to memory
5160   // due to the increased register pressure.
5161 
5162   if (!isScalarEpilogueAllowed())
5163     return 1;
5164 
  // We already used the maximum safe dependence distance to limit the VF, so
  // do not interleave.
5166   if (Legal->getMaxSafeDepDistBytes() != -1U)
5167     return 1;
5168 
5169   // Do not interleave loops with a relatively small known or estimated trip
5170   // count.
5171   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5172   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5173     return 1;
5174 
5175   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants, so assume that we have at least one
  // instruction that uses at least one register.
5178   for (auto& pair : R.MaxLocalUsers) {
5179     pair.second = std::max(pair.second, 1U);
5180   }
5181 
5182   // We calculate the interleave count using the following formula.
5183   // Subtract the number of loop invariants from the number of available
5184   // registers. These registers are used by all of the interleaved instances.
5185   // Next, divide the remaining registers by the number of registers that is
5186   // required by the loop, in order to estimate how many parallel instances
5187   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when OptForSize, in which case IC is set
  // to 1 above.
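  // For example (illustrative numbers only): with 32 registers in a class,
  // 2 of them holding loop-invariant values and a maximum local usage of 6
  // registers, the basic estimate is PowerOf2Floor((32 - 2) / 6) =
  // PowerOf2Floor(5) = 4 interleaved instances.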
5193   unsigned IC = UINT_MAX;
5194 
5195   for (auto& pair : R.MaxLocalUsers) {
5196     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5197     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5198                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5200     if (VF == 1) {
5201       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5202         TargetNumRegisters = ForceTargetNumScalarRegs;
5203     } else {
5204       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5205         TargetNumRegisters = ForceTargetNumVectorRegs;
5206     }
5207     unsigned MaxLocalUsers = pair.second;
5208     unsigned LoopInvariantRegs = 0;
5209     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5210       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5211 
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
5213     // Don't count the induction variable as interleaved.
5214     if (EnableIndVarRegisterHeur) {
5215       TmpIC =
5216           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5217                         std::max(1U, (MaxLocalUsers - 1)));
5218     }
5219 
5220     IC = std::min(IC, TmpIC);
5221   }
5222 
5223   // Clamp the interleave ranges to reasonable counts.
5224   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5225 
5226   // Check if the user has overridden the max.
5227   if (VF == 1) {
5228     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5229       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5230   } else {
5231     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5232       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5233   }
5234 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to at most the trip count divided by VF.
5237   if (BestKnownTC) {
5238     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5239   }
5240 
5241   // If we did not calculate the cost for VF (because the user selected the VF)
5242   // then we calculate the cost of VF here.
5243   if (LoopCost == 0)
5244     LoopCost = expectedCost(VF).first;
5245 
5246   assert(LoopCost && "Non-zero loop cost expected");
5247 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5250   if (IC > MaxInterleaveCount)
5251     IC = MaxInterleaveCount;
5252   else if (IC < 1)
5253     IC = 1;
5254 
5255   // Interleave if we vectorized this loop and there is a reduction that could
5256   // benefit from interleaving.
5257   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5258     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5259     return IC;
5260   }
5261 
5262   // Note that if we've already vectorized the loop we will have done the
5263   // runtime check and so interleaving won't require further checks.
5264   bool InterleavingRequiresRuntimePointerCheck =
5265       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5266 
5267   // We want to interleave small loops in order to reduce the loop overhead and
5268   // potentially expose ILP opportunities.
5269   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5270   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the per-iteration loop overhead has a cost of 1, use the
    // cost model to estimate the cost of the loop body, and interleave until
    // the loop overhead is about 5% of the total cost of the loop.
5274     unsigned SmallIC =
5275         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
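    // For example, assuming the default SmallLoopCost of 20 (illustrative):
    // a loop body with cost 6 gives PowerOf2Floor(20 / 6) = 2, capping
    // SmallIC at two interleaved copies.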
5276 
5277     // Interleave until store/load ports (estimated by max interleave count) are
5278     // saturated.
5279     unsigned NumStores = Legal->getNumStores();
5280     unsigned NumLoads = Legal->getNumLoads();
5281     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5282     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5283 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2 so
    // that the critical path only gets increased by one reduction operation.
5288     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5289       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5290       SmallIC = std::min(SmallIC, F);
5291       StoresIC = std::min(StoresIC, F);
5292       LoadsIC = std::min(LoadsIC, F);
5293     }
5294 
5295     if (EnableLoadStoreRuntimeInterleave &&
5296         std::max(StoresIC, LoadsIC) > SmallIC) {
5297       LLVM_DEBUG(
5298           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5299       return std::max(StoresIC, LoadsIC);
5300     }
5301 
5302     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5303     return SmallIC;
5304   }
5305 
5306   // Interleave if this is a large loop (small loops are already dealt with by
5307   // this point) that could benefit from interleaving.
5308   bool HasReductions = !Legal->getReductionVars()->empty();
5309   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5310     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5311     return IC;
5312   }
5313 
5314   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5315   return 1;
5316 }
5317 
5318 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5319 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in a topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
5324   // met before their users. We assume that each instruction that has in-loop
5325   // users starts an interval. We record every time that an in-loop value is
5326   // used, so we have a list of the first and last occurrences of each
5327   // instruction. Next, we transpose this data structure into a multi map that
5328   // holds the list of intervals that *end* at a specific location. This multi
5329   // map allows us to perform a linear search. We scan the instructions linearly
5330   // and record each time that a new interval starts, by placing it in a set.
5331   // If we find this value in the multi-map then we remove it from the set.
5332   // The max register usage is the maximum size of the set.
5333   // We also search for instructions that are defined outside the loop, but are
5334   // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
5337   LoopBlocksDFS DFS(TheLoop);
5338   DFS.perform(LI);
5339 
5340   RegisterUsage RU;
5341 
5342   // Each 'key' in the map opens a new interval. The values
5343   // of the map are the index of the 'last seen' usage of the
5344   // instruction that is the key.
5345   using IntervalMap = DenseMap<Instruction *, unsigned>;
5346 
5347   // Maps instruction to its index.
5348   SmallVector<Instruction *, 64> IdxToInstr;
5349   // Marks the end of each interval.
5350   IntervalMap EndPoint;
  // Saves the set of instructions that are used in the loop.
5352   SmallPtrSet<Instruction *, 8> Ends;
5353   // Saves the list of values that are used in the loop but are
5354   // defined outside the loop, such as arguments and constants.
5355   SmallPtrSet<Value *, 8> LoopInvariants;
5356 
5357   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5358     for (Instruction &I : BB->instructionsWithoutDebug()) {
5359       IdxToInstr.push_back(&I);
5360 
5361       // Save the end location of each USE.
5362       for (Value *U : I.operands()) {
5363         auto *Instr = dyn_cast<Instruction>(U);
5364 
5365         // Ignore non-instruction values such as arguments, constants, etc.
5366         if (!Instr)
5367           continue;
5368 
5369         // If this instruction is outside the loop then record it and continue.
5370         if (!TheLoop->contains(Instr)) {
5371           LoopInvariants.insert(Instr);
5372           continue;
5373         }
5374 
5375         // Overwrite previous end points.
5376         EndPoint[Instr] = IdxToInstr.size();
5377         Ends.insert(Instr);
5378       }
5379     }
5380   }
5381 
5382   // Saves the list of intervals that end with the index in 'key'.
5383   using InstrList = SmallVector<Instruction *, 2>;
5384   DenseMap<unsigned, InstrList> TransposeEnds;
5385 
5386   // Transpose the EndPoints to a list of values that end at each index.
5387   for (auto &Interval : EndPoint)
5388     TransposeEnds[Interval.second].push_back(Interval.first);
5389 
5390   SmallPtrSet<Instruction *, 8> OpenIntervals;
5391 
5392   // Get the size of the widest register.
5393   unsigned MaxSafeDepDist = -1U;
5394   if (Legal->getMaxSafeDepDistBytes() != -1U)
5395     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5396   unsigned WidestRegister =
5397       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5398   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5399 
5400   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5401   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5402 
5403   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5404 
5405   // A lambda that gets the register usage for the given type and VF.
5406   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5407     if (Ty->isTokenTy())
5408       return 0U;
5409     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5410     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5411   };
5412 
5413   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5414     Instruction *I = IdxToInstr[i];
5415 
5416     // Remove all of the instructions that end at this location.
5417     InstrList &List = TransposeEnds[i];
5418     for (Instruction *ToRemove : List)
5419       OpenIntervals.erase(ToRemove);
5420 
5421     // Ignore instructions that are never used within the loop.
5422     if (Ends.find(I) == Ends.end())
5423       continue;
5424 
5425     // Skip ignored values.
5426     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5427       continue;
5428 
5429     // For each VF find the maximum usage of registers.
5430     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5431       // Count the number of live intervals.
5432       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5433 
5434       if (VFs[j] == 1) {
5435         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
5437           if (RegUsage.find(ClassID) == RegUsage.end())
5438             RegUsage[ClassID] = 1;
5439           else
5440             RegUsage[ClassID] += 1;
5441         }
5442       } else {
5443         collectUniformsAndScalars(VFs[j]);
5444         for (auto Inst : OpenIntervals) {
5445           // Skip ignored values for VF > 1.
5446           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5447             continue;
5448           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
5450             if (RegUsage.find(ClassID) == RegUsage.end())
5451               RegUsage[ClassID] = 1;
5452             else
5453               RegUsage[ClassID] += 1;
5454           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
5456             if (RegUsage.find(ClassID) == RegUsage.end())
5457               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5458             else
5459               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5460           }
5461         }
5462       }
5463 
5464       for (auto& pair : RegUsage) {
5465         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
5467         else
5468           MaxUsages[j][pair.first] = pair.second;
5469       }
5470     }
5471 
5472     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5473                       << OpenIntervals.size() << '\n');
5474 
5475     // Add the current instruction to the list of open intervals.
5476     OpenIntervals.insert(I);
5477   }
5478 
5479   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5480     SmallMapVector<unsigned, unsigned, 4> Invariant;
5481 
5482     for (auto Inst : LoopInvariants) {
5483       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5485       if (Invariant.find(ClassID) == Invariant.end())
5486         Invariant[ClassID] = Usage;
5487       else
5488         Invariant[ClassID] += Usage;
5489     }
5490 
5491     LLVM_DEBUG({
5492       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5493       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5494              << " item\n";
5495       for (const auto &pair : MaxUsages[i]) {
5496         dbgs() << "LV(REG): RegisterClass: "
5497                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5498                << " registers\n";
5499       }
5500       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5501              << " item\n";
5502       for (const auto &pair : Invariant) {
5503         dbgs() << "LV(REG): RegisterClass: "
5504                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5505                << " registers\n";
5506       }
5507     });
5508 
5509     RU.LoopInvariantRegs = Invariant;
5510     RU.MaxLocalUsers = MaxUsages[i];
5511     RUs[i] = RU;
5512   }
5513 
5514   return RUs;
5515 }
5516 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5518   // TODO: Cost model for emulated masked load/store is completely
5519   // broken. This hack guides the cost model to use an artificially
5520   // high enough value to practically disable vectorization with such
5521   // operations, except where previously deployed legality hack allowed
5522   // using very low cost values. This is to avoid regressions coming simply
5523   // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Only a limited amount of masked Store/Scatter emulation was allowed.
5526   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5527   return isa<LoadInst>(I) ||
5528          (isa<StoreInst>(I) &&
5529           NumPredStores > NumberOfStoresToPredicate);
5530 }
5531 
5532 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5533   // If we aren't vectorizing the loop, or if we've already collected the
5534   // instructions to scalarize, there's nothing to do. Collection may already
5535   // have occurred if we have a user-selected VF and are now computing the
5536   // expected cost for interleaving.
5537   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5538     return;
5539 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5541   // not profitable to scalarize any instructions, the presence of VF in the
5542   // map will indicate that we've analyzed it already.
5543   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5544 
5545   // Find all the instructions that are scalar with predication in the loop and
5546   // determine if it would be better to not if-convert the blocks they are in.
5547   // If so, we also record the instructions to scalarize.
5548   for (BasicBlock *BB : TheLoop->blocks()) {
5549     if (!blockNeedsPredication(BB))
5550       continue;
5551     for (Instruction &I : *BB)
5552       if (isScalarWithPredication(&I)) {
5553         ScalarCostsTy ScalarCosts;
        // Do not apply the discount logic if the hacked cost is needed
        // for emulated masked memrefs.
5556         if (!useEmulatedMaskMemRefHack(&I) &&
5557             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5558           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5559         // Remember that BB will remain after vectorization.
5560         PredicatedBBsAfterVectorization.insert(BB);
5561       }
5562   }
5563 }
5564 
5565 int LoopVectorizationCostModel::computePredInstDiscount(
5566     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5567     unsigned VF) {
5568   assert(!isUniformAfterVectorization(PredInst, VF) &&
5569          "Instruction marked uniform-after-vectorization will be predicated");
5570 
5571   // Initialize the discount to zero, meaning that the scalar version and the
5572   // vector version cost the same.
5573   int Discount = 0;
5574 
5575   // Holds instructions to analyze. The instructions we visit are mapped in
5576   // ScalarCosts. Those instructions are the ones that would be scalarized if
5577   // we find that the scalar version costs less.
5578   SmallVector<Instruction *, 8> Worklist;
5579 
5580   // Returns true if the given instruction can be scalarized.
5581   auto canBeScalarized = [&](Instruction *I) -> bool {
5582     // We only attempt to scalarize instructions forming a single-use chain
5583     // from the original predicated block that would otherwise be vectorized.
5584     // Although not strictly necessary, we give up on instructions we know will
5585     // already be scalar to avoid traversing chains that are unlikely to be
5586     // beneficial.
5587     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5588         isScalarAfterVectorization(I, VF))
5589       return false;
5590 
5591     // If the instruction is scalar with predication, it will be analyzed
5592     // separately. We ignore it within the context of PredInst.
5593     if (isScalarWithPredication(I))
5594       return false;
5595 
5596     // If any of the instruction's operands are uniform after vectorization,
5597     // the instruction cannot be scalarized. This prevents, for example, a
5598     // masked load from being scalarized.
5599     //
5600     // We assume we will only emit a value for lane zero of an instruction
5601     // marked uniform after vectorization, rather than VF identical values.
5602     // Thus, if we scalarize an instruction that uses a uniform, we would
5603     // create uses of values corresponding to the lanes we aren't emitting code
5604     // for. This behavior can be changed by allowing getScalarValue to clone
5605     // the lane zero values for uniforms rather than asserting.
5606     for (Use &U : I->operands())
5607       if (auto *J = dyn_cast<Instruction>(U.get()))
5608         if (isUniformAfterVectorization(J, VF))
5609           return false;
5610 
5611     // Otherwise, we can scalarize the instruction.
5612     return true;
5613   };
5614 
5615   // Compute the expected cost discount from scalarizing the entire expression
5616   // feeding the predicated instruction. We currently only consider expressions
5617   // that are single-use instruction chains.
5618   Worklist.push_back(PredInst);
5619   while (!Worklist.empty()) {
5620     Instruction *I = Worklist.pop_back_val();
5621 
5622     // If we've already analyzed the instruction, there's nothing to do.
5623     if (ScalarCosts.find(I) != ScalarCosts.end())
5624       continue;
5625 
5626     // Compute the cost of the vector instruction. Note that this cost already
5627     // includes the scalarization overhead of the predicated instruction.
5628     unsigned VectorCost = getInstructionCost(I, VF).first;
5629 
5630     // Compute the cost of the scalarized instruction. This cost is the cost of
5631     // the instruction as if it wasn't if-converted and instead remained in the
5632     // predicated block. We will scale this cost by block probability after
5633     // computing the scalarization overhead.
5634     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5635 
5636     // Compute the scalarization overhead of needed insertelement instructions
5637     // and phi nodes.
5638     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5639       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5640                                                  true, false);
5641       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5642     }
5643 
5644     // Compute the scalarization overhead of needed extractelement
5645     // instructions. For each of the instruction's operands, if the operand can
5646     // be scalarized, add it to the worklist; otherwise, account for the
5647     // overhead.
5648     for (Use &U : I->operands())
5649       if (auto *J = dyn_cast<Instruction>(U.get())) {
5650         assert(VectorType::isValidElementType(J->getType()) &&
5651                "Instruction has non-scalar type");
5652         if (canBeScalarized(J))
5653           Worklist.push_back(J);
5654         else if (needsExtract(J, VF))
5655           ScalarCost += TTI.getScalarizationOverhead(
5656                               ToVectorTy(J->getType(),VF), false, true);
5657       }
5658 
5659     // Scale the total scalar cost by block probability.
5660     ScalarCost /= getReciprocalPredBlockProb();
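    // For example, assuming the default reciprocal block probability of 2
    // (i.e. the predicated block is expected to execute on half of the
    // iterations), a summed scalar cost of 32 becomes 16 here.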
5661 
5662     // Compute the discount. A non-negative discount means the vector version
5663     // of the instruction costs more, and scalarizing would be beneficial.
5664     Discount += VectorCost - ScalarCost;
5665     ScalarCosts[I] = ScalarCost;
5666   }
5667 
5668   return Discount;
5669 }
5670 
5671 LoopVectorizationCostModel::VectorizationCostTy
5672 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5673   VectorizationCostTy Cost;
5674 
5675   // For each block.
5676   for (BasicBlock *BB : TheLoop->blocks()) {
5677     VectorizationCostTy BlockCost;
5678 
5679     // For each instruction in the old loop.
5680     for (Instruction &I : BB->instructionsWithoutDebug()) {
5681       // Skip ignored values.
5682       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5683           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5684         continue;
5685 
5686       VectorizationCostTy C = getInstructionCost(&I, VF);
5687 
5688       // Check if we should override the cost.
5689       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5690         C.first = ForceTargetInstructionCost;
5691 
5692       BlockCost.first += C.first;
5693       BlockCost.second |= C.second;
5694       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5695                         << " for VF " << VF << " For instruction: " << I
5696                         << '\n');
5697     }
5698 
5699     // If we are vectorizing a predicated block, it will have been
5700     // if-converted. This means that the block's instructions (aside from
5701     // stores and instructions that may divide by zero) will now be
5702     // unconditionally executed. For the scalar case, we may not always execute
5703     // the predicated block. Thus, scale the block's cost by the probability of
5704     // executing it.
5705     if (VF == 1 && blockNeedsPredication(BB))
5706       BlockCost.first /= getReciprocalPredBlockProb();
5707 
5708     Cost.first += BlockCost.first;
5709     Cost.second |= BlockCost.second;
5710   }
5711 
5712   return Cost;
5713 }
5714 
5715 /// Gets Address Access SCEV after verifying that the access pattern
5716 /// is loop invariant except the induction variable dependence.
5717 ///
5718 /// This SCEV can be sent to the Target in order to estimate the address
5719 /// calculation cost.
5720 static const SCEV *getAddressAccessSCEV(
5721               Value *Ptr,
5722               LoopVectorizationLegality *Legal,
5723               PredicatedScalarEvolution &PSE,
5724               const Loop *TheLoop) {
5725 
5726   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5727   if (!Gep)
5728     return nullptr;
5729 
5730   // We are looking for a gep with all loop invariant indices except for one
5731   // which should be an induction variable.
5732   auto SE = PSE.getSE();
5733   unsigned NumOperands = Gep->getNumOperands();
5734   for (unsigned i = 1; i < NumOperands; ++i) {
5735     Value *Opd = Gep->getOperand(i);
5736     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5737         !Legal->isInductionVariable(Opd))
5738       return nullptr;
5739   }
5740 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5742   return PSE.getSCEV(Ptr);
5743 }
5744 
5745 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5746   return Legal->hasStride(I->getOperand(0)) ||
5747          Legal->hasStride(I->getOperand(1));
5748 }
5749 
5750 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5751                                                                  unsigned VF) {
5752   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5753   Type *ValTy = getMemInstValueType(I);
5754   auto SE = PSE.getSE();
5755 
5756   unsigned AS = getLoadStoreAddressSpace(I);
5757   Value *Ptr = getLoadStorePointerOperand(I);
5758   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5759 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5762   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5763 
5764   // Get the cost of the scalar memory instruction and address computation.
5765   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5766 
5767   // Don't pass *I here, since it is scalar but will actually be part of a
5768   // vectorized loop where the user of it is a vectorized instruction.
5769   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5770   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5771                                    Alignment, AS);
5772 
5773   // Get the overhead of the extractelement and insertelement instructions
5774   // we might create due to scalarization.
5775   Cost += getScalarizationOverhead(I, VF);
5776 
5777   // If we have a predicated store, it may not be executed for each vector
5778   // lane. Scale the cost by the probability of executing the predicated
5779   // block.
5780   if (isPredicatedInst(I)) {
5781     Cost /= getReciprocalPredBlockProb();
5782 
5783     if (useEmulatedMaskMemRefHack(I))
5784       // Artificially setting to a high enough value to practically disable
5785       // vectorization with such operations.
5786       Cost = 3000000;
5787   }
5788 
5789   return Cost;
5790 }
5791 
5792 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5793                                                              unsigned VF) {
5794   Type *ValTy = getMemInstValueType(I);
5795   Type *VectorTy = ToVectorTy(ValTy, VF);
5796   Value *Ptr = getLoadStorePointerOperand(I);
5797   unsigned AS = getLoadStoreAddressSpace(I);
5798   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5799 
5800   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5801          "Stride should be 1 or -1 for consecutive memory access");
5802   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5803   unsigned Cost = 0;
5804   if (Legal->isMaskRequired(I))
5805     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5806                                       Alignment ? Alignment->value() : 0, AS);
5807   else
5808     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5809 
5810   bool Reverse = ConsecutiveStride < 0;
5811   if (Reverse)
5812     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5813   return Cost;
5814 }
5815 
5816 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5817                                                          unsigned VF) {
5818   Type *ValTy = getMemInstValueType(I);
5819   Type *VectorTy = ToVectorTy(ValTy, VF);
5820   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5821   unsigned AS = getLoadStoreAddressSpace(I);
5822   if (isa<LoadInst>(I)) {
5823     return TTI.getAddressComputationCost(ValTy) +
5824            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5825            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5826   }
5827   StoreInst *SI = cast<StoreInst>(I);
5828 
5829   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5830   return TTI.getAddressComputationCost(ValTy) +
5831          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5832          (isLoopInvariantStoreValue
5833               ? 0
5834               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5835                                        VF - 1));
5836 }
5837 
5838 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5839                                                           unsigned VF) {
5840   Type *ValTy = getMemInstValueType(I);
5841   Type *VectorTy = ToVectorTy(ValTy, VF);
5842   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5843   Value *Ptr = getLoadStorePointerOperand(I);
5844 
5845   return TTI.getAddressComputationCost(VectorTy) +
5846          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5847                                     Legal->isMaskRequired(I),
5848                                     Alignment ? Alignment->value() : 0);
5849 }
5850 
5851 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5852                                                             unsigned VF) {
5853   Type *ValTy = getMemInstValueType(I);
5854   Type *VectorTy = ToVectorTy(ValTy, VF);
5855   unsigned AS = getLoadStoreAddressSpace(I);
5856 
5857   auto Group = getInterleavedAccessGroup(I);
5858   assert(Group && "Fail to get an interleaved access group.");
5859 
5860   unsigned InterleaveFactor = Group->getFactor();
5861   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5862 
5863   // Holds the indices of existing members in an interleaved load group.
5864   // An interleaved store group doesn't need this as it doesn't allow gaps.
5865   SmallVector<unsigned, 4> Indices;
5866   if (isa<LoadInst>(I)) {
5867     for (unsigned i = 0; i < InterleaveFactor; i++)
5868       if (Group->getMember(i))
5869         Indices.push_back(i);
5870   }
5871 
5872   // Calculate the cost of the whole interleaved group.
5873   bool UseMaskForGaps =
5874       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5875   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5876       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5877       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5878 
5879   if (Group->isReverse()) {
5880     // TODO: Add support for reversed masked interleaved access.
5881     assert(!Legal->isMaskRequired(I) &&
5882            "Reverse masked interleaved access not supported.");
5883     Cost += Group->getNumMembers() *
5884             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5885   }
5886   return Cost;
5887 }
5888 
5889 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5890                                                               unsigned VF) {
  // Calculate the scalar cost only. The vectorization cost should already be
  // available at this point.
5893   if (VF == 1) {
5894     Type *ValTy = getMemInstValueType(I);
5895     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5896     unsigned AS = getLoadStoreAddressSpace(I);
5897 
5898     return TTI.getAddressComputationCost(ValTy) +
5899            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5900   }
5901   return getWideningCost(I, VF);
5902 }
5903 
5904 LoopVectorizationCostModel::VectorizationCostTy
5905 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5906   // If we know that this instruction will remain uniform, check the cost of
5907   // the scalar version.
5908   if (isUniformAfterVectorization(I, VF))
5909     VF = 1;
5910 
5911   if (VF > 1 && isProfitableToScalarize(I, VF))
5912     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5913 
5914   // Forced scalars do not have any scalarization overhead.
5915   auto ForcedScalar = ForcedScalars.find(VF);
5916   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5917     auto InstSet = ForcedScalar->second;
5918     if (InstSet.find(I) != InstSet.end())
5919       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5920   }
5921 
5922   Type *VectorTy;
5923   unsigned C = getInstructionCost(I, VF, VectorTy);
5924 
5925   bool TypeNotScalarized =
5926       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
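  // The second member of the returned pair records whether the type stays
  // vector after legalization: if it is split into VF or more parts, every
  // lane is effectively scalarized and no vector instructions are produced.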
5927   return VectorizationCostTy(C, TypeNotScalarized);
5928 }
5929 
5930 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5931                                                               unsigned VF) {
5932 
5933   if (VF == 1)
5934     return 0;
5935 
5936   unsigned Cost = 0;
5937   Type *RetTy = ToVectorTy(I->getType(), VF);
5938   if (!RetTy->isVoidTy() &&
5939       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5940     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5941 
5942   // Some targets keep addresses scalar.
5943   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5944     return Cost;
5945 
5946   // Some targets support efficient element stores.
5947   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5948     return Cost;
5949 
5950   // Collect operands to consider.
5951   CallInst *CI = dyn_cast<CallInst>(I);
5952   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5953 
5954   // Skip operands that do not require extraction/scalarization and do not incur
5955   // any overhead.
5956   return Cost + TTI.getOperandsScalarizationOverhead(
5957                     filterExtractingOperands(Ops, VF), VF);
5958 }
5959 
5960 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5961   if (VF == 1)
5962     return;
5963   NumPredStores = 0;
5964   for (BasicBlock *BB : TheLoop->blocks()) {
5965     // For each instruction in the old loop.
5966     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
5968       if (!Ptr)
5969         continue;
5970 
5971       // TODO: We should generate better code and update the cost model for
5972       // predicated uniform stores. Today they are treated as any other
5973       // predicated store (see added test cases in
5974       // invariant-store-vectorization.ll).
5975       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5976         NumPredStores++;
5977 
5978       if (Legal->isUniform(Ptr) &&
5979           // Conditional loads and stores should be scalarized and predicated.
5980           // isScalarWithPredication cannot be used here since masked
5981           // gather/scatters are not considered scalar with predication.
5982           !Legal->blockNeedsPredication(I.getParent())) {
5983         // TODO: Avoid replicating loads and stores instead of
5984         // relying on instcombine to remove them.
5985         // Load: Scalar load + broadcast
5986         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5987         unsigned Cost = getUniformMemOpCost(&I, VF);
5988         setWideningDecision(&I, VF, CM_Scalarize, Cost);
5989         continue;
5990       }
5991 
5992       // We assume that widening is the best solution when possible.
5993       if (memoryInstructionCanBeWidened(&I, VF)) {
5994         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5995         int ConsecutiveStride =
5996                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5997         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5998                "Expected consecutive stride.");
5999         InstWidening Decision =
6000             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6001         setWideningDecision(&I, VF, Decision, Cost);
6002         continue;
6003       }
6004 
6005       // Choose between Interleaving, Gather/Scatter or Scalarization.
6006       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6007       unsigned NumAccesses = 1;
6008       if (isAccessInterleaved(&I)) {
6009         auto Group = getInterleavedAccessGroup(&I);
6010         assert(Group && "Fail to get an interleaved access group.");
6011 
6012         // Make one decision for the whole group.
6013         if (getWideningDecision(&I, VF) != CM_Unknown)
6014           continue;
6015 
6016         NumAccesses = Group->getNumMembers();
6017         if (interleavedAccessCanBeWidened(&I, VF))
6018           InterleaveCost = getInterleaveGroupCost(&I, VF);
6019       }
6020 
6021       unsigned GatherScatterCost =
6022           isLegalGatherOrScatter(&I)
6023               ? getGatherScatterCost(&I, VF) * NumAccesses
6024               : std::numeric_limits<unsigned>::max();
6025 
6026       unsigned ScalarizationCost =
6027           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6028 
6029       // Choose better solution for the current VF,
6030       // write down this decision and use it during vectorization.
6031       unsigned Cost;
6032       InstWidening Decision;
6033       if (InterleaveCost <= GatherScatterCost &&
6034           InterleaveCost < ScalarizationCost) {
6035         Decision = CM_Interleave;
6036         Cost = InterleaveCost;
6037       } else if (GatherScatterCost < ScalarizationCost) {
6038         Decision = CM_GatherScatter;
6039         Cost = GatherScatterCost;
6040       } else {
6041         Decision = CM_Scalarize;
6042         Cost = ScalarizationCost;
6043       }
      // If the instruction belongs to an interleave group, the whole group
6045       // receives the same decision. The whole group receives the cost, but
6046       // the cost will actually be assigned to one instruction.
6047       if (auto Group = getInterleavedAccessGroup(&I))
6048         setWideningDecision(Group, VF, Decision, Cost);
6049       else
6050         setWideningDecision(&I, VF, Decision, Cost);
6051     }
6052   }
6053 
6054   // Make sure that any load of address and any other address computation
6055   // remains scalar unless there is gather/scatter support. This avoids
6056   // inevitable extracts into address registers, and also has the benefit of
6057   // activating LSR more, since that pass can't optimize vectorized
6058   // addresses.
6059   if (TTI.prefersVectorizedAddressing())
6060     return;
6061 
6062   // Start with all scalar pointer uses.
6063   SmallPtrSet<Instruction *, 8> AddrDefs;
6064   for (BasicBlock *BB : TheLoop->blocks())
6065     for (Instruction &I : *BB) {
6066       Instruction *PtrDef =
6067         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6068       if (PtrDef && TheLoop->contains(PtrDef) &&
6069           getWideningDecision(&I, VF) != CM_GatherScatter)
6070         AddrDefs.insert(PtrDef);
6071     }
6072 
6073   // Add all instructions used to generate the addresses.
6074   SmallVector<Instruction *, 4> Worklist;
6075   for (auto *I : AddrDefs)
6076     Worklist.push_back(I);
6077   while (!Worklist.empty()) {
6078     Instruction *I = Worklist.pop_back_val();
6079     for (auto &Op : I->operands())
6080       if (auto *InstOp = dyn_cast<Instruction>(Op))
6081         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6082             AddrDefs.insert(InstOp).second)
6083           Worklist.push_back(InstOp);
6084   }
6085 
6086   for (auto *I : AddrDefs) {
6087     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded register is involved in an address computation, the decision
      // is instead changed here once we know that this is the case.
6092       InstWidening Decision = getWideningDecision(I, VF);
6093       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6094         // Scalarize a widened load of address.
6095         setWideningDecision(I, VF, CM_Scalarize,
6096                             (VF * getMemoryInstructionCost(I, 1)));
6097       else if (auto Group = getInterleavedAccessGroup(I)) {
6098         // Scalarize an interleave group of address loads.
6099         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6100           if (Instruction *Member = Group->getMember(I))
6101             setWideningDecision(Member, VF, CM_Scalarize,
6102                                 (VF * getMemoryInstructionCost(Member, 1)));
6103         }
6104       }
6105     } else
6106       // Make sure I gets scalarized and a cost estimate without
6107       // scalarization overhead.
6108       ForcedScalars[VF].insert(I);
6109   }
6110 }
6111 
6112 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6113                                                         unsigned VF,
6114                                                         Type *&VectorTy) {
6115   Type *RetTy = I->getType();
6116   if (canTruncateToMinimalBitwidth(I, VF))
6117     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6118   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6119   auto SE = PSE.getSE();
6120 
6121   // TODO: We need to estimate the cost of intrinsic calls.
6122   switch (I->getOpcode()) {
6123   case Instruction::GetElementPtr:
6124     // We mark this instruction as zero-cost because the cost of GEPs in
6125     // vectorized code depends on whether the corresponding memory instruction
6126     // is scalarized or not. Therefore, we handle GEPs with the memory
6127     // instruction cost.
6128     return 0;
6129   case Instruction::Br: {
6130     // In cases of scalarized and predicated instructions, there will be VF
6131     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6133     bool ScalarPredicatedBB = false;
6134     BranchInst *BI = cast<BranchInst>(I);
6135     if (VF > 1 && BI->isConditional() &&
6136         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6137              PredicatedBBsAfterVectorization.end() ||
6138          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6139              PredicatedBBsAfterVectorization.end()))
6140       ScalarPredicatedBB = true;
6141 
6142     if (ScalarPredicatedBB) {
6143       // Return cost for branches around scalarized and predicated blocks.
6144       Type *Vec_i1Ty =
6145           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6146       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6147               (TTI.getCFInstrCost(Instruction::Br) * VF));
6148     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6149       // The back-edge branch will remain, as will all scalar branches.
6150       return TTI.getCFInstrCost(Instruction::Br);
6151     else
6152       // This branch will be eliminated by if-conversion.
6153       return 0;
6154     // Note: We currently assume zero cost for an unconditional branch inside
6155     // a predicated block since it will become a fall-through, although we
6156     // may decide in the future to call TTI for all branches.
6157   }
6158   case Instruction::PHI: {
6159     auto *Phi = cast<PHINode>(I);
6160 
6161     // First-order recurrences are replaced by vector shuffles inside the loop.
6162     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6163     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6164       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6165                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6166 
6167     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6168     // converted into select instructions. We require N - 1 selects per phi
6169     // node, where N is the number of incoming values.
6170     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6171       return (Phi->getNumIncomingValues() - 1) *
6172              TTI.getCmpSelInstrCost(
6173                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6174                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6175 
6176     return TTI.getCFInstrCost(Instruction::PHI);
6177   }
6178   case Instruction::UDiv:
6179   case Instruction::SDiv:
6180   case Instruction::URem:
6181   case Instruction::SRem:
6182     // If we have a predicated instruction, it may not be executed for each
6183     // vector lane. Get the scalarization cost and scale this amount by the
6184     // probability of executing the predicated block. If the instruction is not
6185     // predicated, we fall through to the next case.
6186     if (VF > 1 && isScalarWithPredication(I)) {
6187       unsigned Cost = 0;
6188 
6189       // These instructions have a non-void type, so account for the phi nodes
6190       // that we will create. This cost is likely to be zero. The phi node
6191       // cost, if any, should be scaled by the block probability because it
6192       // models a copy at the end of each predicated block.
6193       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6194 
6195       // The cost of the non-predicated instruction.
6196       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6197 
6198       // The cost of insertelement and extractelement instructions needed for
6199       // scalarization.
6200       Cost += getScalarizationOverhead(I, VF);
6201 
6202       // Scale the cost by the probability of executing the predicated blocks.
6203       // This assumes the predicated block for each vector lane is equally
6204       // likely.
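      // For example, if getReciprocalPredBlockProb() is 2 (i.e., a predicated
      // block is assumed to execute on every other iteration), the accumulated
      // cost above is halved.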
6205       return Cost / getReciprocalPredBlockProb();
6206     }
6207     LLVM_FALLTHROUGH;
6208   case Instruction::Add:
6209   case Instruction::FAdd:
6210   case Instruction::Sub:
6211   case Instruction::FSub:
6212   case Instruction::Mul:
6213   case Instruction::FMul:
6214   case Instruction::FDiv:
6215   case Instruction::FRem:
6216   case Instruction::Shl:
6217   case Instruction::LShr:
6218   case Instruction::AShr:
6219   case Instruction::And:
6220   case Instruction::Or:
6221   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
6223     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6224       return 0;
6225     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6227     Value *Op2 = I->getOperand(1);
6228     TargetTransformInfo::OperandValueProperties Op2VP;
6229     TargetTransformInfo::OperandValueKind Op2VK =
6230         TTI.getOperandInfo(Op2, Op2VP);
6231     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6232       Op2VK = TargetTransformInfo::OK_UniformValue;
6233 
6234     SmallVector<const Value *, 4> Operands(I->operand_values());
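    // If the instruction remains scalar after vectorization, it is replicated
    // VF times, so scale the per-instance cost by N = VF; note that in this
    // case VectorTy is the scalar type (see the top of this function).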
6235     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6236     return N * TTI.getArithmeticInstrCost(
6237                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6238                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6239   }
6240   case Instruction::FNeg: {
6241     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6242     return N * TTI.getArithmeticInstrCost(
6243                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6244                    TargetTransformInfo::OK_AnyValue,
6245                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6246                    I->getOperand(0), I);
6247   }
6248   case Instruction::Select: {
6249     SelectInst *SI = cast<SelectInst>(I);
6250     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6251     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6252     Type *CondTy = SI->getCondition()->getType();
6253     if (!ScalarCond)
6254       CondTy = VectorType::get(CondTy, VF);
6255 
6256     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6257   }
6258   case Instruction::ICmp:
6259   case Instruction::FCmp: {
6260     Type *ValTy = I->getOperand(0)->getType();
6261     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6262     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6263       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6264     VectorTy = ToVectorTy(ValTy, VF);
6265     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6266   }
6267   case Instruction::Store:
6268   case Instruction::Load: {
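    // Use the widening decision taken earlier: an access that will be
    // scalarized is costed with its scalar value type (Width == 1).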
6269     unsigned Width = VF;
6270     if (Width > 1) {
6271       InstWidening Decision = getWideningDecision(I, Width);
6272       assert(Decision != CM_Unknown &&
6273              "CM decision should be taken at this point");
6274       if (Decision == CM_Scalarize)
6275         Width = 1;
6276     }
6277     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6278     return getMemoryInstructionCost(I, VF);
6279   }
6280   case Instruction::ZExt:
6281   case Instruction::SExt:
6282   case Instruction::FPToUI:
6283   case Instruction::FPToSI:
6284   case Instruction::FPExt:
6285   case Instruction::PtrToInt:
6286   case Instruction::IntToPtr:
6287   case Instruction::SIToFP:
6288   case Instruction::UIToFP:
6289   case Instruction::Trunc:
6290   case Instruction::FPTrunc:
6291   case Instruction::BitCast: {
6292     // We optimize the truncation of induction variables having constant
6293     // integer steps. The cost of these truncations is the same as the scalar
6294     // operation.
6295     if (isOptimizableIVTruncate(I, VF)) {
6296       auto *Trunc = cast<TruncInst>(I);
6297       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6298                                   Trunc->getSrcTy(), Trunc);
6299     }
6300 
6301     Type *SrcScalarTy = I->getOperand(0)->getType();
6302     Type *SrcVecTy =
6303         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6304     if (canTruncateToMinimalBitwidth(I, VF)) {
6305       // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
6307       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6308       //
6309       // Calculate the modified src and dest types.
6310       Type *MinVecTy = VectorTy;
6311       if (I->getOpcode() == Instruction::Trunc) {
6312         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6313         VectorTy =
6314             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6315       } else if (I->getOpcode() == Instruction::ZExt ||
6316                  I->getOpcode() == Instruction::SExt) {
6317         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6318         VectorTy =
6319             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6320       }
6321     }
6322 
6323     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6324     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6325   }
6326   case Instruction::Call: {
6327     bool NeedToScalarize;
6328     CallInst *CI = cast<CallInst>(I);
6329     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6330     if (getVectorIntrinsicIDForCall(CI, TLI))
6331       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6332     return CallCost;
6333   }
6334   default:
6335     // The cost of executing VF copies of the scalar instruction. This opcode
6336     // is unknown. Assume that it is the same as 'mul'.
6337     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6338            getScalarizationOverhead(I, VF);
6339   } // end of switch.
6340 }
6341 
6342 char LoopVectorize::ID = 0;
6343 
6344 static const char lv_name[] = "Loop Vectorization";
6345 
6346 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6347 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6348 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6349 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6350 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6351 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6352 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6353 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6354 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6355 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6356 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6357 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6358 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6359 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6360 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6361 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6362 
6363 namespace llvm {
6364 
6365 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6366 
6367 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6368                               bool VectorizeOnlyWhenForced) {
6369   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6370 }
6371 
6372 } // end namespace llvm
6373 
6374 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6375   // Check if the pointer operand of a load or store instruction is
6376   // consecutive.
6377   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6378     return Legal->isConsecutivePtr(Ptr);
6379   return false;
6380 }
6381 
6382 void LoopVectorizationCostModel::collectValuesToIgnore() {
6383   // Ignore ephemeral values.
6384   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6385 
6386   // Ignore type-promoting instructions we identified during reduction
6387   // detection.
6388   for (auto &Reduction : *Legal->getReductionVars()) {
6389     RecurrenceDescriptor &RedDes = Reduction.second;
6390     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6391     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6392   }
6393   // Ignore type-casting instructions we identified during induction
6394   // detection.
6395   for (auto &Induction : *Legal->getInductionVars()) {
6396     InductionDescriptor &IndDes = Induction.second;
6397     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6398     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6399   }
6400 }
6401 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do this yet because VPlan does not
// currently have a cost model that can choose which plan to execute when
// more than one is generated.
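// For example, with 256-bit wide vector registers and a widest scalar type of
// 32 bits, determineVPlanVF below returns 256 / 32 = 8.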
6407 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6408                                  LoopVectorizationCostModel &CM) {
6409   unsigned WidestType;
6410   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6411   return WidestVectorRegBits / WidestType;
6412 }
6413 
6414 VectorizationFactor
6415 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6416   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
6419   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6420   // the vectorization pipeline.
6421   if (!OrigLoop->empty()) {
6422     // If the user doesn't provide a vectorization factor, determine a
6423     // reasonable one.
6424     if (!UserVF) {
6425       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6426       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6427 
6428       // Make sure we have a VF > 1 for stress testing.
6429       if (VPlanBuildStressTest && VF < 2) {
6430         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6431                           << "overriding computed VF.\n");
6432         VF = 4;
6433       }
6434     }
6435     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6436     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6437     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6438                       << " to build VPlans.\n");
6439     buildVPlans(VF, VF);
6440 
6441     // For VPlan build stress testing, we bail out after VPlan construction.
6442     if (VPlanBuildStressTest)
6443       return VectorizationFactor::Disabled();
6444 
6445     return {VF, 0};
6446   }
6447 
6448   LLVM_DEBUG(
6449       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6450                 "VPlan-native path.\n");
6451   return VectorizationFactor::Disabled();
6452 }
6453 
6454 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6455   assert(OrigLoop->empty() && "Inner loop expected.");
6456   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6458     return None;
6459 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
6461   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6462       !useMaskedInterleavedAccesses(*TTI)) {
6463     LLVM_DEBUG(
6464         dbgs()
6465         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6466            "which requires masked-interleaved support.\n");
6467     CM.InterleaveInfo.reset();
6468   }
6469 
6470   if (UserVF) {
6471     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6472     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6473     // Collect the instructions (and their associated costs) that will be more
6474     // profitable to scalarize.
6475     CM.selectUserVectorizationFactor(UserVF);
6476     buildVPlansWithVPRecipes(UserVF, UserVF);
6477     LLVM_DEBUG(printPlans(dbgs()));
6478     return {{UserVF, 0}};
6479   }
6480 
6481   unsigned MaxVF = MaybeMaxVF.getValue();
6482   assert(MaxVF != 0 && "MaxVF is zero.");
6483 
6484   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6485     // Collect Uniform and Scalar instructions after vectorization with VF.
6486     CM.collectUniformsAndScalars(VF);
6487 
6488     // Collect the instructions (and their associated costs) that will be more
6489     // profitable to scalarize.
6490     if (VF > 1)
6491       CM.collectInstsToScalarize(VF);
6492   }
6493 
6494   buildVPlansWithVPRecipes(1, MaxVF);
6495   LLVM_DEBUG(printPlans(dbgs()));
6496   if (MaxVF == 1)
6497     return VectorizationFactor::Disabled();
6498 
6499   // Select the optimal vectorization factor.
6500   return CM.selectVectorizationFactor(MaxVF);
6501 }
6502 
6503 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6504   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6505                     << '\n');
6506   BestVF = VF;
6507   BestUF = UF;
6508 
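  // Discard all VPlans that do not cover the chosen VF; exactly one plan is
  // expected to remain.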
6509   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6510     return !Plan->hasVF(VF);
6511   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6513 }
6514 
6515 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6516                                            DominatorTree *DT) {
6517   // Perform the actual loop transformation.
6518 
6519   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6520   VPCallbackILV CallbackILV(ILV);
6521 
6522   VPTransformState State{BestVF, BestUF,      LI,
6523                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6524                          &ILV,   CallbackILV};
6525   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6526   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6527 
6528   //===------------------------------------------------===//
6529   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost model.
6533   //
6534   //===------------------------------------------------===//
6535 
6536   // 2. Copy and widen instructions from the old loop into the new loop.
6537   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6538   VPlans.front()->execute(&State);
6539 
6540   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6541   //    predication, updating analyses.
6542   ILV.fixVectorizedLoop();
6543 }
6544 
6545 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6546     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6547   BasicBlock *Latch = OrigLoop->getLoopLatch();
6548 
6549   // We create new control-flow for the vectorized loop, so the original
6550   // condition will be dead after vectorization if it's only used by the
6551   // branch.
6552   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6553   if (Cmp && Cmp->hasOneUse())
6554     DeadInstructions.insert(Cmp);
6555 
6556   // We create new "steps" for induction variable updates to which the original
6557   // induction variables map. An original update instruction will be dead if
6558   // all its users except the induction variable are dead.
6559   for (auto &Induction : *Legal->getInductionVars()) {
6560     PHINode *Ind = Induction.first;
6561     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6562     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6563           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6564                                  DeadInstructions.end();
6565         }))
6566       DeadInstructions.insert(IndUpdate);
6567 
    // We also record as "Dead" the type-casting instructions we identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi and the casted
    // value of the phi are the same. The last instruction in this casting chain
6573     // will get its scalar/vector/widened def from the scalar/vector/widened def
6574     // of the respective phi node. Any other casts in the induction def-use chain
6575     // have no other uses outside the phi update chain, and will be ignored.
6576     InductionDescriptor &IndDes = Induction.second;
6577     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6578     DeadInstructions.insert(Casts.begin(), Casts.end());
6579   }
6580 }
6581 
6582 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6583 
6584 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6585 
6586 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6587                                         Instruction::BinaryOps BinOp) {
6588   // When unrolling and the VF is 1, we only need to add a simple scalar.
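  // The result is Val + StartIdx * Step for integer inductions, and
  // Val BinOp (StartIdx * Step) for floating-point inductions.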
6589   Type *Ty = Val->getType();
6590   assert(!Ty->isVectorTy() && "Val must be a scalar");
6591 
6592   if (Ty->isFloatingPointTy()) {
6593     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6594 
6595     // Floating point operations had to be 'fast' to enable the unrolling.
6596     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6597     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6598   }
6599   Constant *C = ConstantInt::get(Ty, StartIdx);
6600   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6601 }
6602 
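// If the loop does not already carry loop-unroll-disable metadata, append
// "llvm.loop.unroll.runtime.disable" to its loop ID so that runtime unrolling
// of this loop is disabled. The appended operand has the form:
//   !{!"llvm.loop.unroll.runtime.disable"}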
6603 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6604   SmallVector<Metadata *, 4> MDs;
6605   // Reserve first location for self reference to the LoopID metadata node.
6606   MDs.push_back(nullptr);
6607   bool IsUnrollMetadata = false;
6608   MDNode *LoopID = L->getLoopID();
6609   if (LoopID) {
6610     // First find existing loop unrolling disable metadata.
6611     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6612       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6613       if (MD) {
6614         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6615         IsUnrollMetadata =
6616             S && S->getString().startswith("llvm.loop.unroll.disable");
6617       }
6618       MDs.push_back(LoopID->getOperand(i));
6619     }
6620   }
6621 
6622   if (!IsUnrollMetadata) {
6623     // Add runtime unroll disable metadata.
6624     LLVMContext &Context = L->getHeader()->getContext();
6625     SmallVector<Metadata *, 1> DisableOperands;
6626     DisableOperands.push_back(
6627         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6628     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6629     MDs.push_back(DisableNode);
6630     MDNode *NewLoopID = MDNode::get(Context, MDs);
6631     // Set operand 0 to refer to the loop id itself.
6632     NewLoopID->replaceOperandWith(0, NewLoopID);
6633     L->setLoopID(NewLoopID);
6634   }
6635 }
6636 
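// Test whether Predicate yields the same result over the entire range of VFs
// [Range.Start, Range.End). If not, clamp Range.End to the first VF at which
// the result changes, and return the result at Range.Start. For example, with
// Range = {1, 16} and a predicate that holds only for VF <= 4, Range.End is
// clamped to 8 and true is returned.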
6637 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6638     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6639   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6640   bool PredicateAtRangeStart = Predicate(Range.Start);
6641 
6642   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6643     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6644       Range.End = TmpVF;
6645       break;
6646     }
6647 
6648   return PredicateAtRangeStart;
6649 }
6650 
6651 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6652 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6653 /// of VF's starting at a given VF and extending it as much as possible. Each
6654 /// vectorization decision can potentially shorten this sub-range during
6655 /// buildVPlan().
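/// For example, with MinVF = 1 and MaxVF = 8 the first sub-range is [1, 9);
/// if buildVPlan() clamps it to [1, 4), the resulting VPlan covers VFs 1 and
/// 2, and the next VPlan is built for a sub-range starting at VF = 4.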
6656 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6657   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6658     VFRange SubRange = {VF, MaxVF + 1};
6659     VPlans.push_back(buildVPlan(SubRange));
6660     VF = SubRange.End;
6661   }
6662 }
6663 
6664 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6665                                          VPlanPtr &Plan) {
6666   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6667 
6668   // Look for cached value.
6669   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6670   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6671   if (ECEntryIt != EdgeMaskCache.end())
6672     return ECEntryIt->second;
6673 
6674   VPValue *SrcMask = createBlockInMask(Src, Plan);
6675 
6676   // The terminator has to be a branch inst!
6677   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6678   assert(BI && "Unexpected terminator found");
6679 
6680   if (!BI->isConditional())
6681     return EdgeMaskCache[Edge] = SrcMask;
6682 
6683   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6684   assert(EdgeMask && "No Edge Mask found for condition");
6685 
6686   if (BI->getSuccessor(0) != Dst)
6687     EdgeMask = Builder.createNot(EdgeMask);
6688 
6689   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6690     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6691 
6692   return EdgeMaskCache[Edge] = EdgeMask;
6693 }
6694 
6695 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6696   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6697 
6698   // Look for cached value.
6699   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6700   if (BCEntryIt != BlockMaskCache.end())
6701     return BCEntryIt->second;
6702 
6703   // All-one mask is modelled as no-mask following the convention for masked
6704   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6705   VPValue *BlockMask = nullptr;
6706 
6707   if (OrigLoop->getHeader() == BB) {
6708     if (!CM.blockNeedsPredication(BB))
6709       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6710 
6711     // Introduce the early-exit compare IV <= BTC to form header block mask.
6712     // This is used instead of IV < TC because TC may wrap, unlike BTC.
6713     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6714     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6715     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6716     return BlockMaskCache[BB] = BlockMask;
6717   }
6718 
  // This is the block mask. We OR all incoming edge masks.
6720   for (auto *Predecessor : predecessors(BB)) {
6721     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6722     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6723       return BlockMaskCache[BB] = EdgeMask;
6724 
    if (!BlockMask) { // BlockMask still has its initial nullptr value.
6726       BlockMask = EdgeMask;
6727       continue;
6728     }
6729 
6730     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6731   }
6732 
6733   return BlockMaskCache[BB] = BlockMask;
6734 }
6735 
6736 VPWidenMemoryInstructionRecipe *
6737 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6738                                   VPlanPtr &Plan) {
6739   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6740     return nullptr;
6741 
6742   auto willWiden = [&](unsigned VF) -> bool {
6743     if (VF == 1)
6744       return false;
6745     LoopVectorizationCostModel::InstWidening Decision =
6746         CM.getWideningDecision(I, VF);
6747     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6748            "CM decision should be taken at this point.");
6749     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6750       return true;
6751     if (CM.isScalarAfterVectorization(I, VF) ||
6752         CM.isProfitableToScalarize(I, VF))
6753       return false;
6754     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6755   };
6756 
6757   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6758     return nullptr;
6759 
6760   VPValue *Mask = nullptr;
6761   if (Legal->isMaskRequired(I))
6762     Mask = createBlockInMask(I->getParent(), Plan);
6763 
6764   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6765 }
6766 
6767 VPWidenIntOrFpInductionRecipe *
6768 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6769   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6770     // Check if this is an integer or fp induction. If so, build the recipe that
6771     // produces its scalar and vector values.
6772     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6773     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6774         II.getKind() == InductionDescriptor::IK_FpInduction)
6775       return new VPWidenIntOrFpInductionRecipe(Phi);
6776 
6777     return nullptr;
6778   }
6779 
6780   // Optimize the special case where the source is a constant integer
6781   // induction variable. Notice that we can only optimize the 'trunc' case
6782   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6783   // (c) other casts depend on pointer size.
6784 
6785   // Determine whether \p K is a truncation based on an induction variable that
6786   // can be optimized.
6787   auto isOptimizableIVTruncate =
6788       [&](Instruction *K) -> std::function<bool(unsigned)> {
6789     return
6790         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6791   };
6792 
6793   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6794                                isOptimizableIVTruncate(I), Range))
6795     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6796                                              cast<TruncInst>(I));
6797   return nullptr;
6798 }
6799 
6800 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6801   PHINode *Phi = dyn_cast<PHINode>(I);
6802   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6803     return nullptr;
6804 
6805   // We know that all PHIs in non-header blocks are converted into selects, so
6806   // we don't have to worry about the insertion order and we can just use the
6807   // builder. At this point we generate the predication tree. There may be
6808   // duplications since this is a simple recursive scan, but future
6809   // optimizations will clean it up.
6810 
6811   SmallVector<VPValue *, 2> Masks;
6812   unsigned NumIncoming = Phi->getNumIncomingValues();
6813   for (unsigned In = 0; In < NumIncoming; In++) {
6814     VPValue *EdgeMask =
6815       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6816     assert((EdgeMask || NumIncoming == 1) &&
6817            "Multiple predecessors with one having a full mask");
6818     if (EdgeMask)
6819       Masks.push_back(EdgeMask);
6820   }
6821   return new VPBlendRecipe(Phi, Masks);
6822 }
6823 
6824 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6825                                  VFRange &Range) {
6826 
6827   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6828       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6829 
6830   if (IsPredicated)
6831     return false;
6832 
6833   auto IsVectorizableOpcode = [](unsigned Opcode) {
6834     switch (Opcode) {
6835     case Instruction::Add:
6836     case Instruction::And:
6837     case Instruction::AShr:
6838     case Instruction::BitCast:
6839     case Instruction::Br:
6840     case Instruction::Call:
6841     case Instruction::FAdd:
6842     case Instruction::FCmp:
6843     case Instruction::FDiv:
6844     case Instruction::FMul:
6845     case Instruction::FNeg:
6846     case Instruction::FPExt:
6847     case Instruction::FPToSI:
6848     case Instruction::FPToUI:
6849     case Instruction::FPTrunc:
6850     case Instruction::FRem:
6851     case Instruction::FSub:
6852     case Instruction::ICmp:
6853     case Instruction::IntToPtr:
6854     case Instruction::Load:
6855     case Instruction::LShr:
6856     case Instruction::Mul:
6857     case Instruction::Or:
6858     case Instruction::PHI:
6859     case Instruction::PtrToInt:
6860     case Instruction::SDiv:
6861     case Instruction::Select:
6862     case Instruction::SExt:
6863     case Instruction::Shl:
6864     case Instruction::SIToFP:
6865     case Instruction::SRem:
6866     case Instruction::Store:
6867     case Instruction::Sub:
6868     case Instruction::Trunc:
6869     case Instruction::UDiv:
6870     case Instruction::UIToFP:
6871     case Instruction::URem:
6872     case Instruction::Xor:
6873     case Instruction::ZExt:
6874       return true;
6875     }
6876     return false;
6877   };
6878 
6879   if (!IsVectorizableOpcode(I->getOpcode()))
6880     return false;
6881 
6882   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6883     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6884     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6885                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6886       return false;
6887   }
6888 
6889   auto willWiden = [&](unsigned VF) -> bool {
6890     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6891                              CM.isProfitableToScalarize(I, VF)))
6892       return false;
6893     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6894       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6895       // The following case may be scalarized depending on the VF.
      // The flag shows whether we use an intrinsic or a regular call for the
      // vectorized version of the instruction.
      // Is it beneficial to perform the intrinsic call compared to the lib call?
6899       bool NeedToScalarize;
6900       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6901       bool UseVectorIntrinsic =
6902           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6903       return UseVectorIntrinsic || !NeedToScalarize;
6904     }
6905     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6906       assert(CM.getWideningDecision(I, VF) ==
6907                  LoopVectorizationCostModel::CM_Scalarize &&
             "Memory widening decisions should have been taken care of by now");
6909       return false;
6910     }
6911     return true;
6912   };
6913 
6914   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6915     return false;
6916   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6917   // to avoid having to split recipes later.
6918   bool IsSingleton = Ingredient2Recipe.count(I);
6919 
6920   // Success: widen this instruction.
6921 
6922   // Use the default widening recipe. We optimize the common case where
6923   // consecutive instructions can be represented by a single recipe.
6924   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6925       LastExtensibleRecipe->appendInstruction(I))
6926     return true;
6927 
6928   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6929   if (!IsSingleton)
6930     LastExtensibleRecipe = WidenRecipe;
6931   setRecipe(I, WidenRecipe);
6932   VPBB->appendRecipe(WidenRecipe);
6933   return true;
6934 }
6935 
6936 VPBasicBlock *VPRecipeBuilder::handleReplication(
6937     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6938     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6939     VPlanPtr &Plan) {
6940   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6941       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6942       Range);
6943 
6944   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6945       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6946 
6947   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6948   setRecipe(I, Recipe);
6949 
6950   // Find if I uses a predicated instruction. If so, it will use its scalar
6951   // value. Avoid hoisting the insert-element which packs the scalar value into
6952   // a vector value, as that happens iff all users use the vector value.
6953   for (auto &Op : I->operands())
6954     if (auto *PredInst = dyn_cast<Instruction>(Op))
6955       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6956         PredInst2Recipe[PredInst]->setAlsoPack(false);
6957 
  // Finalize the recipe for Instr, handling the non-predicated case first.
6959   if (!IsPredicated) {
6960     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6961     VPBB->appendRecipe(Recipe);
6962     return VPBB;
6963   }
6964   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6965   assert(VPBB->getSuccessors().empty() &&
6966          "VPBB has successors when handling predicated replication.");
6967   // Record predicated instructions for above packing optimizations.
6968   PredInst2Recipe[I] = Recipe;
6969   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6970   VPBlockUtils::insertBlockAfter(Region, VPBB);
6971   auto *RegSucc = new VPBasicBlock();
6972   VPBlockUtils::insertBlockAfter(RegSucc, Region);
6973   return RegSucc;
6974 }
6975 
6976 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6977                                                       VPRecipeBase *PredRecipe,
6978                                                       VPlanPtr &Plan) {
6979   // Instructions marked for predication are replicated and placed under an
6980   // if-then construct to prevent side-effects.
6981 
6982   // Generate recipes to compute the block mask for this region.
6983   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6984 
6985   // Build the triangular if-then region.
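  // The resulting region has the shape:
  //
  //       entry
  //       /   \
  //      if    |
  //       \   /
  //     continue
  //
  // i.e., the entry block branches on the mask either into the predicated
  // block or directly to the continue block.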
6986   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6987   assert(Instr->getParent() && "Predicated instruction not in any basic block");
6988   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6989   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6990   auto *PHIRecipe =
6991       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6992   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6993   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6994   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6995 
6996   // Note: first set Entry as region entry and then connect successors starting
6997   // from it in order, to propagate the "parent" of each VPBasicBlock.
6998   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6999   VPBlockUtils::connectBlocks(Pred, Exit);
7000 
7001   return Region;
7002 }
7003 
7004 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7005                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7006   VPRecipeBase *Recipe = nullptr;
7007 
7008   // First, check for specific widening recipes that deal with memory
7009   // operations, inductions and Phi nodes.
7010   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7011       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7012       (Recipe = tryToBlend(Instr, Plan)) ||
7013       (isa<PHINode>(Instr) &&
7014        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7015     setRecipe(Instr, Recipe);
7016     VPBB->appendRecipe(Recipe);
7017     return true;
7018   }
7019 
7020   // Handle GEP widening.
7021   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7022     auto Scalarize = [&](unsigned VF) {
7023       return CM.isScalarWithPredication(Instr, VF) ||
7024              CM.isScalarAfterVectorization(Instr, VF) ||
7025              CM.isProfitableToScalarize(Instr, VF);
7026     };
7027     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7028       return false;
7029     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7030     setRecipe(Instr, Recipe);
7031     VPBB->appendRecipe(Recipe);
7032     return true;
7033   }
7034 
7035   // Check if Instr is to be widened by a general VPWidenRecipe, after
7036   // having first checked for specific widening recipes.
7037   if (tryToWiden(Instr, VPBB, Range))
7038     return true;
7039 
7040   return false;
7041 }
7042 
7043 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7044                                                         unsigned MaxVF) {
7045   assert(OrigLoop->empty() && "Inner loop expected.");
7046 
7047   // Collect conditions feeding internal conditional branches; they need to be
7048   // represented in VPlan for it to model masking.
7049   SmallPtrSet<Value *, 1> NeedDef;
7050 
7051   auto *Latch = OrigLoop->getLoopLatch();
7052   for (BasicBlock *BB : OrigLoop->blocks()) {
7053     if (BB == Latch)
7054       continue;
7055     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7056     if (Branch && Branch->isConditional())
7057       NeedDef.insert(Branch->getCondition());
7058   }
7059 
7060   // If the tail is to be folded by masking, the primary induction variable
7061   // needs to be represented in VPlan for it to model early-exit masking.
7062   // Also, both the Phi and the live-out instruction of each reduction are
7063   // required in order to introduce a select between them in VPlan.
7064   if (CM.foldTailByMasking()) {
7065     NeedDef.insert(Legal->getPrimaryInduction());
7066     for (auto &Reduction : *Legal->getReductionVars()) {
7067       NeedDef.insert(Reduction.first);
7068       NeedDef.insert(Reduction.second.getLoopExitInstr());
7069     }
7070   }
7071 
7072   // Collect instructions from the original loop that will become trivially dead
7073   // in the vectorized loop. We don't need to vectorize these instructions. For
7074   // example, original induction update instructions can become dead because we
7075   // separately emit induction "steps" when generating code for the new loop.
7076   // Similarly, we create a new latch condition when setting up the structure
7077   // of the new loop, so the old one can become dead.
7078   SmallPtrSet<Instruction *, 4> DeadInstructions;
7079   collectTriviallyDeadInstructions(DeadInstructions);
7080 
7081   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7082     VFRange SubRange = {VF, MaxVF + 1};
7083     VPlans.push_back(
7084         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
7085     VF = SubRange.End;
7086   }
7087 }
7088 
7089 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7090     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7091     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7092 
7093   // Hold a mapping from predicated instructions to their recipes, in order to
7094   // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7096   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7097 
7098   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7099 
7100   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7101 
7102   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7103 
7104   // ---------------------------------------------------------------------------
7105   // Pre-construction: record ingredients whose recipes we'll need to further
7106   // process after constructing the initial VPlan.
7107   // ---------------------------------------------------------------------------
7108 
7109   // Mark instructions we'll need to sink later and their targets as
7110   // ingredients whose recipe we'll need to record.
7111   for (auto &Entry : SinkAfter) {
7112     RecipeBuilder.recordRecipeOf(Entry.first);
7113     RecipeBuilder.recordRecipeOf(Entry.second);
7114   }
7115 
7116   // For each interleave group which is relevant for this (possibly trimmed)
7117   // Range, add it to the set of groups to be later applied to the VPlan and add
7118   // placeholders for its members' Recipes which we'll be replacing with a
7119   // single VPInterleaveRecipe.
7120   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7121     auto applyIG = [IG, this](unsigned VF) -> bool {
7122       return (VF >= 2 && // Query is illegal for VF == 1
7123               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7124                   LoopVectorizationCostModel::CM_Interleave);
7125     };
7126     if (!getDecisionAndClampRange(applyIG, Range))
7127       continue;
7128     InterleaveGroups.insert(IG);
7129     for (unsigned i = 0; i < IG->getFactor(); i++)
7130       if (Instruction *Member = IG->getMember(i))
7131         RecipeBuilder.recordRecipeOf(Member);
  }
7133 
7134   // ---------------------------------------------------------------------------
7135   // Build initial VPlan: Scan the body of the loop in a topological order to
7136   // visit each basic block after having visited its predecessor basic blocks.
7137   // ---------------------------------------------------------------------------
7138 
7139   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7140   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7141   auto Plan = std::make_unique<VPlan>(VPBB);
7142 
7143   // Represent values that will have defs inside VPlan.
7144   for (Value *V : NeedDef)
7145     Plan->addVPValue(V);
7146 
7147   // Scan the body of the loop in a topological order to visit each basic block
7148   // after having visited its predecessor basic blocks.
7149   LoopBlocksDFS DFS(OrigLoop);
7150   DFS.perform(LI);
7151 
7152   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7153     // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and will fill a new VPBasicBlock.
7155     unsigned VPBBsForBB = 0;
7156     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7157     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7158     VPBB = FirstVPBBForBB;
7159     Builder.setInsertPoint(VPBB);
7160 
7161     // Introduce each ingredient into VPlan.
7162     for (Instruction &I : BB->instructionsWithoutDebug()) {
7163       Instruction *Instr = &I;
7164 
7165       // First filter out irrelevant instructions, to ensure no recipes are
7166       // built for them.
7167       if (isa<BranchInst>(Instr) ||
7168           DeadInstructions.find(Instr) != DeadInstructions.end())
7169         continue;
7170 
7171       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7172         continue;
7173 
      // Otherwise, if all widening options failed, the instruction is to be
7175       // replicated. This may create a successor for VPBB.
7176       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7177           Instr, Range, VPBB, PredInst2Recipe, Plan);
7178       if (NextVPBB != VPBB) {
7179         VPBB = NextVPBB;
7180         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7181                                     : "");
7182       }
7183     }
7184   }
7185 
7186   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
7188   // basic-blocks with no recipes.
7189   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7190   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7191   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7192   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7193   delete PreEntry;
7194 
7195   // ---------------------------------------------------------------------------
7196   // Transform initial VPlan: Apply previously taken decisions, in order, to
7197   // bring the VPlan to its final state.
7198   // ---------------------------------------------------------------------------
7199 
7200   // Apply Sink-After legal constraints.
7201   for (auto &Entry : SinkAfter) {
7202     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7203     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7204     Sink->moveAfter(Target);
7205   }
7206 
7207   // Interleave memory: for each Interleave Group we marked earlier as relevant
7208   // for this VPlan, replace the Recipes widening its memory instructions with a
7209   // single VPInterleaveRecipe at its insertion point.
7210   for (auto IG : InterleaveGroups) {
7211     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7212         RecipeBuilder.getRecipe(IG->getInsertPos()));
7213     (new VPInterleaveRecipe(IG, Recipe->getMask()))->insertBefore(Recipe);
7214 
7215     for (unsigned i = 0; i < IG->getFactor(); ++i)
7216       if (Instruction *Member = IG->getMember(i)) {
7217         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7218       }
7219   }
7220 
7221   // Finally, if tail is folded by masking, introduce selects between the phi
7222   // and the live-out instruction of each reduction, at the end of the latch.
7223   if (CM.foldTailByMasking()) {
7224     Builder.setInsertPoint(VPBB);
7225     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7226     for (auto &Reduction : *Legal->getReductionVars()) {
7227       VPValue *Phi = Plan->getVPValue(Reduction.first);
7228       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7229       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7230     }
7231   }
7232 
7233   std::string PlanName;
7234   raw_string_ostream RSO(PlanName);
7235   unsigned VF = Range.Start;
7236   Plan->addVF(VF);
7237   RSO << "Initial VPlan for VF={" << VF;
7238   for (VF *= 2; VF < Range.End; VF *= 2) {
7239     Plan->addVF(VF);
7240     RSO << "," << VF;
7241   }
7242   RSO << "},UF>=1";
7243   RSO.flush();
7244   Plan->setName(PlanName);
7245 
7246   return Plan;
7247 }
7248 
7249 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
7252   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7253   // the vectorization pipeline.
7254   assert(!OrigLoop->empty());
7255   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7256 
7257   // Create new empty VPlan
7258   auto Plan = std::make_unique<VPlan>();
7259 
7260   // Build hierarchical CFG
7261   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7262   HCFGBuilder.buildHierarchicalCFG();
7263 
7264   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7265     Plan->addVF(VF);
7266 
7267   if (EnableVPlanPredication) {
7268     VPlanPredicator VPP(*Plan);
7269     VPP.predicate();
7270 
7271     // Avoid running transformation to recipes until masked code generation in
7272     // VPlan-native path is in place.
7273     return Plan;
7274   }
7275 
7276   SmallPtrSet<Instruction *, 1> DeadInstructions;
7277   VPlanTransforms::VPInstructionsToVPRecipes(
7278       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7279   return Plan;
7280 }
7281 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7286 
7287 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7288   O << " +\n"
7289     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7290   IG->getInsertPos()->printAsOperand(O, false);
7291   if (User) {
7292     O << ", ";
7293     User->getOperand(0)->printAsOperand(O);
7294   }
7295   O << "\\l\"";
7296   for (unsigned i = 0; i < IG->getFactor(); ++i)
7297     if (Instruction *I = IG->getMember(i))
7298       O << " +\n"
7299         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7300 }
7301 
7302 void VPWidenRecipe::execute(VPTransformState &State) {
7303   for (auto &Instr : make_range(Begin, End))
7304     State.ILV->widenInstruction(Instr);
7305 }
7306 
7307 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7308   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7309                       IsIndexLoopInvariant);
7310 }
7311 
7312 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7313   assert(!State.Instance && "Int or FP induction being replicated.");
7314   State.ILV->widenIntOrFpInduction(IV, Trunc);
7315 }
7316 
7317 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7318   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7319 }
7320 
7321 void VPBlendRecipe::execute(VPTransformState &State) {
7322   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7323   // We know that all PHIs in non-header blocks are converted into
7324   // selects, so we don't have to worry about the insertion order and we
7325   // can just use the builder.
7326   // At this point we generate the predication tree. There may be
7327   // duplications since this is a simple recursive scan, but future
7328   // optimizations will clean it up.
7329 
7330   unsigned NumIncoming = Phi->getNumIncomingValues();
7331 
7332   assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7334   // Generate a sequence of selects of the form:
7335   // SELECT(Mask3, In3,
7336   //      SELECT(Mask2, In2,
7337   //                   ( ...)))
7338   InnerLoopVectorizer::VectorParts Entry(State.UF);
7339   for (unsigned In = 0; In < NumIncoming; ++In) {
7340     for (unsigned Part = 0; Part < State.UF; ++Part) {
7341       // We might have single edge PHIs (blocks) - use an identity
7342       // 'select' for the first PHI operand.
7343       Value *In0 =
7344           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7345       if (In == 0)
7346         Entry[Part] = In0; // Initialize with the first incoming value.
7347       else {
7348         // Select between the current value and the previous incoming edge
7349         // based on the incoming mask.
7350         Value *Cond = State.get(User->getOperand(In), Part);
7351         Entry[Part] =
7352             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7353       }
7354     }
7355   }
7356   for (unsigned Part = 0; Part < State.UF; ++Part)
7357     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7358 }
7359 
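// Generate the wide loads/stores for the interleave group, forwarding the
// per-part mask values when the group is masked.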
7360 void VPInterleaveRecipe::execute(VPTransformState &State) {
7361   assert(!State.Instance && "Interleave group being replicated.");
7362   if (!User)
7363     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7364 
7365   // Last (and currently only) operand is a mask.
7366   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7367   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7368   for (unsigned Part = 0; Part < State.UF; ++Part)
7369     MaskValues[Part] = State.get(Mask, Part);
7370   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7371 }
7372 
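// Replicate the ingredient as scalar instructions: either the single instance
// requested by the state, or one copy per lane and unroll part (lane 0 only
// for uniform instructions).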
7373 void VPReplicateRecipe::execute(VPTransformState &State) {
7374   if (State.Instance) { // Generate a single instance.
7375     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7376     // Insert scalar instance packing it into a vector.
7377     if (AlsoPack && State.VF > 1) {
7378       // If we're constructing lane 0, initialize to start from undef.
7379       if (State.Instance->Lane == 0) {
7380         Value *Undef =
7381             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7382         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7383       }
7384       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7385     }
7386     return;
7387   }
7388 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
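  // E.g. (illustrative), with UF = 2 and VF = 4 a non-uniform instruction is
  // scalarized into 8 copies (lanes 0-3 of parts 0 and 1), while a uniform one
  // gets 2 copies (lane 0 of each part).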
7392   unsigned EndLane = IsUniform ? 1 : State.VF;
7393   for (unsigned Part = 0; Part < State.UF; ++Part)
7394     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7395       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7396 }
7397 
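// Emit a conditional branch on the current instance's mask bit, extracting
// the lane from a vector mask if necessary. The branch replaces the block's
// temporary unreachable terminator; its successors are wired up later.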
7398 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7399   assert(State.Instance && "Branch on Mask works only on single instance.");
7400 
7401   unsigned Part = State.Instance->Part;
7402   unsigned Lane = State.Instance->Lane;
7403 
7404   Value *ConditionBit = nullptr;
  if (!User) // The block's incoming mask (block-in mask) is all ones.
7406     ConditionBit = State.Builder.getTrue();
7407   else {
7408     VPValue *BlockInMask = User->getOperand(0);
7409     ConditionBit = State.get(BlockInMask, Part);
7410     if (ConditionBit->getType()->isVectorTy())
7411       ConditionBit = State.Builder.CreateExtractElement(
7412           ConditionBit, State.Builder.getInt32(Lane));
7413   }
7414 
7415   // Replace the temporary unreachable terminator with a new conditional branch,
7416   // whose two destinations will be set later when they are created.
7417   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7418   assert(isa<UnreachableInst>(CurrentTerminator) &&
7419          "Expected to replace unreachable terminator with conditional branch.");
7420   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7421   CondBr->setSuccessor(0, nullptr);
7422   ReplaceInstWithInst(CurrentTerminator, CondBr);
7423 }
7424 
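// Merge the result of a predicated, scalarized instruction back into the
// unpredicated control flow with a phi, either for its packed vector value or
// for its scalar value, as detailed in the comment below.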
7425 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7426   assert(State.Instance && "Predicated instruction PHI works per instance.");
7427   Instruction *ScalarPredInst = cast<Instruction>(
7428       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7429   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7430   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7431   assert(PredicatingBB && "Predicated block has no single predecessor.");
7432 
7433   // By current pack/unpack logic we need to generate only a single phi node: if
7434   // a vector value for the predicated instruction exists at this point it means
7435   // the instruction has vector users only, and a phi for the vector value is
7436   // needed. In this case the recipe of the predicated instruction is marked to
7437   // also do that packing, thereby "hoisting" the insert-element sequence.
7438   // Otherwise, a phi node for the scalar value is needed.
7439   unsigned Part = State.Instance->Part;
7440   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7441     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7442     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7443     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7444     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7445     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7446     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7447   } else {
7448     Type *PredInstType = PredInst->getType();
7449     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7450     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7451     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7452     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7453   }
7454 }
7455 
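// Widen the memory instruction, forwarding the per-part mask values when the
// access is masked.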
7456 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7457   VPValue *Mask = getMask();
7458   if (!Mask)
7459     return State.ILV->vectorizeMemoryInstruction(&Instr);
7460 
7461   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7462   for (unsigned Part = 0; Part < State.UF; ++Part)
7463     MaskValues[Part] = State.get(Mask, Part);
7464   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7465 }
7466 
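// Decide how the scalar epilogue of a vectorized loop should be handled:
// allowed (the default), not allowed because the function is optimized for
// size, or not needed because the tail will be folded via predication when
// the target, hints, or command-line flags prefer predication over an
// epilogue.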
7467 static ScalarEpilogueLowering
7468 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7469                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
7470                           TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7471                           AssumptionCache *AC, LoopInfo *LI,
7472                           ScalarEvolution *SE, DominatorTree *DT,
7473                           const LoopAccessInfo *LAI) {
7474   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7475   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7476                               !PreferPredicateOverEpilog;
7477 
7478   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7479       (F->hasOptSize() ||
7480        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7481                                    PGSOQueryType::IRPass)))
7482     SEL = CM_ScalarEpilogueNotAllowedOptSize;
7483   else if (PreferPredicateOverEpilog ||
7484            Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7485            (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) &&
7486             Hints.getPredicate() != LoopVectorizeHints::FK_Disabled &&
7487             !PredicateOptDisabled))
7488     SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7489 
7490   return SEL;
7491 }
7492 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which makes it possible to
// apply VPlan-to-VPlan transformations from the very beginning without
// modifying the input LLVM IR.
7497 static bool processLoopInVPlanNativePath(
7498     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7499     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7500     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7501     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7502     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7503 
7504   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7505   Function *F = L->getHeader()->getParent();
7506   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7507 
7508   ScalarEpilogueLowering SEL =
7509     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
7510                               PSE.getSE(), DT, LVL->getLAI());
7511 
7512   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7513                                 &Hints, IAI);
7514   // Use the planner for outer loop vectorization.
7515   // TODO: CM is not used at this point inside the planner. Turn CM into an
7516   // optional argument if we don't need it in the future.
7517   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7518 
7519   // Get user vectorization factor.
7520   const unsigned UserVF = Hints.getWidth();
7521 
7522   // Plan how to best vectorize, return the best VF and its cost.
7523   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7524 
  // If we are stress testing VPlan builds or VPlan predication is enabled, do
  // not attempt to generate vector code; masked vector code generation support
  // will follow soon. Also, do not attempt to vectorize if no vector code will
  // be produced.
7528   if (VPlanBuildStressTest || EnableVPlanPredication ||
7529       VectorizationFactor::Disabled() == VF)
7530     return false;
7531 
7532   LVP.setBestPlan(VF.Width, 1);
7533 
7534   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7535                          &CM);
7536   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7537                     << L->getHeader()->getParent()->getName() << "\"\n");
7538   LVP.executePlan(LB, DT);
7539 
7540   // Mark the loop as already vectorized to avoid vectorizing again.
7541   Hints.setAlreadyVectorized();
7542 
7543   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7544   return true;
7545 }
7546 
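// Try to vectorize or interleave a single loop (or hand an outer loop to the
// VPlan-native path). Returns true if the IR was changed.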
7547 bool LoopVectorizePass::processLoop(Loop *L) {
7548   assert((EnableVPlanNativePath || L->empty()) &&
7549          "VPlan-native path is not enabled. Only process inner loops.");
7550 
7551 #ifndef NDEBUG
7552   const std::string DebugLocStr = getDebugLocString(L);
7553 #endif /* NDEBUG */
7554 
7555   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7556                     << L->getHeader()->getParent()->getName() << "\" from "
7557                     << DebugLocStr << "\n");
7558 
7559   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7560 
7561   LLVM_DEBUG(
7562       dbgs() << "LV: Loop hints:"
7563              << " force="
7564              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7565                      ? "disabled"
7566                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7567                             ? "enabled"
7568                             : "?"))
7569              << " width=" << Hints.getWidth()
7570              << " unroll=" << Hints.getInterleave() << "\n");
7571 
  // The function containing the loop.
7573   Function *F = L->getHeader()->getParent();
7574 
  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose and report, respectively, vectorized loops and unvectorized loops
  // that may benefit from vectorization.
7582 
7583   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7584     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7585     return false;
7586   }
7587 
7588   PredicatedScalarEvolution PSE(*SE, *L);
7589 
7590   // Check if it is legal to vectorize the loop.
7591   LoopVectorizationRequirements Requirements(*ORE);
7592   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7593                                 &Requirements, &Hints, DB, AC);
7594   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7595     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7596     Hints.emitRemarkWithHints();
7597     return false;
7598   }
7599 
7600   // Check the function attributes and profiles to find out if this function
7601   // should be optimized for size.
7602   ScalarEpilogueLowering SEL =
7603     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
7604                               PSE.getSE(), DT, LVL.getLAI());
7605 
7606   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7607   // here. They may require CFG and instruction level transformations before
7608   // even evaluating whether vectorization is profitable. Since we cannot modify
7609   // the incoming IR, we need to build VPlan upfront in the vectorization
7610   // pipeline.
7611   if (!L->empty())
7612     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7613                                         ORE, BFI, PSI, Hints);
7614 
7615   assert(L->empty() && "Inner loop expected.");
7616 
7617   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7618   // count by optimizing for size, to minimize overheads.
7619   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7620   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7621     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7622                       << "This loop is worth vectorizing only if no scalar "
7623                       << "iteration overheads are incurred.");
7624     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7625       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7626     else {
7627       LLVM_DEBUG(dbgs() << "\n");
7628       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7629     }
7630   }
7631 
7632   // Check the function attributes to see if implicit floats are allowed.
7633   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7634   // an integer loop and the vector instructions selected are purely integer
7635   // vector instructions?
7636   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7637     reportVectorizationFailure(
7638         "Can't vectorize when the NoImplicitFloat attribute is used",
7639         "loop not vectorized due to NoImplicitFloat attribute",
7640         "NoImplicitFloat", ORE, L);
7641     Hints.emitRemarkWithHints();
7642     return false;
7643   }
7644 
7645   // Check if the target supports potentially unsafe FP vectorization.
7646   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7647   // for the target we're vectorizing for, to make sure none of the
7648   // additional fp-math flags can help.
7649   if (Hints.isPotentiallyUnsafe() &&
7650       TTI->isFPVectorizationPotentiallyUnsafe()) {
7651     reportVectorizationFailure(
7652         "Potentially unsafe FP op prevents vectorization",
7653         "loop not vectorized due to unsafe FP support.",
7654         "UnsafeFP", ORE, L);
7655     Hints.emitRemarkWithHints();
7656     return false;
7657   }
7658 
7659   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7660   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7661 
7662   // If an override option has been passed in for interleaved accesses, use it.
7663   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7664     UseInterleaved = EnableInterleavedMemAccesses;
7665 
7666   // Analyze interleaved memory accesses.
7667   if (UseInterleaved) {
7668     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7669   }
7670 
7671   // Use the cost model.
7672   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7673                                 F, &Hints, IAI);
7674   CM.collectValuesToIgnore();
7675 
7676   // Use the planner for vectorization.
7677   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7678 
7679   // Get user vectorization factor.
7680   unsigned UserVF = Hints.getWidth();
7681 
7682   // Plan how to best vectorize, return the best VF and its cost.
7683   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7684 
7685   VectorizationFactor VF = VectorizationFactor::Disabled();
7686   unsigned IC = 1;
7687   unsigned UserIC = Hints.getInterleave();
7688 
7689   if (MaybeVF) {
7690     VF = *MaybeVF;
7691     // Select the interleave count.
7692     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7693   }
7694 
7695   // Identify the diagnostic messages that should be produced.
7696   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7697   bool VectorizeLoop = true, InterleaveLoop = true;
7698   if (Requirements.doesNotMeet(F, L, Hints)) {
7699     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7700                          "requirements.\n");
7701     Hints.emitRemarkWithHints();
7702     return false;
7703   }
7704 
7705   if (VF.Width == 1) {
7706     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7707     VecDiagMsg = std::make_pair(
7708         "VectorizationNotBeneficial",
7709         "the cost-model indicates that vectorization is not beneficial");
7710     VectorizeLoop = false;
7711   }
7712 
7713   if (!MaybeVF && UserIC > 1) {
7714     // Tell the user interleaving was avoided up-front, despite being explicitly
7715     // requested.
7716     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7717                          "interleaving should be avoided up front\n");
7718     IntDiagMsg = std::make_pair(
7719         "InterleavingAvoided",
7720         "Ignoring UserIC, because interleaving was avoided up front");
7721     InterleaveLoop = false;
7722   } else if (IC == 1 && UserIC <= 1) {
7723     // Tell the user interleaving is not beneficial.
7724     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7725     IntDiagMsg = std::make_pair(
7726         "InterleavingNotBeneficial",
7727         "the cost-model indicates that interleaving is not beneficial");
7728     InterleaveLoop = false;
7729     if (UserIC == 1) {
7730       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7731       IntDiagMsg.second +=
7732           " and is explicitly disabled or interleave count is set to 1";
7733     }
7734   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
7736     LLVM_DEBUG(
7737         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7738     IntDiagMsg = std::make_pair(
7739         "InterleavingBeneficialButDisabled",
7740         "the cost-model indicates that interleaving is beneficial "
7741         "but is explicitly disabled or interleave count is set to 1");
7742     InterleaveLoop = false;
7743   }
7744 
7745   // Override IC if user provided an interleave count.
7746   IC = UserIC > 0 ? UserIC : IC;
7747 
7748   // Emit diagnostic messages, if any.
7749   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7750   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7752     ORE->emit([&]() {
7753       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7754                                       L->getStartLoc(), L->getHeader())
7755              << VecDiagMsg.second;
7756     });
7757     ORE->emit([&]() {
7758       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7759                                       L->getStartLoc(), L->getHeader())
7760              << IntDiagMsg.second;
7761     });
7762     return false;
7763   } else if (!VectorizeLoop && InterleaveLoop) {
7764     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7765     ORE->emit([&]() {
7766       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7767                                         L->getStartLoc(), L->getHeader())
7768              << VecDiagMsg.second;
7769     });
7770   } else if (VectorizeLoop && !InterleaveLoop) {
7771     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7772                       << ") in " << DebugLocStr << '\n');
7773     ORE->emit([&]() {
7774       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7775                                         L->getStartLoc(), L->getHeader())
7776              << IntDiagMsg.second;
7777     });
7778   } else if (VectorizeLoop && InterleaveLoop) {
7779     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7780                       << ") in " << DebugLocStr << '\n');
7781     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7782   }
7783 
7784   LVP.setBestPlan(VF.Width, IC);
7785 
7786   using namespace ore;
7787   bool DisableRuntimeUnroll = false;
7788   MDNode *OrigLoopID = L->getLoopID();
7789 
7790   if (!VectorizeLoop) {
7791     assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not profitable to vectorize the loop, then
    // interleave it.
7794     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7795                                &CM);
7796     LVP.executePlan(Unroller, DT);
7797 
7798     ORE->emit([&]() {
7799       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7800                                 L->getHeader())
7801              << "interleaved loop (interleaved count: "
7802              << NV("InterleaveCount", IC) << ")";
7803     });
7804   } else {
7805     // If we decided that it is *legal* to vectorize the loop, then do it.
7806     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7807                            &LVL, &CM);
7808     LVP.executePlan(LB, DT);
7809     ++LoopsVectorized;
7810 
    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks for strides and memory accesses. A scalar loop
    // that is rarely used is not worth unrolling.
7814     if (!LB.areSafetyChecksAdded())
7815       DisableRuntimeUnroll = true;
7816 
7817     // Report the vectorization decision.
7818     ORE->emit([&]() {
7819       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7820                                 L->getHeader())
7821              << "vectorized loop (vectorization width: "
7822              << NV("VectorizationFactor", VF.Width)
7823              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7824     });
7825   }
7826 
7827   Optional<MDNode *> RemainderLoopID =
7828       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7829                                       LLVMLoopVectorizeFollowupEpilogue});
7830   if (RemainderLoopID.hasValue()) {
7831     L->setLoopID(RemainderLoopID.getValue());
7832   } else {
7833     if (DisableRuntimeUnroll)
7834       AddRuntimeUnrollDisableMetaData(L);
7835 
7836     // Mark the loop as already vectorized to avoid vectorizing again.
7837     Hints.setAlreadyVectorized();
7838   }
7839 
7840   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7841   return true;
7842 }
7843 
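// Driver for the pass: cache the required analyses, simplify all loops, and
// visit every supported inner loop in the function, returning true if any IR
// was changed.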
7844 bool LoopVectorizePass::runImpl(
7845     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7846     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7847     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7848     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7849     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7850   SE = &SE_;
7851   LI = &LI_;
7852   TTI = &TTI_;
7853   DT = &DT_;
7854   BFI = &BFI_;
7855   TLI = TLI_;
7856   AA = &AA_;
7857   AC = &AC_;
7858   GetLAA = &GetLAA_;
7859   DB = &DB_;
7860   ORE = &ORE_;
7861   PSI = PSI_;
7862 
7863   // Don't attempt if
7864   // 1. the target claims to have no vector registers, and
7865   // 2. interleaving won't help ILP.
7866   //
7867   // The second condition is necessary because, even if the target has no
7868   // vector registers, loop vectorization may still enable scalar
7869   // interleaving.
7870   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7871       TTI->getMaxInterleaveFactor(1) < 2)
7872     return false;
7873 
7874   bool Changed = false;
7875 
7876   // The vectorizer requires loops to be in simplified form.
7877   // Since simplification may add new inner loops, it has to run before the
7878   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7880   // vectorized.
7881   for (auto &L : *LI)
7882     Changed |=
7883         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7884 
7885   // Build up a worklist of inner-loops to vectorize. This is necessary as
7886   // the act of vectorizing or partially unrolling a loop creates new loops
7887   // and can invalidate iterators across the loops.
7888   SmallVector<Loop *, 8> Worklist;
7889 
7890   for (Loop *L : *LI)
7891     collectSupportedLoops(*L, LI, ORE, Worklist);
7892 
7893   LoopsAnalyzed += Worklist.size();
7894 
7895   // Now walk the identified inner loops.
7896   while (!Worklist.empty()) {
7897     Loop *L = Worklist.pop_back_val();
7898 
7899     // For the inner loops we actually process, form LCSSA to simplify the
7900     // transform.
7901     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7902 
7903     Changed |= processLoop(L);
7904   }
7905 
7906   // Process each loop nest in the function.
7907   return Changed;
7908 }
7909 
7910 PreservedAnalyses LoopVectorizePass::run(Function &F,
7911                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
}
7954