1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
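//
// As an illustrative sketch only (C-like pseudocode, not the output of this
// pass), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + k;
//
// is conceptually turned, for VF = 4, into
//
//   int i = 0;
//   for (; i + 4 <= n; i += 4)      // wide iterations; index advances by VF
//     for (int j = 0; j < 4; ++j)   //   ... this body becomes one SIMD op
//       a[i + j] = b[i + j] + k;
//   for (; i < n; ++i)              // scalar epilogue for the remainder
//     a[i] = b[i] + k;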
17 //
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
27 //
// There is a development effort going on to migrate the loop vectorizer to
// the VPlan infrastructure and to introduce outer loop vectorization support
// (see docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For
// this purpose, we temporarily introduced the VPlan-native vectorization
// path: an alternative vectorization path that is natively implemented on top
// of the VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
// Indicates that an epilogue is undesired; predication is preferred instead.
// This means that the vectorizer will try to fold the loop-tail (epilogue)
// into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
192              "will be determined by the smallest type in loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
/// A helper function that returns the type of a loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
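
// For example, given the (illustrative) IR below, this helper returns i32 for
// the store and float for the load:
//
//   store i32 %v, i32* %p
//   %x = load float, float* %q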
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = VectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
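
// A concrete example for the VF == 1 path: with a typical DataLayout, an i1
// has an allocation size of 8 bits but a type size of 1 bit, so it is
// considered irregular, whereas an i32 (32 bits for both) is not.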
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
329 static unsigned getReciprocalPredBlockProb() { return 2; }
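
// A sketch of the intended use (illustrative only): a cost C attributed to an
// instruction in a predicated block is down-weighted by the assumed execution
// probability of that block, e.g.
//
//   unsigned Discounted = C / getReciprocalPredBlockProb(); // i.e. C * 50%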
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
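
// Illustrative use, assuming an IRBuilder<> named Builder is in scope: wrap a
// newly created FP operation so that it carries fast-math flags; non-FP values
// are returned unchanged.
//
//   Value *Sum = addFastMathFlag(Builder.CreateFAdd(A, B));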
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
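
// A sketch of typical use by a caller (the surrounding code is illustrative,
// not a quote of an actual call site):
//
//   if (auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop))
//     if (*ExpectedTC < TinyTripCountVectorThreshold)
//       ; // treat TheLoop as a tiny-trip-count loop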
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
/// This class performs the widening of scalar instructions into vector
/// instructions, or into multiple scalar copies when scalarization is used.
/// This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for the loop.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM)
399       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
400         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
401         Builder(PSE.getSE()->getContext()),
402         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
403   virtual ~InnerLoopVectorizer() = default;
404 
405   /// Create a new empty loop. Unlink the old loop and connect the new one.
406   /// Return the pre-header block of the new loop.
407   BasicBlock *createVectorizedLoopSkeleton();
408 
409   /// Widen a single instruction within the innermost loop.
410   void widenInstruction(Instruction &I);
411 
412   /// Widen a single call instruction within the innermost loop.
413   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
414                             VPTransformState &State);
415 
416   /// Widen a single select instruction within the innermost loop.
417   void widenSelectInstruction(SelectInst &I, bool InvariantCond);
418 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
420   void fixVectorizedLoop();
421 
422   // Return true if any runtime check is added.
423   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
424 
425   /// A type for vectorized values in the new loop. Each value from the
426   /// original loop, when vectorized, is represented by UF vector values in the
427   /// new unrolled loop, where UF is the unroll factor.
428   using VectorParts = SmallVector<Value *, 2>;
429 
430   /// Vectorize a single GetElementPtrInst based on information gathered and
431   /// decisions taken during planning.
432   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
433                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
434 
435   /// Vectorize a single PHINode in a block. This method handles the induction
436   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
437   /// arbitrary length vectors.
438   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
439 
440   /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
444   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
445                             bool IfPredicateInstr);
446 
447   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
448   /// is provided, the integer induction variable will first be truncated to
449   /// the corresponding type.
450   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
451 
452   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
453   /// vector or scalar value on-demand if one is not yet available. When
454   /// vectorizing a loop, we visit the definition of an instruction before its
455   /// uses. When visiting the definition, we either vectorize or scalarize the
456   /// instruction, creating an entry for it in the corresponding map. (In some
457   /// cases, such as induction variables, we will create both vector and scalar
458   /// entries.) Then, as we encounter uses of the definition, we derive values
459   /// for each scalar or vector use unless such a value is already available.
460   /// For example, if we scalarize a definition and one of its uses is vector,
461   /// we build the required vector on-demand with an insertelement sequence
462   /// when visiting the use. Otherwise, if the use is scalar, we can use the
463   /// existing scalar definition.
464   ///
465   /// Return a value in the new loop corresponding to \p V from the original
466   /// loop at unroll index \p Part. If the value has already been vectorized,
467   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
468   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
469   /// a new vector value on-demand by inserting the scalar values into a vector
470   /// with an insertelement sequence. If the value has been neither vectorized
471   /// nor scalarized, it must be loop invariant, so we simply broadcast the
472   /// value into a vector.
473   Value *getOrCreateVectorValue(Value *V, unsigned Part);
474 
475   /// Return a value in the new loop corresponding to \p V from the original
476   /// loop at unroll and vector indices \p Instance. If the value has been
477   /// vectorized but not scalarized, the necessary extractelement instruction
478   /// will be generated.
479   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
480 
481   /// Construct the vector value of a scalarized value \p V one lane at a time.
482   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
483 
484   /// Try to vectorize interleaved access group \p Group with the base address
485   /// given in \p Addr, optionally masking the vector operations if \p
486   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
487   /// values in the vectorized loop.
488   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
489                                 VPTransformState &State, VPValue *Addr,
490                                 VPValue *BlockInMask = nullptr);
491 
492   /// Vectorize Load and Store instructions with the base address given in \p
493   /// Addr, optionally masking the vector operations if \p BlockInMask is
494   /// non-null. Use \p State to translate given VPValues to IR values in the
495   /// vectorized loop.
496   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
497                                   VPValue *Addr, VPValue *StoredValue,
498                                   VPValue *BlockInMask);
499 
500   /// Set the debug location in the builder using the debug location in
501   /// the instruction.
502   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
503 
504   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
506 
507 protected:
508   friend class LoopVectorizationPlanner;
509 
510   /// A small list of PHINodes.
511   using PhiVector = SmallVector<PHINode *, 4>;
512 
513   /// A type for scalarized values in the new loop. Each value from the
514   /// original loop, when scalarized, is represented by UF x VF scalar values
515   /// in the new unrolled loop, where UF is the unroll factor and VF is the
516   /// vectorization factor.
517   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
518 
519   /// Set up the values of the IVs correctly when exiting the vector loop.
520   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
521                     Value *CountRoundDown, Value *EndValue,
522                     BasicBlock *MiddleBlock);
523 
524   /// Create a new induction variable inside L.
525   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
526                                    Value *Step, Instruction *DL);
527 
528   /// Handle all cross-iteration phis in the header.
529   void fixCrossIterationPHIs();
530 
531   /// Fix a first-order recurrence. This is the second phase of vectorizing
532   /// this phi node.
533   void fixFirstOrderRecurrence(PHINode *Phi);
534 
535   /// Fix a reduction cross-iteration phi. This is the second phase of
536   /// vectorizing this phi node.
537   void fixReduction(PHINode *Phi);
538 
539   /// Clear NSW/NUW flags from reduction instructions if necessary.
540   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
541 
  /// The loop exit block may have single-value PHI nodes with some incoming
  /// value. While vectorizing we only handled real values that were defined
  /// inside the loop, and we should have one value for each predecessor of its
  /// parent basic block. See PR14725.
546   void fixLCSSAPHIs();
547 
548   /// Iteratively sink the scalarized operands of a predicated instruction into
549   /// the block that was created for it.
550   void sinkScalarOperands(Instruction *PredInst);
551 
552   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
553   /// represented as.
554   void truncateToMinimalBitwidths();
555 
556   /// Create a broadcast instruction. This method generates a broadcast
557   /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
  /// element.
561   virtual Value *getBroadcastInstrs(Value *V);
562 
563   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
564   /// to each vector element of Val. The sequence starts at StartIndex.
565   /// \p Opcode is relevant for FP induction variable.
566   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
567                                Instruction::BinaryOps Opcode =
568                                Instruction::BinaryOpsEnd);
569 
570   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
571   /// variable on which to base the steps, \p Step is the size of the step, and
572   /// \p EntryVal is the value from the original loop that maps to the steps.
573   /// Note that \p EntryVal doesn't have to be an induction variable - it
574   /// can also be a truncate instruction.
575   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
576                         const InductionDescriptor &ID);
577 
578   /// Create a vector induction phi node based on an existing scalar one. \p
579   /// EntryVal is the value from the original loop that maps to the vector phi
580   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
581   /// truncate instruction, instead of widening the original IV, we widen a
582   /// version of the IV truncated to \p EntryVal's type.
583   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
584                                        Value *Step, Instruction *EntryVal);
585 
586   /// Returns true if an instruction \p I should be scalarized instead of
587   /// vectorized for the chosen vectorization factor.
588   bool shouldScalarizeInstruction(Instruction *I) const;
589 
590   /// Returns true if we should generate a scalar version of \p IV.
591   bool needsScalarInduction(Instruction *IV) const;
592 
593   /// If there is a cast involved in the induction variable \p ID, which should
594   /// be ignored in the vectorized loop body, this function records the
595   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
596   /// cast. We had already proved that the casted Phi is equal to the uncasted
597   /// Phi in the vectorized loop (under a runtime guard), and therefore
598   /// there is no need to vectorize the cast - the same value can be used in the
599   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
602   ///
603   /// \p EntryVal is the value from the original loop that maps to the vector
604   /// phi node and is used to distinguish what is the IV currently being
605   /// processed - original one (if \p EntryVal is a phi corresponding to the
606   /// original IV) or the "newly-created" one based on the proof mentioned above
607   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
608   /// latter case \p EntryVal is a TruncInst and we must not record anything for
609   /// that IV, but it's error-prone to expect callers of this routine to care
610   /// about that, hence this explicit parameter.
611   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
612                                              const Instruction *EntryVal,
613                                              Value *VectorLoopValue,
614                                              unsigned Part,
615                                              unsigned Lane = UINT_MAX);
616 
617   /// Generate a shuffle sequence that will reverse the vector Vec.
618   virtual Value *reverseVector(Value *Vec);
619 
620   /// Returns (and creates if needed) the original loop trip count.
621   Value *getOrCreateTripCount(Loop *NewLoop);
622 
623   /// Returns (and creates if needed) the trip count of the widened loop.
624   Value *getOrCreateVectorTripCount(Loop *NewLoop);
625 
626   /// Returns a bitcasted value to the requested vector type.
627   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
628   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
629                                 const DataLayout &DL);
630 
631   /// Emit a bypass check to see if the vector trip count is zero, including if
632   /// it overflows.
633   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
634 
635   /// Emit a bypass check to see if all of the SCEV assumptions we've
636   /// had to make are correct.
637   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
638 
639   /// Emit bypass checks to check any memory assumptions we may have made.
640   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
641 
642   /// Compute the transformed value of Index at offset StartValue using step
643   /// StepValue.
644   /// For integer induction, returns StartValue + Index * StepValue.
645   /// For pointer induction, returns StartValue[Index * StepValue].
646   /// FIXME: The newly created binary instructions should contain nsw/nuw
647   /// flags, which can be found from the original scalar operations.
648   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
649                               const DataLayout &DL,
650                               const InductionDescriptor &ID) const;
651 
652   /// Add additional metadata to \p To that was not present on \p Orig.
653   ///
654   /// Currently this is used to add the noalias annotations based on the
655   /// inserted memchecks.  Use this for instructions that are *cloned* into the
656   /// vector loop.
657   void addNewMetadata(Instruction *To, const Instruction *Orig);
658 
659   /// Add metadata from one instruction to another.
660   ///
661   /// This includes both the original MDs from \p From and additional ones (\see
662   /// addNewMetadata).  Use this for *newly created* instructions in the vector
663   /// loop.
664   void addMetadata(Instruction *To, Instruction *From);
665 
666   /// Similar to the previous function but it adds the metadata to a
667   /// vector of instructions.
668   void addMetadata(ArrayRef<Value *> To, Instruction *From);
669 
670   /// The original loop.
671   Loop *OrigLoop;
672 
673   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
674   /// dynamic knowledge to simplify SCEV expressions and converts them to a
675   /// more usable form.
676   PredicatedScalarEvolution &PSE;
677 
678   /// Loop Info.
679   LoopInfo *LI;
680 
681   /// Dominator Tree.
682   DominatorTree *DT;
683 
684   /// Alias Analysis.
685   AliasAnalysis *AA;
686 
687   /// Target Library Info.
688   const TargetLibraryInfo *TLI;
689 
690   /// Target Transform Info.
691   const TargetTransformInfo *TTI;
692 
693   /// Assumption Cache.
694   AssumptionCache *AC;
695 
696   /// Interface to emit optimization remarks.
697   OptimizationRemarkEmitter *ORE;
698 
699   /// LoopVersioning.  It's only set up (non-null) if memchecks were
700   /// used.
701   ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
704   std::unique_ptr<LoopVersioning> LVer;
705 
706   /// The vectorization SIMD factor to use. Each vector will have this many
707   /// vector elements.
708   unsigned VF;
709 
710   /// The vectorization unroll factor to use. Each scalar is vectorized to this
711   /// many different vector instructions.
712   unsigned UF;
713 
714   /// The builder that we use
715   IRBuilder<> Builder;
716 
717   // --- Vectorization state ---
718 
719   /// The vector-loop preheader.
720   BasicBlock *LoopVectorPreHeader;
721 
722   /// The scalar-loop preheader.
723   BasicBlock *LoopScalarPreHeader;
724 
725   /// Middle Block between the vector and the scalar.
726   BasicBlock *LoopMiddleBlock;
727 
728   /// The ExitBlock of the scalar loop.
729   BasicBlock *LoopExitBlock;
730 
731   /// The vector loop body.
732   BasicBlock *LoopVectorBody;
733 
734   /// The scalar loop body.
735   BasicBlock *LoopScalarBody;
736 
737   /// A list of all bypass blocks. The first block is the entry of the loop.
738   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
739 
740   /// The new Induction variable which was added to the new block.
741   PHINode *Induction = nullptr;
742 
743   /// The induction variable of the old basic block.
744   PHINode *OldInduction = nullptr;
745 
746   /// Maps values from the original loop to their corresponding values in the
747   /// vectorized loop. A key value can map to either vector values, scalar
748   /// values or both kinds of values, depending on whether the key was
749   /// vectorized and scalarized.
750   VectorizerValueMap VectorLoopValueMap;
751 
752   /// Store instructions that were predicated.
753   SmallVector<Instruction *, 4> PredicatedInstructions;
754 
755   /// Trip count of the original loop.
756   Value *TripCount = nullptr;
757 
758   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
759   Value *VectorTripCount = nullptr;
760 
761   /// The legality analysis.
762   LoopVectorizationLegality *Legal;
763 
  /// The profitability analysis.
765   LoopVectorizationCostModel *Cost;
766 
767   // Record whether runtime checks are added.
768   bool AddedSafetyChecks = false;
769 
770   // Holds the end values for each induction variable. We save the end values
771   // so we can later fix-up the external users of the induction variables.
772   DenseMap<PHINode *, Value *> IVEndValues;
773 
774   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
775   // fixed up at the end of vector code generation.
776   SmallVector<PHINode *, 8> OrigPHIsToFix;
777 };
778 
779 class InnerLoopUnroller : public InnerLoopVectorizer {
780 public:
781   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
782                     LoopInfo *LI, DominatorTree *DT,
783                     const TargetLibraryInfo *TLI,
784                     const TargetTransformInfo *TTI, AssumptionCache *AC,
785                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
786                     LoopVectorizationLegality *LVL,
787                     LoopVectorizationCostModel *CM)
788       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
789                             UnrollFactor, LVL, CM) {}
790 
791 private:
792   Value *getBroadcastInstrs(Value *V) override;
793   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
794                        Instruction::BinaryOps Opcode =
795                        Instruction::BinaryOpsEnd) override;
796   Value *reverseVector(Value *Vec) override;
797 };
798 
799 } // end namespace llvm
800 
/// Look for a meaningful debug location on the instruction or its
/// operands.
803 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
804   if (!I)
805     return I;
806 
807   DebugLoc Empty;
808   if (I->getDebugLoc() != Empty)
809     return I;
810 
  for (Value *Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }
816 
817   return I;
818 }
819 
820 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
821   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
822     const DILocation *DIL = Inst->getDebugLoc();
823     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
824         !isa<DbgInfoIntrinsic>(Inst)) {
825       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
826       if (NewDIL)
827         B.SetCurrentDebugLocation(NewDIL.getValue());
828       else
829         LLVM_DEBUG(dbgs()
830                    << "Failed to create new discriminator: "
831                    << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
835   } else
836     B.SetCurrentDebugLocation(DebugLoc());
837 }
838 
839 /// Write a record \p DebugMsg about vectorization failure to the debug
840 /// output stream. If \p I is passed, it is an instruction that prevents
841 /// vectorization.
842 #ifndef NDEBUG
843 static void debugVectorizationFailure(const StringRef DebugMsg,
844     Instruction *I) {
845   dbgs() << "LV: Not vectorizing: " << DebugMsg;
846   if (I != nullptr)
847     dbgs() << " " << *I;
848   else
849     dbgs() << '.';
850   dbgs() << '\n';
851 }
852 #endif
853 
854 /// Create an analysis remark that explains why vectorization failed
855 ///
856 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
857 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
858 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
859 /// the location of the remark.  \return the remark object that can be
860 /// streamed to.
861 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
862     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
863   Value *CodeRegion = TheLoop->getHeader();
864   DebugLoc DL = TheLoop->getStartLoc();
865 
866   if (I) {
867     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
870     if (I->getDebugLoc())
871       DL = I->getDebugLoc();
872   }
873 
874   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
875   R << "loop not vectorized: ";
876   return R;
877 }
878 
879 namespace llvm {
880 
881 void reportVectorizationFailure(const StringRef DebugMsg,
882     const StringRef OREMsg, const StringRef ORETag,
883     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
884   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
885   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
886   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
887                 ORETag, TheLoop, I) << OREMsg);
888 }
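
// A sketch of how a caller might report a failure (the message strings and the
// remark tag below are hypothetical):
//
//   reportVectorizationFailure(
//       "Unsupported value",
//       "loop contains a value the vectorizer cannot handle",
//       "UnsupportedValue", ORE, TheLoop, I);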
889 
890 } // end namespace llvm
891 
892 #ifndef NDEBUG
893 /// \return string containing a file name and a line # for the given loop.
894 static std::string getDebugLocString(const Loop *L) {
895   std::string Result;
896   if (L) {
897     raw_string_ostream OS(Result);
898     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
899       LoopDbgLoc.print(OS);
900     else
901       // Just print the module name.
902       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
903     OS.flush();
904   }
905   return Result;
906 }
907 #endif
908 
909 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
910                                          const Instruction *Orig) {
911   // If the loop was versioned with memchecks, add the corresponding no-alias
912   // metadata.
913   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
914     LVer->annotateInstWithNoAlias(To, Orig);
915 }
916 
917 void InnerLoopVectorizer::addMetadata(Instruction *To,
918                                       Instruction *From) {
919   propagateMetadata(To, From);
920   addNewMetadata(To, From);
921 }
922 
923 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
924                                       Instruction *From) {
925   for (Value *V : To) {
926     if (Instruction *I = dyn_cast<Instruction>(V))
927       addMetadata(I, From);
928   }
929 }
930 
931 namespace llvm {
932 
// Loop vectorization cost-model hints describing how the scalar epilogue loop
// should be lowered.
935 enum ScalarEpilogueLowering {
936 
937   // The default: allowing scalar epilogues.
938   CM_ScalarEpilogueAllowed,
939 
940   // Vectorization with OptForSize: don't allow epilogues.
941   CM_ScalarEpilogueNotAllowedOptSize,
942 
  // A special case of vectorization with OptForSize: loops with a very small
944   // trip count are considered for vectorization under OptForSize, thereby
945   // making sure the cost of their loop body is dominant, free of runtime
946   // guards and scalar iteration overheads.
947   CM_ScalarEpilogueNotAllowedLowTripLoop,
948 
949   // Loop hint predicate indicating an epilogue is undesired.
950   CM_ScalarEpilogueNotNeededUsePredicate
951 };
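
// For example, a loop annotated with "#pragma clang loop
// vectorize_predicate(enable)" (or run with the -prefer-predicate-over-epilog
// flag above) is expected to end up with
// CM_ScalarEpilogueNotNeededUsePredicate, so the tail is folded by masking
// instead of emitting a scalar epilogue. This mapping is illustrative; the
// exact plumbing lives in the code that computes the ScalarEpilogueLowering.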
952 
953 /// LoopVectorizationCostModel - estimates the expected speedups due to
954 /// vectorization.
955 /// In many cases vectorization is not profitable. This can happen because of
956 /// a number of reasons. In this class we mainly attempt to predict the
957 /// expected speedup/slowdowns due to the supported instruction set. We use the
958 /// TargetTransformInfo to query the different backends for the cost of
959 /// different operations.
960 class LoopVectorizationCostModel {
961 public:
962   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
963                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
964                              LoopVectorizationLegality *Legal,
965                              const TargetTransformInfo &TTI,
966                              const TargetLibraryInfo *TLI, DemandedBits *DB,
967                              AssumptionCache *AC,
968                              OptimizationRemarkEmitter *ORE, const Function *F,
969                              const LoopVectorizeHints *Hints,
970                              InterleavedAccessInfo &IAI)
971       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
972         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
973         Hints(Hints), InterleaveInfo(IAI) {}
974 
975   /// \return An upper bound for the vectorization factor, or None if
976   /// vectorization and interleaving should be avoided up front.
977   Optional<unsigned> computeMaxVF();
978 
979   /// \return True if runtime checks are required for vectorization, and false
980   /// otherwise.
981   bool runtimeChecksRequired();
982 
983   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
987   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
988 
989   /// Setup cost-based decisions for user vectorization factor.
990   void selectUserVectorizationFactor(unsigned UserVF) {
991     collectUniformsAndScalars(UserVF);
992     collectInstsToScalarize(UserVF);
993   }
994 
995   /// \return The size (in bits) of the smallest and widest types in the code
996   /// that needs to be vectorized. We ignore values that remain scalar such as
997   /// 64 bit loop indices.
998   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
999 
1000   /// \return The desired interleave count.
1001   /// If interleave count has been specified by metadata it will be returned.
1002   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1003   /// are the selected vectorization factor and the cost of the selected VF.
1004   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1005 
  /// A memory access instruction may be vectorized in more than one way. The
  /// form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1013   void setCostBasedWideningDecision(unsigned VF);
1014 
1015   /// A struct that represents some properties of the register usage
1016   /// of a loop.
1017   struct RegisterUsage {
1018     /// Holds the number of loop invariant values that are used in the loop.
1019     /// The key is ClassID of target-provided register class.
1020     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1021     /// Holds the maximum number of concurrent live intervals in the loop.
1022     /// The key is ClassID of target-provided register class.
1023     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1024   };
1025 
1026   /// \return Returns information about the register usages of the loop for the
1027   /// given vectorization factors.
1028   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1029 
1030   /// Collect values we want to ignore in the cost model.
1031   void collectValuesToIgnore();
1032 
1033   /// \returns The smallest bitwidth each instruction can be represented with.
1034   /// The vector equivalents of these instructions should be truncated to this
1035   /// type.
1036   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1037     return MinBWs;
1038   }
1039 
1040   /// \returns True if it is more profitable to scalarize instruction \p I for
1041   /// vectorization factor \p VF.
1042   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1043     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1044 
1045     // Cost model is not run in the VPlan-native path - return conservative
1046     // result until this changes.
1047     if (EnableVPlanNativePath)
1048       return false;
1049 
1050     auto Scalars = InstsToScalarize.find(VF);
1051     assert(Scalars != InstsToScalarize.end() &&
1052            "VF not yet analyzed for scalarization profitability");
1053     return Scalars->second.find(I) != Scalars->second.end();
1054   }
1055 
1056   /// Returns true if \p I is known to be uniform after vectorization.
1057   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1058     if (VF == 1)
1059       return true;
1060 
1061     // Cost model is not run in the VPlan-native path - return conservative
1062     // result until this changes.
1063     if (EnableVPlanNativePath)
1064       return false;
1065 
1066     auto UniformsPerVF = Uniforms.find(VF);
1067     assert(UniformsPerVF != Uniforms.end() &&
1068            "VF not yet analyzed for uniformity");
1069     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1070   }
1071 
1072   /// Returns true if \p I is known to be scalar after vectorization.
1073   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1074     if (VF == 1)
1075       return true;
1076 
1077     // Cost model is not run in the VPlan-native path - return conservative
1078     // result until this changes.
1079     if (EnableVPlanNativePath)
1080       return false;
1081 
1082     auto ScalarsPerVF = Scalars.find(VF);
1083     assert(ScalarsPerVF != Scalars.end() &&
1084            "Scalar values are not calculated for VF");
1085     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1086   }
1087 
1088   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1089   /// for vectorization factor \p VF.
1090   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1091     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1092            !isProfitableToScalarize(I, VF) &&
1093            !isScalarAfterVectorization(I, VF);
1094   }
1095 
1096   /// Decision that was taken during cost calculation for memory instruction.
1097   enum InstWidening {
1098     CM_Unknown,
1099     CM_Widen,         // For consecutive accesses with stride +1.
1100     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1101     CM_Interleave,
1102     CM_GatherScatter,
1103     CM_Scalarize
1104   };
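
  // An illustrative mapping (the real decisions are cost-driven and
  // target-dependent):
  //
  //   for (i = 0; i < n; ++i) {
  //     x = A[i];     // consecutive, stride +1 -> CM_Widen
  //     y = B[n - i]; // consecutive, stride -1 -> CM_Widen_Reverse
  //     z = C[D[i]];  // arbitrary indices      -> CM_GatherScatter (if legal)
  //   }               //                           or CM_Scalarize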
1105 
1106   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1107   /// instruction \p I and vector width \p VF.
1108   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1109                            unsigned Cost) {
1110     assert(VF >= 2 && "Expected VF >=2");
1111     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1112   }
1113 
1114   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1115   /// interleaving group \p Grp and vector width \p VF.
1116   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1117                            InstWidening W, unsigned Cost) {
1118     assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1121     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1122       if (auto *I = Grp->getMember(i)) {
1123         if (Grp->getInsertPos() == I)
1124           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1125         else
1126           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1127       }
1128     }
1129   }
1130 
1131   /// Return the cost model decision for the given instruction \p I and vector
1132   /// width \p VF. Return CM_Unknown if this instruction did not pass
1133   /// through the cost modeling.
1134   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1135     assert(VF >= 2 && "Expected VF >=2");
1136 
1137     // Cost model is not run in the VPlan-native path - return conservative
1138     // result until this changes.
1139     if (EnableVPlanNativePath)
1140       return CM_GatherScatter;
1141 
1142     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1143     auto Itr = WideningDecisions.find(InstOnVF);
1144     if (Itr == WideningDecisions.end())
1145       return CM_Unknown;
1146     return Itr->second.first;
1147   }
1148 
1149   /// Return the vectorization cost for the given instruction \p I and vector
1150   /// width \p VF.
1151   unsigned getWideningCost(Instruction *I, unsigned VF) {
1152     assert(VF >= 2 && "Expected VF >=2");
1153     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1154     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1155            "The cost is not calculated");
1156     return WideningDecisions[InstOnVF].second;
1157   }
1158 
1159   /// Return True if instruction \p I is an optimizable truncate whose operand
1160   /// is an induction variable. Such a truncate will be removed by adding a new
1161   /// induction variable with the destination type.
1162   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1163     // If the instruction is not a truncate, return false.
1164     auto *Trunc = dyn_cast<TruncInst>(I);
1165     if (!Trunc)
1166       return false;
1167 
1168     // Get the source and destination types of the truncate.
1169     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1170     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1171 
1172     // If the truncate is free for the given types, return false. Replacing a
1173     // free truncate with an induction variable would add an induction variable
1174     // update instruction to each iteration of the loop. We exclude from this
1175     // check the primary induction variable since it will need an update
1176     // instruction regardless.
1177     Value *Op = Trunc->getOperand(0);
1178     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1179       return false;
1180 
1181     // If the truncated value is not an induction variable, return false.
1182     return Legal->isInductionPhi(Op);
1183   }
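
  // For example, with a non-primary i64 induction %iv whose truncation is not
  // free on the target, the truncate below is optimizable: a new i32 induction
  // with the destination type can replace it.
  //
  //   %iv32 = trunc i64 %iv to i32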
1184 
1185   /// Collects the instructions to scalarize for each predicated instruction in
1186   /// the loop.
1187   void collectInstsToScalarize(unsigned VF);
1188 
1189   /// Collect Uniform and Scalar values for the given \p VF.
1190   /// The sets depend on CM decision for Load/Store instructions
1191   /// that may be vectorized as interleave, gather-scatter or scalarized.
1192   void collectUniformsAndScalars(unsigned VF) {
1193     // Do the analysis once.
1194     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1195       return;
1196     setCostBasedWideningDecision(VF);
1197     collectLoopUniforms(VF);
1198     collectLoopScalars(VF);
1199   }
1200 
1201   /// Returns true if the target machine supports masked store operation
1202   /// for the given \p DataType and kind of access to \p Ptr.
1203   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1204     return Legal->isConsecutivePtr(Ptr) &&
1205            TTI.isLegalMaskedStore(DataType, Alignment);
1206   }
1207 
1208   /// Returns true if the target machine supports masked load operation
1209   /// for the given \p DataType and kind of access to \p Ptr.
1210   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1211     return Legal->isConsecutivePtr(Ptr) &&
1212            TTI.isLegalMaskedLoad(DataType, Alignment);
1213   }
1214 
1215   /// Returns true if the target machine supports masked scatter operation
1216   /// for the given \p DataType.
1217   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1218     return TTI.isLegalMaskedScatter(DataType, Alignment);
1219   }
1220 
1221   /// Returns true if the target machine supports masked gather operation
1222   /// for the given \p DataType.
1223   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1224     return TTI.isLegalMaskedGather(DataType, Alignment);
1225   }
1226 
1227   /// Returns true if the target machine can represent \p V as a masked gather
1228   /// or scatter operation.
1229   bool isLegalGatherOrScatter(Value *V) {
1230     bool LI = isa<LoadInst>(V);
1231     bool SI = isa<StoreInst>(V);
1232     if (!LI && !SI)
1233       return false;
1234     auto *Ty = getMemInstValueType(V);
1235     MaybeAlign Align = getLoadStoreAlignment(V);
1236     return (LI && isLegalMaskedGather(Ty, Align)) ||
1237            (SI && isLegalMaskedScatter(Ty, Align));
1238   }
1239 
1240   /// Returns true if \p I is an instruction that will be scalarized with
1241   /// predication. Such instructions include conditional stores and
1242   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1245   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1246 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or a masked load/store or a masked
  /// gather/scatter. This is a superset of the instructions for which
  /// isScalarWithPredication returns true.
1250   bool isPredicatedInst(Instruction *I) {
1251     if (!blockNeedsPredication(I->getParent()))
1252       return false;
1253     // Loads and stores that need some form of masked operation are predicated
1254     // instructions.
1255     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1256       return Legal->isMaskRequired(I);
1257     return isScalarWithPredication(I);
1258   }
1259 
1260   /// Returns true if \p I is a memory instruction with consecutive memory
1261   /// access that can be widened.
1262   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1263 
1264   /// Returns true if \p I is a memory instruction in an interleaved-group
1265   /// of memory accesses that can be vectorized with wide vector loads/stores
1266   /// and shuffles.
1267   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1268 
1269   /// Check if \p Instr belongs to any interleaved access group.
1270   bool isAccessInterleaved(Instruction *Instr) {
1271     return InterleaveInfo.isInterleaved(Instr);
1272   }
1273 
1274   /// Get the interleaved access group that \p Instr belongs to.
1275   const InterleaveGroup<Instruction> *
1276   getInterleavedAccessGroup(Instruction *Instr) {
1277     return InterleaveInfo.getInterleaveGroup(Instr);
1278   }
1279 
1280   /// Returns true if an interleaved group requires a scalar iteration
1281   /// to handle accesses with gaps, and there is nothing preventing us from
1282   /// creating a scalar epilogue.
1283   bool requiresScalarEpilogue() const {
1284     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1285   }
1286 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1289   bool isScalarEpilogueAllowed() const {
1290     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1291   }
1292 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1294   bool foldTailByMasking() const { return FoldTailByMasking; }
1295 
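  /// Returns true if the instructions in \p BB must be executed under a mask
  /// in the vectorized loop, either because the tail is folded by masking or
  /// because \p BB is conditionally executed in the original loop.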
1296   bool blockNeedsPredication(BasicBlock *BB) {
1297     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1298   }
1299 
1300   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1301   /// with factor VF.  Return the cost of the instruction, including
1302   /// scalarization overhead if it's needed.
1303   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1304 
1305   /// Estimate cost of a call instruction CI if it were vectorized with factor
1306   /// VF. Return the cost of the instruction, including scalarization overhead
1307   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1308   /// scalarized -
1309   /// i.e. either vector version isn't available, or is too expensive.
1310   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1311 
1312 private:
1313   unsigned NumPredStores = 0;
1314 
1315   /// \return An upper bound for the vectorization factor, larger than zero.
1316   /// One is returned if vectorization should best be avoided due to cost.
1317   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1318 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1326   using VectorizationCostTy = std::pair<unsigned, bool>;
1327 
1328   /// Returns the expected execution cost. The unit of the cost does
1329   /// not matter because we use the 'cost' units to compare different
1330   /// vector widths. The cost that is returned is *not* normalized by
1331   /// the factor width.
1332   VectorizationCostTy expectedCost(unsigned VF);
1333 
1334   /// Returns the execution time cost of an instruction for a given vector
1335   /// width. Vector width of one means scalar.
1336   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1337 
1338   /// The cost-computation logic from getInstructionCost which provides
1339   /// the vector type as an output parameter.
1340   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1341 
1342   /// Calculate vectorization cost of memory instruction \p I.
1343   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1344 
1345   /// The cost computation for scalarized memory instruction.
1346   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1347 
1348   /// The cost computation for interleaving group of memory instructions.
1349   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1350 
1351   /// The cost computation for Gather/Scatter instruction.
1352   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1353 
1354   /// The cost computation for widening instruction \p I with consecutive
1355   /// memory access.
1356   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1357 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of the
  /// last element).
1362   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1363 
1364   /// Estimate the overhead of scalarizing an instruction. This is a
1365   /// convenience wrapper for the type-based getScalarizationOverhead API.
1366   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1367 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1370   bool isConsecutiveLoadOrStore(Instruction *I);
1371 
1372   /// Returns true if an artificially high cost for emulated masked memrefs
1373   /// should be used.
1374   bool useEmulatedMaskMemRefHack(Instruction *I);
1375 
1376   /// Map of scalar integer values to the smallest bitwidth they can be legally
1377   /// represented as. The vector equivalents of these values should be truncated
1378   /// to this type.
1379   MapVector<Instruction *, uint64_t> MinBWs;
1380 
1381   /// A type representing the costs for instructions if they were to be
1382   /// scalarized rather than vectorized. The entries are Instruction-Cost
1383   /// pairs.
1384   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1385 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1388   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1389 
1390   /// Records whether it is allowed to have the original scalar loop execute at
1391   /// least once. This may be needed as a fallback loop in case runtime
1392   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not a multiple of the VF,
1394   /// or as a peel-loop to handle gaps in interleave-groups.
1395   /// Under optsize and when the trip count is very small we don't allow any
1396   /// iterations to execute in the scalar loop.
1397   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1398 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1400   bool FoldTailByMasking = false;
1401 
1402   /// A map holding scalar costs for different vectorization factors. The
1403   /// presence of a cost for an instruction in the mapping indicates that the
1404   /// instruction will be scalarized when vectorizing with the associated
1405   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1406   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1407 
1408   /// Holds the instructions known to be uniform after vectorization.
1409   /// The data is collected per VF.
1410   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1411 
1412   /// Holds the instructions known to be scalar after vectorization.
1413   /// The data is collected per VF.
1414   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1415 
1416   /// Holds the instructions (address computations) that are forced to be
1417   /// scalarized.
1418   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1419 
1420   /// Returns the expected difference in cost from scalarizing the expression
1421   /// feeding a predicated instruction \p PredInst. The instructions to
1422   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1423   /// non-negative return value implies the expression will be scalarized.
1424   /// Currently, only single-use chains are considered for scalarization.
1425   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1426                               unsigned VF);
1427 
1428   /// Collect the instructions that are uniform after vectorization. An
1429   /// instruction is uniform if we represent it with a single scalar value in
1430   /// the vectorized loop corresponding to each vector iteration. Examples of
1431   /// uniform instructions include pointer operands of consecutive or
1432   /// interleaved memory accesses. Note that although uniformity implies an
1433   /// instruction will be scalar, the reverse is not true. In general, a
1434   /// scalarized instruction will be represented by VF scalar values in the
1435   /// vectorized loop, each corresponding to an iteration of the original
1436   /// scalar loop.
1437   void collectLoopUniforms(unsigned VF);
1438 
1439   /// Collect the instructions that are scalar after vectorization. An
1440   /// instruction is scalar if it is known to be uniform or will be scalarized
1441   /// during vectorization. Non-uniform scalarized instructions will be
1442   /// represented by VF values in the vectorized loop, each corresponding to an
1443   /// iteration of the original scalar loop.
1444   void collectLoopScalars(unsigned VF);
1445 
1446   /// Keeps cost model vectorization decision and cost for instructions.
1447   /// Right now it is used for memory instructions only.
1448   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1449                                 std::pair<InstWidening, unsigned>>;
1450 
1451   DecisionList WideningDecisions;
1452 
1453   /// Returns true if \p V is expected to be vectorized and it needs to be
1454   /// extracted.
1455   bool needsExtract(Value *V, unsigned VF) const {
1456     Instruction *I = dyn_cast<Instruction>(V);
1457     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1458       return false;
1459 
1460     // Assume we can vectorize V (and hence we need extraction) if the
1461     // scalars are not computed yet. This can happen, because it is called
1462     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1463     // the scalars are collected. That should be a safe assumption in most
1464     // cases, because we check if the operands have vectorizable types
1465     // beforehand in LoopVectorizationLegality.
1466     return Scalars.find(VF) == Scalars.end() ||
1467            !isScalarAfterVectorization(I, VF);
1468   };
1469 
1470   /// Returns a range containing only operands needing to be extracted.
1471   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1472                                                    unsigned VF) {
1473     return SmallVector<Value *, 4>(make_filter_range(
1474         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1475   }
1476 
1477 public:
1478   /// The loop that we evaluate.
1479   Loop *TheLoop;
1480 
1481   /// Predicated scalar evolution analysis.
1482   PredicatedScalarEvolution &PSE;
1483 
1484   /// Loop Info analysis.
1485   LoopInfo *LI;
1486 
1487   /// Vectorization legality.
1488   LoopVectorizationLegality *Legal;
1489 
1490   /// Vector target information.
1491   const TargetTransformInfo &TTI;
1492 
1493   /// Target Library Info.
1494   const TargetLibraryInfo *TLI;
1495 
1496   /// Demanded bits analysis.
1497   DemandedBits *DB;
1498 
1499   /// Assumption cache.
1500   AssumptionCache *AC;
1501 
1502   /// Interface to emit optimization remarks.
1503   OptimizationRemarkEmitter *ORE;
1504 
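  /// The function containing the loop being evaluated.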
1505   const Function *TheFunction;
1506 
1507   /// Loop Vectorize Hint.
1508   const LoopVectorizeHints *Hints;
1509 
  /// The interleaved access information contains groups of interleaved
  /// accesses with the same stride that are close to each other.
1512   InterleavedAccessInfo &InterleaveInfo;
1513 
1514   /// Values to ignore in the cost model.
1515   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1516 
1517   /// Values to ignore in the cost model when VF > 1.
1518   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1519 };
1520 
1521 } // end namespace llvm
1522 
1523 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1524 // vectorization. The loop needs to be annotated with #pragma omp simd
1525 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1526 // vector length information is not provided, vectorization is not considered
1527 // explicit. Interleave hints are not allowed either. These limitations will be
1528 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1530 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1531 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1532 // provides *explicit vectorization hints* (LV can bypass legal checks and
1533 // assume that vectorization is legal). However, both hints are implemented
1534 // using the same metadata (llvm.loop.vectorize, processed by
1535 // LoopVectorizeHints). This will be fixed in the future when the native IR
1536 // representation for pragma 'omp simd' is introduced.
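// E.g. an outer loop annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       ...
// carries an explicit vectorization request with VF = 4; the corresponding
// '#pragma omp simd simdlen(4)' form maps to the same metadata, as noted
// above.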
1537 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1538                                    OptimizationRemarkEmitter *ORE) {
1539   assert(!OuterLp->empty() && "This is not an outer loop");
1540   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1541 
1542   // Only outer loops with an explicit vectorization hint are supported.
1543   // Unannotated outer loops are ignored.
1544   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1545     return false;
1546 
1547   Function *Fn = OuterLp->getHeader()->getParent();
1548   if (!Hints.allowVectorization(Fn, OuterLp,
1549                                 true /*VectorizeOnlyWhenForced*/)) {
1550     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1551     return false;
1552   }
1553 
1554   if (Hints.getInterleave() > 1) {
1555     // TODO: Interleave support is future work.
1556     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1557                          "outer loops.\n");
1558     Hints.emitRemarkWithHints();
1559     return false;
1560   }
1561 
1562   return true;
1563 }
1564 
1565 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1566                                   OptimizationRemarkEmitter *ORE,
1567                                   SmallVectorImpl<Loop *> &V) {
1568   // Collect inner loops and outer loops without irreducible control flow. For
1569   // now, only collect outer loops that have explicit vectorization hints. If we
1570   // are stress testing the VPlan H-CFG construction, we collect the outermost
1571   // loop of every loop nest.
1572   if (L.empty() || VPlanBuildStressTest ||
1573       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1574     LoopBlocksRPO RPOT(&L);
1575     RPOT.perform(LI);
1576     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1577       V.push_back(&L);
1578       // TODO: Collect inner loops inside marked outer loops in case
1579       // vectorization fails for the outer loop. Do not invoke
1580       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1581       // already known to be reducible. We can use an inherited attribute for
1582       // that.
1583       return;
1584     }
1585   }
1586   for (Loop *InnerL : L)
1587     collectSupportedLoops(*InnerL, LI, ORE, V);
1588 }
1589 
1590 namespace {
1591 
1592 /// The LoopVectorize Pass.
1593 struct LoopVectorize : public FunctionPass {
1594   /// Pass identification, replacement for typeid
1595   static char ID;
1596 
1597   LoopVectorizePass Impl;
1598 
1599   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1600                          bool VectorizeOnlyWhenForced = false)
1601       : FunctionPass(ID),
1602         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1603     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1604   }
1605 
1606   bool runOnFunction(Function &F) override {
1607     if (skipFunction(F))
1608       return false;
1609 
1610     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1611     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1612     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1613     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1614     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1615     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1616     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1617     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1618     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1619     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1620     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1621     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1622     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1623 
1624     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1625         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1626 
1627     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1628                         GetLAA, *ORE, PSI);
1629   }
1630 
1631   void getAnalysisUsage(AnalysisUsage &AU) const override {
1632     AU.addRequired<AssumptionCacheTracker>();
1633     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1634     AU.addRequired<DominatorTreeWrapperPass>();
1635     AU.addRequired<LoopInfoWrapperPass>();
1636     AU.addRequired<ScalarEvolutionWrapperPass>();
1637     AU.addRequired<TargetTransformInfoWrapperPass>();
1638     AU.addRequired<AAResultsWrapperPass>();
1639     AU.addRequired<LoopAccessLegacyAnalysis>();
1640     AU.addRequired<DemandedBitsWrapperPass>();
1641     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1642     AU.addRequired<InjectTLIMappingsLegacy>();
1643 
1644     // We currently do not preserve loopinfo/dominator analyses with outer loop
1645     // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
1647     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1648     if (!EnableVPlanNativePath) {
1649       AU.addPreserved<LoopInfoWrapperPass>();
1650       AU.addPreserved<DominatorTreeWrapperPass>();
1651     }
1652 
1653     AU.addPreserved<BasicAAWrapperPass>();
1654     AU.addPreserved<GlobalsAAWrapperPass>();
1655     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1656   }
1657 };
1658 
1659 } // end anonymous namespace
1660 
1661 //===----------------------------------------------------------------------===//
1662 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1663 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1664 //===----------------------------------------------------------------------===//
1665 
1666 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1670   Instruction *Instr = dyn_cast<Instruction>(V);
1671   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1672                      (!Instr ||
1673                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1674   // Place the code for broadcasting invariant variables in the new preheader.
1675   IRBuilder<>::InsertPointGuard Guard(Builder);
1676   if (SafeToHoist)
1677     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1678 
1679   // Broadcast the scalar into all locations in the vector.
1680   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1681 
1682   return Shuf;
1683 }
1684 
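// E.g. for an integer IV with start %start, step %s, VF = 4 and UF = 1, this
// emits (roughly):
//   vector.ph:
//     %stepped.start = <%start, %start+%s, %start+2*%s, %start+3*%s>
//   vector.body:
//     %vec.ind = phi [ %stepped.start, vector.ph ], [ %vec.ind.next, latch ]
//     %vec.ind.next = %vec.ind + <4*%s, 4*%s, 4*%s, 4*%s>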
1685 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1686     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1687   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1688          "Expected either an induction phi-node or a truncate of it!");
1689   Value *Start = II.getStartValue();
1690 
1691   // Construct the initial value of the vector IV in the vector loop preheader
1692   auto CurrIP = Builder.saveIP();
1693   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1694   if (isa<TruncInst>(EntryVal)) {
1695     assert(Start->getType()->isIntegerTy() &&
1696            "Truncation requires an integer type");
1697     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1698     Step = Builder.CreateTrunc(Step, TruncType);
1699     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1700   }
1701   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1702   Value *SteppedStart =
1703       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1704 
1705   // We create vector phi nodes for both integer and floating-point induction
1706   // variables. Here, we determine the kind of arithmetic we will perform.
1707   Instruction::BinaryOps AddOp;
1708   Instruction::BinaryOps MulOp;
1709   if (Step->getType()->isIntegerTy()) {
1710     AddOp = Instruction::Add;
1711     MulOp = Instruction::Mul;
1712   } else {
1713     AddOp = II.getInductionOpcode();
1714     MulOp = Instruction::FMul;
1715   }
1716 
1717   // Multiply the vectorization factor by the step using integer or
1718   // floating-point arithmetic as appropriate.
1719   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1720   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1721 
1722   // Create a vector splat to use in the induction update.
1723   //
1724   // FIXME: If the step is non-constant, we create the vector splat with
1725   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1726   //        handle a constant vector splat.
1727   Value *SplatVF =
1728       isa<Constant>(Mul)
1729           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1730           : Builder.CreateVectorSplat(VF, Mul);
1731   Builder.restoreIP(CurrIP);
1732 
1733   // We may need to add the step a number of times, depending on the unroll
1734   // factor. The last of those goes into the PHI.
1735   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1736                                     &*LoopVectorBody->getFirstInsertionPt());
1737   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1738   Instruction *LastInduction = VecInd;
1739   for (unsigned Part = 0; Part < UF; ++Part) {
1740     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1741 
1742     if (isa<TruncInst>(EntryVal))
1743       addMetadata(LastInduction, EntryVal);
1744     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1745 
1746     LastInduction = cast<Instruction>(addFastMathFlag(
1747         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1748     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1749   }
1750 
1751   // Move the last step to the end of the latch block. This ensures consistent
1752   // placement of all induction updates.
1753   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1754   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1755   auto *ICmp = cast<Instruction>(Br->getCondition());
1756   LastInduction->moveBefore(ICmp);
1757   LastInduction->setName("vec.ind.next");
1758 
1759   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1760   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1761 }
1762 
1763 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1764   return Cost->isScalarAfterVectorization(I, VF) ||
1765          Cost->isProfitableToScalarize(I, VF);
1766 }
1767 
1768 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1769   if (shouldScalarizeInstruction(IV))
1770     return true;
1771   auto isScalarInst = [&](User *U) -> bool {
1772     auto *I = cast<Instruction>(U);
1773     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1774   };
1775   return llvm::any_of(IV->users(), isScalarInst);
1776 }
1777 
1778 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1779     const InductionDescriptor &ID, const Instruction *EntryVal,
1780     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1781   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1782          "Expected either an induction phi-node or a truncate of it!");
1783 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // reuses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1790   if (isa<TruncInst>(EntryVal))
1791     return;
1792 
1793   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1794   if (Casts.empty())
1795     return;
1796   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any) have no uses outside the
1798   // induction update chain itself.
1799   Instruction *CastInst = *Casts.begin();
1800   if (Lane < UINT_MAX)
1801     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1802   else
1803     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1804 }
1805 
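// Widen the integer or floating-point induction variable \p IV. Depending on
// whether a vector and/or scalar form of the IV is needed, this creates a
// vector IV phi (createVectorIntOrFpInductionPHI), splats the scalar IV,
// and/or builds per-lane scalar steps (buildScalarSteps). \p Trunc, if
// non-null, is an optimizable truncate of the IV whose destination type is
// used for the widened induction.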
1806 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1807   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1808          "Primary induction variable must have an integer type");
1809 
1810   auto II = Legal->getInductionVars().find(IV);
1811   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1812 
1813   auto ID = II->second;
1814   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1815 
1816   // The value from the original loop to which we are mapping the new induction
1817   // variable.
1818   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1819 
1820   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1821 
1822   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1824   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1825     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1826            "Induction step should be loop invariant");
1827     if (PSE.getSE()->isSCEVable(IV->getType())) {
1828       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1829       return Exp.expandCodeFor(Step, Step->getType(),
1830                                LoopVectorPreHeader->getTerminator());
1831     }
1832     return cast<SCEVUnknown>(Step)->getValue();
1833   };
1834 
1835   // The scalar value to broadcast. This is derived from the canonical
1836   // induction variable. If a truncation type is given, truncate the canonical
1837   // induction variable and step. Otherwise, derive these values from the
1838   // induction descriptor.
1839   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1840     Value *ScalarIV = Induction;
1841     if (IV != OldInduction) {
1842       ScalarIV = IV->getType()->isIntegerTy()
1843                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1844                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1845                                           IV->getType());
1846       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1847       ScalarIV->setName("offset.idx");
1848     }
1849     if (Trunc) {
1850       auto *TruncType = cast<IntegerType>(Trunc->getType());
1851       assert(Step->getType()->isIntegerTy() &&
1852              "Truncation requires an integer step");
1853       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1854       Step = Builder.CreateTrunc(Step, TruncType);
1855     }
1856     return ScalarIV;
1857   };
1858 
  // Create the vector values from the scalar IV, for the case in which no
  // vector IV is created.
1861   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1862     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1863     for (unsigned Part = 0; Part < UF; ++Part) {
1864       Value *EntryPart =
1865           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1866       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1867       if (Trunc)
1868         addMetadata(EntryPart, Trunc);
1869       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1870     }
1871   };
1872 
1873   // Now do the actual transformations, and start with creating the step value.
1874   Value *Step = CreateStepValue(ID.getStep());
1875   if (VF <= 1) {
1876     Value *ScalarIV = CreateScalarIV(Step);
1877     CreateSplatIV(ScalarIV, Step);
1878     return;
1879   }
1880 
1881   // Determine if we want a scalar version of the induction variable. This is
1882   // true if the induction variable itself is not widened, or if it has at
1883   // least one user in the loop that is not widened.
1884   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1885   if (!NeedsScalarIV) {
1886     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1887     return;
1888   }
1889 
1890   // Try to create a new independent vector induction variable. If we can't
1891   // create the phi node, we will splat the scalar induction variable in each
1892   // loop iteration.
1893   if (!shouldScalarizeInstruction(EntryVal)) {
1894     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1895     Value *ScalarIV = CreateScalarIV(Step);
1896     // Create scalar steps that can be used by instructions we will later
1897     // scalarize. Note that the addition of the scalar steps will not increase
1898     // the number of instructions in the loop in the common case prior to
1899     // InstCombine. We will be trading one vector extract for each scalar step.
1900     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1901     return;
1902   }
1903 
1904   // If we haven't yet vectorized the induction variable, splat the scalar
1905   // induction variable, and build the necessary step vectors.
1906   // TODO: Don't do it unless the vectorized IV is really required.
1907   Value *ScalarIV = CreateScalarIV(Step);
1908   CreateSplatIV(ScalarIV, Step);
1909   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1910 }
1911 
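// Produce a vector whose lane i is Val[i] BinOp (StartIdx + i) * Step. E.g.
// for an integer splat Val = <%v, %v, %v, %v>, StartIdx = 0 and step %s, the
// result is <%v, %v + %s, %v + 2*%s, %v + 3*%s>.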
1912 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1913                                           Instruction::BinaryOps BinOp) {
1914   // Create and check the types.
1915   auto *ValVTy = cast<VectorType>(Val->getType());
1916   int VLen = ValVTy->getNumElements();
1917 
1918   Type *STy = Val->getType()->getScalarType();
1919   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1920          "Induction Step must be an integer or FP");
1921   assert(Step->getType() == STy && "Step has wrong type");
1922 
1923   SmallVector<Constant *, 8> Indices;
1924 
1925   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
1927     for (int i = 0; i < VLen; ++i)
1928       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1929 
1930     // Add the consecutive indices to the vector value.
1931     Constant *Cv = ConstantVector::get(Indices);
1932     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1933     Step = Builder.CreateVectorSplat(VLen, Step);
1934     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be copied from the original scalar operations.
1937     Step = Builder.CreateMul(Cv, Step);
1938     return Builder.CreateAdd(Val, Step, "induction");
1939   }
1940 
1941   // Floating point induction.
1942   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1943          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
1945   for (int i = 0; i < VLen; ++i)
1946     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1947 
1948   // Add the consecutive indices to the vector value.
1949   Constant *Cv = ConstantVector::get(Indices);
1950 
1951   Step = Builder.CreateVectorSplat(VLen, Step);
1952 
1953   // Floating point operations had to be 'fast' to enable the induction.
1954   FastMathFlags Flags;
1955   Flags.setFast();
1956 
1957   Value *MulOp = Builder.CreateFMul(Cv, Step);
1958   if (isa<Instruction>(MulOp))
1959     // Have to check, MulOp may be a constant
1960     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1961 
1962   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1963   if (isa<Instruction>(BOp))
1964     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1965   return BOp;
1966 }
1967 
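// Create the per-lane scalar induction values for instructions that will be
// scalarized: for each unroll part and required lane, this computes
// ScalarIV + (VF * Part + Lane) * Step (using the FP induction opcode for
// floating-point IVs) and records the result in VectorLoopValueMap.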
1968 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1969                                            Instruction *EntryVal,
1970                                            const InductionDescriptor &ID) {
1971   // We shouldn't have to build scalar steps if we aren't vectorizing.
1972   assert(VF > 1 && "VF should be greater than one");
1973 
  // Get the value type and ensure it and the step have the same type.
1975   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1976   assert(ScalarIVTy == Step->getType() &&
1977          "Val and Step should have the same type");
1978 
1979   // We build scalar steps for both integer and floating-point induction
1980   // variables. Here, we determine the kind of arithmetic we will perform.
1981   Instruction::BinaryOps AddOp;
1982   Instruction::BinaryOps MulOp;
1983   if (ScalarIVTy->isIntegerTy()) {
1984     AddOp = Instruction::Add;
1985     MulOp = Instruction::Mul;
1986   } else {
1987     AddOp = ID.getInductionOpcode();
1988     MulOp = Instruction::FMul;
1989   }
1990 
1991   // Determine the number of scalars we need to generate for each unroll
1992   // iteration. If EntryVal is uniform, we only need to generate the first
1993   // lane. Otherwise, we generate all VF values.
1994   unsigned Lanes =
1995       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1996                                                                          : VF;
1997   // Compute the scalar steps and save the results in VectorLoopValueMap.
1998   for (unsigned Part = 0; Part < UF; ++Part) {
1999     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2000       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2001       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2002       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2003       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2004       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2005     }
2006   }
2007 }
2008 
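// Return the vector value of \p V for unroll part \p Part. If \p V has only
// been scalarized, the vector is built on demand: uniform values are
// broadcast from their lane-zero scalar, and non-uniform values are packed
// from their per-lane scalars with insertelement. Loop-invariant or unknown
// values are simply broadcast.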
2009 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2010   assert(V != Induction && "The new induction variable should not be used.");
2011   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2012   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2013 
2014   // If we have a stride that is replaced by one, do it here. Defer this for
2015   // the VPlan-native path until we start running Legal checks in that path.
2016   if (!EnableVPlanNativePath && Legal->hasStride(V))
2017     V = ConstantInt::get(V->getType(), 1);
2018 
2019   // If we have a vector mapped to this value, return it.
2020   if (VectorLoopValueMap.hasVectorValue(V, Part))
2021     return VectorLoopValueMap.getVectorValue(V, Part);
2022 
2023   // If the value has not been vectorized, check if it has been scalarized
2024   // instead. If it has been scalarized, and we actually need the value in
2025   // vector form, we will construct the vector values on demand.
2026   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2027     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2028 
2029     // If we've scalarized a value, that value should be an instruction.
2030     auto *I = cast<Instruction>(V);
2031 
2032     // If we aren't vectorizing, we can just copy the scalar map values over to
2033     // the vector map.
2034     if (VF == 1) {
2035       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2036       return ScalarValue;
2037     }
2038 
2039     // Get the last scalar instruction we generated for V and Part. If the value
2040     // is known to be uniform after vectorization, this corresponds to lane zero
2041     // of the Part unroll iteration. Otherwise, the last instruction is the one
2042     // we created for the last vector lane of the Part unroll iteration.
2043     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2044     auto *LastInst = cast<Instruction>(
2045         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2046 
2047     // Set the insert point after the last scalarized instruction. This ensures
2048     // the insertelement sequence will directly follow the scalar definitions.
2049     auto OldIP = Builder.saveIP();
2050     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2051     Builder.SetInsertPoint(&*NewIP);
2052 
2053     // However, if we are vectorizing, we need to construct the vector values.
2054     // If the value is known to be uniform after vectorization, we can just
2055     // broadcast the scalar value corresponding to lane zero for each unroll
2056     // iteration. Otherwise, we construct the vector values using insertelement
2057     // instructions. Since the resulting vectors are stored in
2058     // VectorLoopValueMap, we will only generate the insertelements once.
2059     Value *VectorValue = nullptr;
2060     if (Cost->isUniformAfterVectorization(I, VF)) {
2061       VectorValue = getBroadcastInstrs(ScalarValue);
2062       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2063     } else {
2064       // Initialize packing with insertelements to start from undef.
2065       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2066       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2067       for (unsigned Lane = 0; Lane < VF; ++Lane)
2068         packScalarIntoVectorValue(V, {Part, Lane});
2069       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2070     }
2071     Builder.restoreIP(OldIP);
2072     return VectorValue;
2073   }
2074 
2075   // If this scalar is unknown, assume that it is a constant or that it is
2076   // loop invariant. Broadcast V and save the value for future uses.
2077   Value *B = getBroadcastInstrs(V);
2078   VectorLoopValueMap.setVectorValue(V, Part, B);
2079   return B;
2080 }
2081 
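// Return the scalar value of \p V for the given unroll part and lane,
// extracting it from the corresponding vector value if \p V has only been
// vectorized. Loop-invariant values are returned unchanged.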
2082 Value *
2083 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2084                                             const VPIteration &Instance) {
2085   // If the value is not an instruction contained in the loop, it should
2086   // already be scalar.
2087   if (OrigLoop->isLoopInvariant(V))
2088     return V;
2089 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2093 
2094   // If the value from the original loop has not been vectorized, it is
2095   // represented by UF x VF scalar values in the new loop. Return the requested
2096   // scalar value.
2097   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2098     return VectorLoopValueMap.getScalarValue(V, Instance);
2099 
2100   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2101   // for the given unroll part. If this entry is not a vector type (i.e., the
2102   // vectorization factor is one), there is no need to generate an
2103   // extractelement instruction.
2104   auto *U = getOrCreateVectorValue(V, Instance.Part);
2105   if (!U->getType()->isVectorTy()) {
2106     assert(VF == 1 && "Value not scalarized has non-vector type");
2107     return U;
2108   }
2109 
2110   // Otherwise, the value from the original loop has been vectorized and is
2111   // represented by UF vector values. Extract and return the requested scalar
2112   // value from the appropriate vector lane.
2113   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2114 }
2115 
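// Insert the scalar value recorded for (Part, Lane) of \p V into the vector
// value of that unroll part and update the entry in VectorLoopValueMap.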
2116 void InnerLoopVectorizer::packScalarIntoVectorValue(
2117     Value *V, const VPIteration &Instance) {
2118   assert(V != Induction && "The new induction variable should not be used.");
2119   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2120   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2121 
2122   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2123   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2124   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2125                                             Builder.getInt32(Instance.Lane));
2126   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2127 }
2128 
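// Create a shuffle that reverses the lanes of \p Vec, e.g. <a, b, c, d>
// becomes <d, c, b, a> for VF = 4.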
2129 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2130   assert(Vec->getType()->isVectorTy() && "Invalid type");
2131   SmallVector<int, 8> ShuffleMask;
2132   for (unsigned i = 0; i < VF; ++i)
2133     ShuffleMask.push_back(VF - i - 1);
2134 
2135   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2136                                      ShuffleMask, "reverse");
2137 }
2138 
2139 // Return whether we allow using masked interleave-groups (for dealing with
2140 // strided loads/stores that reside in predicated blocks, or for dealing
2141 // with gaps).
2142 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2143   // If an override option has been passed in for interleaved accesses, use it.
2144   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2145     return EnableMaskedInterleavedMemAccesses;
2146 
2147   return TTI.enableMaskedInterleavedAccessVectorization();
2148 }
2149 
2150 // Try to vectorize the interleave group that \p Instr belongs to.
2151 //
2152 // E.g. Translate following interleaved load group (factor = 3):
2153 //   for (i = 0; i < N; i+=3) {
2154 //     R = Pic[i];             // Member of index 0
2155 //     G = Pic[i+1];           // Member of index 1
2156 //     B = Pic[i+2];           // Member of index 2
2157 //     ... // do something to R, G, B
2158 //   }
2159 // To:
2160 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2161 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2162 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2163 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2164 //
2165 // Or translate following interleaved store group (factor = 3):
2166 //   for (i = 0; i < N; i+=3) {
2167 //     ... do something to R, G, B
2168 //     Pic[i]   = R;           // Member of index 0
2169 //     Pic[i+1] = G;           // Member of index 1
2170 //     Pic[i+2] = B;           // Member of index 2
2171 //   }
2172 // To:
2173 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2174 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2175 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2176 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2177 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2178 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2179     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2180     VPValue *Addr, VPValue *BlockInMask) {
2181   Instruction *Instr = Group->getInsertPos();
2182   const DataLayout &DL = Instr->getModule()->getDataLayout();
2183 
2184   // Prepare for the vector type of the interleaved load/store.
2185   Type *ScalarTy = getMemInstValueType(Instr);
2186   unsigned InterleaveFactor = Group->getFactor();
2187   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2188 
2189   // Prepare for the new pointers.
2190   SmallVector<Value *, 2> AddrParts;
2191   unsigned Index = Group->getIndex(Instr);
2192 
2193   // TODO: extend the masked interleaved-group support to reversed access.
2194   assert((!BlockInMask || !Group->isReverse()) &&
2195          "Reversed masked interleave-group not supported.");
2196 
2197   // If the group is reverse, adjust the index to refer to the last vector lane
2198   // instead of the first. We adjust the index from the first vector lane,
2199   // rather than directly getting the pointer for lane VF - 1, because the
2200   // pointer operand of the interleaved access is supposed to be uniform. For
2201   // uniform instructions, we're only required to generate a value for the
2202   // first vector lane in each unroll iteration.
2203   if (Group->isReverse())
2204     Index += (VF - 1) * Group->getFactor();
2205 
2206   for (unsigned Part = 0; Part < UF; Part++) {
2207     Value *AddrPart = State.get(Addr, {Part, 0});
2208     setDebugLocFromInst(Builder, AddrPart);
2209 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address to that of the member at index 0.
2212     //
2213     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2214     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2216     //
2217     // E.g.  A[i+1] = a;     // Member of index 1
2218     //       A[i]   = b;     // Member of index 0
2219     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2221 
2222     bool InBounds = false;
2223     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2224       InBounds = gep->isInBounds();
2225     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2226     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2227 
2228     // Cast to the vector pointer type.
2229     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2230     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2231     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2232   }
2233 
2234   setDebugLocFromInst(Builder, Instr);
2235   Value *UndefVec = UndefValue::get(VecTy);
2236 
2237   Value *MaskForGaps = nullptr;
2238   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2239     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2240     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2241   }
2242 
2243   // Vectorize the interleaved load group.
2244   if (isa<LoadInst>(Instr)) {
2245     // For each unroll part, create a wide load for the group.
2246     SmallVector<Value *, 2> NewLoads;
2247     for (unsigned Part = 0; Part < UF; Part++) {
2248       Instruction *NewLoad;
2249       if (BlockInMask || MaskForGaps) {
2250         assert(useMaskedInterleavedAccesses(*TTI) &&
2251                "masked interleaved groups are not allowed.");
2252         Value *GroupMask = MaskForGaps;
2253         if (BlockInMask) {
2254           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2255           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2256           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2257           Value *ShuffledMask = Builder.CreateShuffleVector(
2258               BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2259           GroupMask = MaskForGaps
2260                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2261                                                 MaskForGaps)
2262                           : ShuffledMask;
2263         }
2264         NewLoad =
2265             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2266                                      GroupMask, UndefVec, "wide.masked.vec");
2267       }
2268       else
2269         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2270                                             Group->getAlign(), "wide.vec");
2271       Group->addMetadata(NewLoad);
2272       NewLoads.push_back(NewLoad);
2273     }
2274 
2275     // For each member in the group, shuffle out the appropriate data from the
2276     // wide loads.
2277     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2278       Instruction *Member = Group->getMember(I);
2279 
2280       // Skip the gaps in the group.
2281       if (!Member)
2282         continue;
2283 
2284       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2285       for (unsigned Part = 0; Part < UF; Part++) {
2286         Value *StridedVec = Builder.CreateShuffleVector(
2287             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2288 
        // If this member has a different type, cast the result to that type.
2290         if (Member->getType() != ScalarTy) {
2291           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2292           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2293         }
2294 
2295         if (Group->isReverse())
2296           StridedVec = reverseVector(StridedVec);
2297 
2298         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2299       }
2300     }
2301     return;
2302   }
2303 
  // The sub-vector type for the current instruction.
2305   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2306 
2307   // Vectorize the interleaved store group.
2308   for (unsigned Part = 0; Part < UF; Part++) {
2309     // Collect the stored vector from each member.
2310     SmallVector<Value *, 4> StoredVecs;
2311     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
2313       Instruction *Member = Group->getMember(i);
2314       assert(Member && "Fail to get a member from an interleaved store group");
2315 
2316       Value *StoredVec = getOrCreateVectorValue(
2317           cast<StoreInst>(Member)->getValueOperand(), Part);
2318       if (Group->isReverse())
2319         StoredVec = reverseVector(StoredVec);
2320 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2324         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2325 
2326       StoredVecs.push_back(StoredVec);
2327     }
2328 
2329     // Concatenate all vectors into a wide vector.
2330     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2331 
2332     // Interleave the elements in the wide vector.
2333     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2334     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2335                                               "interleaved.vec");
2336 
2337     Instruction *NewStoreInstr;
2338     if (BlockInMask) {
2339       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2340       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2341       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2342       Value *ShuffledMask = Builder.CreateShuffleVector(
2343           BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2344       NewStoreInstr = Builder.CreateMaskedStore(
2345           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2346     }
2347     else
2348       NewStoreInstr =
2349           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2350 
2351     Group->addMetadata(NewStoreInstr);
2352   }
2353 }
2354 
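// Widen the memory instruction \p Instr according to the cost model's
// widening decision: either as a consecutive (possibly reverse) wide
// load/store, or as a masked gather/scatter. \p BlockInMask, if present,
// supplies the predicate for masked accesses.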
2355 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2356                                                      VPTransformState &State,
2357                                                      VPValue *Addr,
2358                                                      VPValue *StoredValue,
2359                                                      VPValue *BlockInMask) {
  // Attempt to issue a wide load or store.
2361   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2362   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2363 
2364   assert((LI || SI) && "Invalid Load/Store instruction");
2365   assert((!SI || StoredValue) && "No stored value provided for widened store");
2366   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2367 
2368   LoopVectorizationCostModel::InstWidening Decision =
2369       Cost->getWideningDecision(Instr, VF);
2370   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2371           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2372           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2373          "CM decision is not to widen the memory instruction");
2374 
2375   Type *ScalarDataTy = getMemInstValueType(Instr);
2376   Type *DataTy = VectorType::get(ScalarDataTy, VF);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2379   const DataLayout &DL = Instr->getModule()->getDataLayout();
2380   const Align Alignment =
2381       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2382 
2383   // Determine if the pointer operand of the access is either consecutive or
2384   // reverse consecutive.
2385   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2386   bool ConsecutiveStride =
2387       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2388   bool CreateGatherScatter =
2389       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2390 
2391   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2392   // gather/scatter. Otherwise Decision should have been to Scalarize.
2393   assert((ConsecutiveStride || CreateGatherScatter) &&
2394          "The instruction should be scalarized");
2395   (void)ConsecutiveStride;
2396 
2397   VectorParts BlockInMaskParts(UF);
2398   bool isMaskRequired = BlockInMask;
2399   if (isMaskRequired)
2400     for (unsigned Part = 0; Part < UF; ++Part)
2401       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2402 
2403   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2404     // Calculate the pointer for the specific unroll-part.
2405     GetElementPtrInst *PartPtr = nullptr;
2406 
2407     bool InBounds = false;
2408     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2409       InBounds = gep->isInBounds();
2410 
2411     if (Reverse) {
      // If the address is consecutive but reversed, then the wide load/store
      // needs to start at the last vector element.
2414       PartPtr = cast<GetElementPtrInst>(
2415           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2416       PartPtr->setIsInBounds(InBounds);
2417       PartPtr = cast<GetElementPtrInst>(
2418           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2419       PartPtr->setIsInBounds(InBounds);
2420       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2421         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2422     } else {
2423       PartPtr = cast<GetElementPtrInst>(
2424           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2425       PartPtr->setIsInBounds(InBounds);
2426     }
2427 
2428     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2429     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2430   };
2431 
2432   // Handle Stores:
2433   if (SI) {
2434     setDebugLocFromInst(Builder, SI);
2435 
2436     for (unsigned Part = 0; Part < UF; ++Part) {
2437       Instruction *NewSI = nullptr;
2438       Value *StoredVal = State.get(StoredValue, Part);
2439       if (CreateGatherScatter) {
2440         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2441         Value *VectorGep = State.get(Addr, Part);
2442         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2443                                             MaskPart);
2444       } else {
2445         if (Reverse) {
2446           // If we store to reverse consecutive memory locations, then we need
2447           // to reverse the order of elements in the stored value.
2448           StoredVal = reverseVector(StoredVal);
2449           // We don't want to update the value in the map as it might be used in
2450           // another expression. So don't call resetVectorValue(StoredVal).
2451         }
2452         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2453         if (isMaskRequired)
2454           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2455                                             BlockInMaskParts[Part]);
2456         else
2457           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2458       }
2459       addMetadata(NewSI, SI);
2460     }
2461     return;
2462   }
2463 
2464   // Handle loads.
2465   assert(LI && "Must have a load instruction");
2466   setDebugLocFromInst(Builder, LI);
2467   for (unsigned Part = 0; Part < UF; ++Part) {
2468     Value *NewLI;
2469     if (CreateGatherScatter) {
2470       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2471       Value *VectorGep = State.get(Addr, Part);
2472       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2473                                          nullptr, "wide.masked.gather");
2474       addMetadata(NewLI, LI);
2475     } else {
2476       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2477       if (isMaskRequired)
2478         NewLI = Builder.CreateMaskedLoad(
2479             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2480             "wide.masked.load");
2481       else
2482         NewLI =
2483             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2484 
2485       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2486       addMetadata(NewLI, LI);
2487       if (Reverse)
2488         NewLI = reverseVector(NewLI);
2489     }
2490     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2491   }
2492 }
2493 
2494 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2495                                                const VPIteration &Instance,
2496                                                bool IfPredicateInstr) {
2497   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2498 
2499   setDebugLocFromInst(Builder, Instr);
2500 
  // Does this instruction return a value?
2502   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2503 
2504   Instruction *Cloned = Instr->clone();
2505   if (!IsVoidRetTy)
2506     Cloned->setName(Instr->getName() + ".cloned");
2507 
2508   // Replace the operands of the cloned instructions with their scalar
2509   // equivalents in the new loop.
2510   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2511     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2512     Cloned->setOperand(op, NewOp);
2513   }
2514   addNewMetadata(Cloned, Instr);
2515 
2516   // Place the cloned scalar in the new loop.
2517   Builder.Insert(Cloned);
2518 
2519   // Add the cloned scalar to the scalar map entry.
2520   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2521 
2522   // If we just cloned a new assumption, add it the assumption cache.
2523   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2524     if (II->getIntrinsicID() == Intrinsic::assume)
2525       AC->registerAssumption(II);
2526 
2527   // End if-block.
2528   if (IfPredicateInstr)
2529     PredicatedInstructions.push_back(Cloned);
2530 }
2531 
2532 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2533                                                       Value *End, Value *Step,
2534                                                       Instruction *DL) {
2535   BasicBlock *Header = L->getHeader();
2536   BasicBlock *Latch = L->getLoopLatch();
2537   // As we're just creating this loop, it's possible no latch exists
2538   // yet. If so, use the header as this will be a single block loop.
2539   if (!Latch)
2540     Latch = Header;
2541 
2542   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2543   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2544   setDebugLocFromInst(Builder, OldInst);
2545   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2546 
2547   Builder.SetInsertPoint(Latch->getTerminator());
2548   setDebugLocFromInst(Builder, OldInst);
2549 
2550   // Create i+1 and fill the PHINode.
2551   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2552   Induction->addIncoming(Start, L->getLoopPreheader());
2553   Induction->addIncoming(Next, Latch);
2554   // Create the compare.
2555   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2556   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2557 
2558   // Now we have two terminators. Remove the old one from the block.
2559   Latch->getTerminator()->eraseFromParent();
2560 
2561   return Induction;
2562 }
2563 
2564 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2565   if (TripCount)
2566     return TripCount;
2567 
2568   assert(L && "Create Trip Count for null loop.");
2569   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2570   // Find the loop boundaries.
2571   ScalarEvolution *SE = PSE.getSE();
2572   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2573   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2574          "Invalid loop count");
2575 
2576   Type *IdxTy = Legal->getWidestInductionType();
2577   assert(IdxTy && "No type for induction");
2578 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count in that case is
  // if the induction variable was signed and hence will not overflow, so
  // truncation is legal.
2584   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2585       IdxTy->getPrimitiveSizeInBits())
2586     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2587   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2588 
2589   // Get the total trip count from the count by adding 1.
2590   const SCEV *ExitCount = SE->getAddExpr(
2591       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2592 
2593   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2594 
2595   // Expand the trip count and place the new instructions in the preheader.
2596   // Notice that the pre-header does not change, only the loop body.
2597   SCEVExpander Exp(*SE, DL, "induction");
2598 
2599   // Count holds the overall loop count (N).
2600   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2601                                 L->getLoopPreheader()->getTerminator());
2602 
2603   if (TripCount->getType()->isPointerTy())
2604     TripCount =
2605         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2606                                     L->getLoopPreheader()->getTerminator());
2607 
2608   return TripCount;
2609 }
2610 
2611 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2612   if (VectorTripCount)
2613     return VectorTripCount;
2614 
2615   Value *TC = getOrCreateTripCount(L);
2616   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2617 
2618   Type *Ty = TC->getType();
2619   Constant *Step = ConstantInt::get(Ty, VF * UF);
2620 
2621   // If the tail is to be folded by masking, round the number of iterations N
2622   // up to a multiple of Step instead of rounding down. This is done by first
2623   // adding Step-1 and then rounding down. Note that it's ok if this addition
2624   // overflows: the vector induction variable will eventually wrap to zero given
2625   // that it starts at zero and its Step is a power of two; the loop will then
2626   // exit, with the last early-exit vector comparison also producing all-true.
2627   if (Cost->foldTailByMasking()) {
2628     assert(isPowerOf2_32(VF * UF) &&
2629            "VF*UF must be a power of 2 when folding tail by masking");
2630     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2631   }
2632 
2633   // Now we need to generate the expression for the part of the loop that the
2634   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2635   // iterations are not required for correctness, or N - Step, otherwise. Step
2636   // is equal to the vectorization factor (number of SIMD elements) times the
2637   // unroll factor (number of SIMD instructions).
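  // For example, with VF * UF = 8 and a trip count of 10, R below is 2 and
  // the vector loop covers 8 iterations, leaving 2 for the scalar remainder.
  // With tail folding, 10 was first rounded up to 17, so R is 1, the vector
  // trip count is 16, and the masked vector loop handles all iterations (the
  // mask disables the lanes beyond the original trip count).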
2638   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2639 
2640   // If there is a non-reversed interleaved group that may speculatively access
2641   // memory out-of-bounds, we need to ensure that there will be at least one
2642   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2643   // the trip count, we set the remainder to be equal to the step. If the step
2644   // does not evenly divide the trip count, no adjustment is necessary since
2645   // there will already be scalar iterations. Note that the minimum iterations
2646   // check ensures that N >= Step.
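  // For instance, if the trip count is 16 and Step is 8, R computed above
  // would be 0; we then set R to 8 so that the vector loop covers 8 iterations
  // and the scalar epilogue executes the remaining 8.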
2647   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2648     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2649     R = Builder.CreateSelect(IsZero, Step, R);
2650   }
2651 
2652   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2653 
2654   return VectorTripCount;
2655 }
2656 
2657 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2658                                                    const DataLayout &DL) {
2659   // Verify that V is a vector type with same number of elements as DstVTy.
2660   unsigned VF = DstVTy->getNumElements();
2661   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2663   Type *SrcElemTy = SrcVecTy->getElementType();
2664   Type *DstElemTy = DstVTy->getElementType();
2665   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2666          "Vector elements must have same size");
2667 
2668   // Do a direct cast if element types are castable.
2669   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2670     return Builder.CreateBitOrPointerCast(V, DstVTy);
2671   }
  // V cannot be directly cast to the desired vector type.
  // This may happen when V is a floating point vector but DstVTy is a vector
  // of pointers, or vice-versa. Handle this with a two-step bitcast through an
  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
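  // For example, on a target with 64-bit pointers, a <2 x double> value is
  // first bitcast to <2 x i64> and then to a <2 x i8*> destination type (or
  // the reverse), since a direct FP <-> pointer bitcast is not possible.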
2676   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2677          "Only one type should be a pointer type");
2678   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2679          "Only one type should be a floating point type");
2680   Type *IntTy =
2681       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2682   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2683   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2684   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2685 }
2686 
2687 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2688                                                          BasicBlock *Bypass) {
2689   Value *Count = getOrCreateTripCount(L);
2690   // Reuse existing vector loop preheader for TC checks.
2691   // Note that new preheader block is generated for vector loop.
2692   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2693   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2694 
2695   // Generate code to check if the loop's trip count is less than VF * UF, or
2696   // equal to it in case a scalar epilogue is required; this implies that the
2697   // vector trip count is zero. This check also covers the case where adding one
2698   // to the backedge-taken count overflowed leading to an incorrect trip count
2699   // of zero. In this case we will also jump to the scalar loop.
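  // For example, with VF * UF = 8 and a required scalar epilogue we branch to
  // the scalar loop whenever the trip count is <= 8; without a required
  // epilogue, only when it is < 8.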
2700   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2701                                           : ICmpInst::ICMP_ULT;
2702 
2703   // If tail is to be folded, vector loop takes care of all iterations.
2704   Value *CheckMinIters = Builder.getFalse();
2705   if (!Cost->foldTailByMasking())
2706     CheckMinIters = Builder.CreateICmp(
2707         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2708         "min.iters.check");
2709 
2710   // Create new preheader for vector loop.
2711   LoopVectorPreHeader =
2712       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2713                  "vector.ph");
2714 
2715   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2716                                DT->getNode(Bypass)->getIDom()) &&
2717          "TC check is expected to dominate Bypass");
2718 
2719   // Update dominator for Bypass & LoopExit.
2720   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2721   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2722 
2723   ReplaceInstWithInst(
2724       TCCheckBlock->getTerminator(),
2725       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2726   LoopBypassBlocks.push_back(TCCheckBlock);
2727 }
2728 
2729 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2730   // Reuse existing vector loop preheader for SCEV checks.
2731   // Note that new preheader block is generated for vector loop.
2732   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2733 
  // Generate the code to check the SCEV assumptions that we made.
2735   // We want the new basic block to start at the first instruction in a
2736   // sequence of instructions that form a check.
2737   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2738                    "scev.check");
2739   Value *SCEVCheck = Exp.expandCodeForPredicate(
2740       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2741 
2742   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2743     if (C->isZero())
2744       return;
2745 
2746   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2747          "Cannot SCEV check stride or overflow when optimizing for size");
2748 
2749   SCEVCheckBlock->setName("vector.scevcheck");
2750   // Create new preheader for vector loop.
2751   LoopVectorPreHeader =
2752       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2753                  nullptr, "vector.ph");
2754 
  // Update dominator only if this is the first RT check.
2756   if (LoopBypassBlocks.empty()) {
2757     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2758     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2759   }
2760 
2761   ReplaceInstWithInst(
2762       SCEVCheckBlock->getTerminator(),
2763       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2764   LoopBypassBlocks.push_back(SCEVCheckBlock);
2765   AddedSafetyChecks = true;
2766 }
2767 
2768 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2769   // VPlan-native path does not do any analysis for runtime checks currently.
2770   if (EnableVPlanNativePath)
2771     return;
2772 
2773   // Reuse existing vector loop preheader for runtime memory checks.
2774   // Note that new preheader block is generated for vector loop.
2775   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2776 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2780   Instruction *FirstCheckInst;
2781   Instruction *MemRuntimeCheck;
2782   std::tie(FirstCheckInst, MemRuntimeCheck) =
2783       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2784   if (!MemRuntimeCheck)
2785     return;
2786 
2787   if (MemCheckBlock->getParent()->hasOptSize()) {
2788     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2789            "Cannot emit memory checks when optimizing for size, unless forced "
2790            "to vectorize.");
2791     ORE->emit([&]() {
2792       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2793                                         L->getStartLoc(), L->getHeader())
2794              << "Code-size may be reduced by not forcing "
2795                 "vectorization, or by source-code modifications "
2796                 "eliminating the need for runtime checks "
2797                 "(e.g., adding 'restrict').";
2798     });
2799   }
2800 
2801   MemCheckBlock->setName("vector.memcheck");
2802   // Create new preheader for vector loop.
2803   LoopVectorPreHeader =
2804       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2805                  "vector.ph");
2806 
  // Update dominator only if this is the first RT check.
2808   if (LoopBypassBlocks.empty()) {
2809     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2810     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2811   }
2812 
2813   ReplaceInstWithInst(
2814       MemCheckBlock->getTerminator(),
2815       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2816   LoopBypassBlocks.push_back(MemCheckBlock);
2817   AddedSafetyChecks = true;
2818 
2819   // We currently don't use LoopVersioning for the actual loop cloning but we
2820   // still use it to add the noalias metadata.
2821   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2822                                           PSE.getSE());
2823   LVer->prepareNoAliasMetadata();
2824 }
2825 
2826 Value *InnerLoopVectorizer::emitTransformedIndex(
2827     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2828     const InductionDescriptor &ID) const {
2829 
2830   SCEVExpander Exp(*SE, DL, "induction");
2831   auto Step = ID.getStep();
2832   auto StartValue = ID.getStartValue();
2833   assert(Index->getType() == Step->getType() &&
2834          "Index type does not match StepValue type");
2835 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
2842   auto CreateAdd = [&B](Value *X, Value *Y) {
2843     assert(X->getType() == Y->getType() && "Types don't match!");
2844     if (auto *CX = dyn_cast<ConstantInt>(X))
2845       if (CX->isZero())
2846         return Y;
2847     if (auto *CY = dyn_cast<ConstantInt>(Y))
2848       if (CY->isZero())
2849         return X;
2850     return B.CreateAdd(X, Y);
2851   };
2852 
2853   auto CreateMul = [&B](Value *X, Value *Y) {
2854     assert(X->getType() == Y->getType() && "Types don't match!");
2855     if (auto *CX = dyn_cast<ConstantInt>(X))
2856       if (CX->isOne())
2857         return Y;
2858     if (auto *CY = dyn_cast<ConstantInt>(Y))
2859       if (CY->isOne())
2860         return X;
2861     return B.CreateMul(X, Y);
2862   };
2863 
2864   switch (ID.getKind()) {
2865   case InductionDescriptor::IK_IntInduction: {
2866     assert(Index->getType() == StartValue->getType() &&
2867            "Index type does not match StartValue type");
2868     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2869       return B.CreateSub(StartValue, Index);
2870     auto *Offset = CreateMul(
2871         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2872     return CreateAdd(StartValue, Offset);
2873   }
2874   case InductionDescriptor::IK_PtrInduction: {
2875     assert(isa<SCEVConstant>(Step) &&
2876            "Expected constant step for pointer induction");
2877     return B.CreateGEP(
2878         StartValue->getType()->getPointerElementType(), StartValue,
2879         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2880                                            &*B.GetInsertPoint())));
2881   }
2882   case InductionDescriptor::IK_FpInduction: {
2883     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2884     auto InductionBinOp = ID.getInductionBinOp();
2885     assert(InductionBinOp &&
2886            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2887             InductionBinOp->getOpcode() == Instruction::FSub) &&
2888            "Original bin op should be defined for FP induction");
2889 
2890     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2891 
2892     // Floating point operations had to be 'fast' to enable the induction.
2893     FastMathFlags Flags;
2894     Flags.setFast();
2895 
2896     Value *MulExp = B.CreateFMul(StepValue, Index);
2897     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2899       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2900 
2901     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2902                                "induction");
2903     if (isa<Instruction>(BOp))
2904       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2905 
2906     return BOp;
2907   }
2908   case InductionDescriptor::IK_NoInduction:
2909     return nullptr;
2910   }
2911   llvm_unreachable("invalid enum");
2912 }
2913 
2914 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2915   /*
2916    In this function we generate a new loop. The new loop will contain
2917    the vectorized instructions while the old loop will continue to run the
2918    scalar remainder.
2919 
2920        [ ] <-- loop iteration number check.
2921     /   |
2922    /    v
2923   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2924   |  /  |
2925   | /   v
2926   ||   [ ]     <-- vector pre header.
2927   |/    |
2928   |     v
2929   |    [  ] \
2930   |    [  ]_|   <-- vector loop.
2931   |     |
2932   |     v
2933   |   -[ ]   <--- middle-block.
2934   |  /  |
2935   | /   v
2936   -|- >[ ]     <--- new preheader.
2937    |    |
2938    |    v
2939    |   [ ] \
2940    |   [ ]_|   <-- old scalar loop to handle remainder.
2941     \   |
2942      \  v
2943       >[ ]     <-- exit block.
2944    ...
2945    */
2946 
2947   MDNode *OrigLoopID = OrigLoop->getLoopID();
2948 
  // Some loops have a single integer induction variable, while others don't.
  // One example is loops over C++ iterators, which often have multiple
  // pointer induction variables. The code below also supports the case where
  // we don't have a single induction variable.
  //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
2956   //   - is an integer
2957   //   - counts from zero, stepping by one
2958   //   - is the size of the widest induction variable type
2959   // then we create a new one.
2960   OldInduction = Legal->getPrimaryInduction();
2961   Type *IdxTy = Legal->getWidestInductionType();
2962 
2963   // Split the single block loop into the two loop structure described above.
2964   LoopScalarBody = OrigLoop->getHeader();
2965   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2966   LoopExitBlock = OrigLoop->getExitBlock();
2967   assert(LoopExitBlock && "Must have an exit block");
2968   assert(LoopVectorPreHeader && "Invalid loop structure");
2969 
2970   LoopMiddleBlock =
2971       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2972                  LI, nullptr, "middle.block");
2973   LoopScalarPreHeader =
2974       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2975                  nullptr, "scalar.ph");
2976   // We intentionally don't let SplitBlock to update LoopInfo since
2977   // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
2978   // LoopVectorBody is explicitly added to the correct place few lines later.
2979   LoopVectorBody =
2980       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2981                  nullptr, nullptr, "vector.body");
2982 
2983   // Update dominator for loop exit.
2984   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2985 
2986   // Create and register the new vector loop.
2987   Loop *Lp = LI->AllocateLoop();
2988   Loop *ParentLoop = OrigLoop->getParentLoop();
2989 
2990   // Insert the new loop into the loop nest and register the new basic blocks
2991   // before calling any utilities such as SCEV that require valid LoopInfo.
2992   if (ParentLoop) {
2993     ParentLoop->addChildLoop(Lp);
2994   } else {
2995     LI->addTopLevelLoop(Lp);
2996   }
2997   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
2998 
2999   // Find the loop boundaries.
3000   Value *Count = getOrCreateTripCount(Lp);
3001 
3002   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3003 
3004   // Now, compare the new count to zero. If it is zero skip the vector loop and
3005   // jump to the scalar loop. This check also covers the case where the
3006   // backedge-taken count is uint##_max: adding one to it will overflow leading
3007   // to an incorrect trip count of zero. In this (rare) case we will also jump
3008   // to the scalar loop.
3009   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3010 
3011   // Generate the code to check any assumptions that we've made for SCEV
3012   // expressions.
3013   emitSCEVChecks(Lp, LoopScalarPreHeader);
3014 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3018   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3019 
3020   // Generate the induction variable.
3021   // The loop step is equal to the vectorization factor (num of SIMD elements)
3022   // times the unroll factor (num of SIMD instructions).
3023   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3024   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3025   Induction =
3026       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3027                               getDebugLocFromInstOrOperands(OldInduction));
3028 
3029   // We are going to resume the execution of the scalar loop.
3030   // Go over all of the induction variables that we found and fix the
3031   // PHIs that are left in the scalar version of the loop.
3032   // The starting values of PHI nodes depend on the counter of the last
3033   // iteration in the vectorized loop.
3034   // If we come from a bypass edge then we need to start from the original
3035   // start value.
3036 
  // For each induction variable, create a resume value that holds the new
  // starting value for the scalar loop. It is used when falling through to the
  // scalar loop to execute any tail iterations left once the vector loop has
  // completed.
3040   for (auto &InductionEntry : Legal->getInductionVars()) {
3041     PHINode *OrigPhi = InductionEntry.first;
3042     InductionDescriptor II = InductionEntry.second;
3043 
    // Create phi nodes to merge from the backedge-taken check block.
3045     PHINode *BCResumeVal =
3046         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3047                         LoopScalarPreHeader->getTerminator());
3048     // Copy original phi DL over to the new one.
3049     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3050     Value *&EndValue = IVEndValues[OrigPhi];
3051     if (OrigPhi == OldInduction) {
3052       // We know what the end value is.
3053       EndValue = CountRoundDown;
3054     } else {
3055       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3056       Type *StepType = II.getStep()->getType();
3057       Instruction::CastOps CastOp =
3058           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3059       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3060       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3061       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3062       EndValue->setName("ind.end");
3063     }
3064 
3065     // The new PHI merges the original incoming value, in case of a bypass,
3066     // or the value at the end of the vectorized loop.
3067     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3068 
3069     // Fix the scalar body counter (PHI node).
3070     // The old induction's phi node in the scalar body needs the truncated
3071     // value.
3072     for (BasicBlock *BB : LoopBypassBlocks)
3073       BCResumeVal->addIncoming(II.getStartValue(), BB);
3074     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3075   }
3076 
3077   // We need the OrigLoop (scalar loop part) latch terminator to help
3078   // produce correct debug info for the middle block BB instructions.
3079   // The legality check stage guarantees that the loop will have a single
3080   // latch.
3081   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3082          "Scalar loop latch terminator isn't a branch");
3083   BranchInst *ScalarLatchBr =
3084       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3085 
3086   // Add a check in the middle block to see if we have completed
3087   // all of the iterations in the first vector loop.
3088   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3089   // If tail is to be folded, we know we don't need to run the remainder.
3090   Value *CmpN = Builder.getTrue();
3091   if (!Cost->foldTailByMasking()) {
3092     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3093                            CountRoundDown, "cmp.n",
3094                            LoopMiddleBlock->getTerminator());
3095 
3096     // Here we use the same DebugLoc as the scalar loop latch branch instead
3097     // of the corresponding compare because they may have ended up with
3098     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3100     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3101   }
3102 
3103   BranchInst *BrInst =
3104       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3105   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3106   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3107 
3108   // Get ready to start creating new instructions into the vectorized body.
3109   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3110          "Inconsistent vector loop preheader");
3111   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3112 
3113   Optional<MDNode *> VectorizedLoopID =
3114       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3115                                       LLVMLoopVectorizeFollowupVectorized});
3116   if (VectorizedLoopID.hasValue()) {
3117     Lp->setLoopID(VectorizedLoopID.getValue());
3118 
3119     // Do not setAlreadyVectorized if loop attributes have been defined
3120     // explicitly.
3121     return LoopVectorPreHeader;
3122   }
3123 
3124   // Keep all loop hints from the original loop on the vector loop (we'll
3125   // replace the vectorizer-specific hints below).
3126   if (MDNode *LID = OrigLoop->getLoopID())
3127     Lp->setLoopID(LID);
3128 
3129   LoopVectorizeHints Hints(Lp, true, *ORE);
3130   Hints.setAlreadyVectorized();
3131 
3132 #ifdef EXPENSIVE_CHECKS
3133   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3134   LI->verify(*DT);
3135 #endif
3136 
3137   return LoopVectorPreHeader;
3138 }
3139 
3140 // Fix up external users of the induction variable. At this point, we are
3141 // in LCSSA form, with all external PHIs that use the IV having one input value,
3142 // coming from the remainder loop. We need those PHIs to also have a correct
3143 // value for the IV when arriving directly from the middle block.
3144 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3145                                        const InductionDescriptor &II,
3146                                        Value *CountRoundDown, Value *EndValue,
3147                                        BasicBlock *MiddleBlock) {
3148   // There are two kinds of external IV usages - those that use the value
3149   // computed in the last iteration (the PHI) and those that use the penultimate
3150   // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.
3152 
3153   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3154 
3155   DenseMap<Value *, Value *> MissingVals;
3156 
3157   // An external user of the last iteration's value should see the value that
3158   // the remainder loop uses to initialize its own IV.
3159   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3160   for (User *U : PostInc->users()) {
3161     Instruction *UI = cast<Instruction>(U);
3162     if (!OrigLoop->contains(UI)) {
3163       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3164       MissingVals[UI] = EndValue;
3165     }
3166   }
3167 
  // An external user of the penultimate value needs to see EndValue - Step.
3169   // The simplest way to get this is to recompute it from the constituent SCEVs,
3170   // that is Start + (Step * (CRD - 1)).
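  // For instance, for a canonical IV starting at 0 with step 1 and
  // CountRoundDown = 8, the penultimate value is 0 + 1 * (8 - 1) = 7.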
3171   for (User *U : OrigPhi->users()) {
3172     auto *UI = cast<Instruction>(U);
3173     if (!OrigLoop->contains(UI)) {
3174       const DataLayout &DL =
3175           OrigLoop->getHeader()->getModule()->getDataLayout();
3176       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3177 
3178       IRBuilder<> B(MiddleBlock->getTerminator());
3179       Value *CountMinusOne = B.CreateSub(
3180           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3181       Value *CMO =
3182           !II.getStep()->getType()->isIntegerTy()
3183               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3184                              II.getStep()->getType())
3185               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3186       CMO->setName("cast.cmo");
3187       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3188       Escape->setName("ind.escape");
3189       MissingVals[UI] = Escape;
3190     }
3191   }
3192 
3193   for (auto &I : MissingVals) {
3194     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3196     // that is %IV2 = phi [...], [ %IV1, %latch ]
3197     // In this case, if IV1 has an external use, we need to avoid adding both
3198     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3199     // don't already have an incoming value for the middle block.
3200     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3201       PHI->addIncoming(I.second, MiddleBlock);
3202   }
3203 }
3204 
3205 namespace {
3206 
3207 struct CSEDenseMapInfo {
3208   static bool canHandle(const Instruction *I) {
3209     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3210            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3211   }
3212 
3213   static inline Instruction *getEmptyKey() {
3214     return DenseMapInfo<Instruction *>::getEmptyKey();
3215   }
3216 
3217   static inline Instruction *getTombstoneKey() {
3218     return DenseMapInfo<Instruction *>::getTombstoneKey();
3219   }
3220 
3221   static unsigned getHashValue(const Instruction *I) {
3222     assert(canHandle(I) && "Unknown instruction!");
3223     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3224                                                            I->value_op_end()));
3225   }
3226 
3227   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3228     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3229         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3230       return LHS == RHS;
3231     return LHS->isIdenticalTo(RHS);
3232   }
3233 };
3234 
3235 } // end anonymous namespace
3236 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3240   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3241   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3242     Instruction *In = &*I++;
3243 
3244     if (!CSEDenseMapInfo::canHandle(In))
3245       continue;
3246 
3247     // Check if we can replace this instruction with any of the
3248     // visited instructions.
3249     if (Instruction *V = CSEMap.lookup(In)) {
3250       In->replaceAllUsesWith(V);
3251       In->eraseFromParent();
3252       continue;
3253     }
3254 
3255     CSEMap[In] = In;
3256   }
3257 }
3258 
3259 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3260                                                        unsigned VF,
3261                                                        bool &NeedToScalarize) {
3262   Function *F = CI->getCalledFunction();
3263   Type *ScalarRetTy = CI->getType();
3264   SmallVector<Type *, 4> Tys, ScalarTys;
3265   for (auto &ArgOp : CI->arg_operands())
3266     ScalarTys.push_back(ArgOp->getType());
3267 
3268   // Estimate cost of scalarized vector call. The source operands are assumed
3269   // to be vectors, so we need to extract individual elements from there,
3270   // execute VF scalar calls, and then gather the result into the vector return
3271   // value.
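  // For example, with VF = 4 and a scalar call cost of 10, the estimate below
  // is 4 * 10 plus the overhead of extracting the four argument lanes and
  // inserting the four results.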
3272   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3273   if (VF == 1)
3274     return ScalarCallCost;
3275 
3276   // Compute corresponding vector type for return value and arguments.
3277   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3278   for (Type *ScalarTy : ScalarTys)
3279     Tys.push_back(ToVectorTy(ScalarTy, VF));
3280 
3281   // Compute costs of unpacking argument values for the scalar calls and
3282   // packing the return values to a vector.
3283   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3284 
3285   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3286 
3287   // If we can't emit a vector call for this function, then the currently found
3288   // cost is the cost we need to return.
3289   NeedToScalarize = true;
3290   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3291   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3292 
3293   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3294     return Cost;
3295 
3296   // If the corresponding vector cost is cheaper, return its cost.
3297   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3298   if (VectorCallCost < Cost) {
3299     NeedToScalarize = false;
3300     return VectorCallCost;
3301   }
3302   return Cost;
3303 }
3304 
3305 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3306                                                             unsigned VF) {
3307   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3308   assert(ID && "Expected intrinsic call!");
3309 
3310   FastMathFlags FMF;
3311   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3312     FMF = FPMO->getFastMathFlags();
3313 
3314   SmallVector<Value *, 4> Operands(CI->arg_operands());
3315   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI);
3316 }
3317 
3318 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3319   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3320   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3321   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3322 }
3323 
3324 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3325   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3326   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3327   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3328 }
3329 
3330 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3331   // For every instruction `I` in MinBWs, truncate the operands, create a
3332   // truncated version of `I` and reextend its result. InstCombine runs
3333   // later and will remove any ext/trunc pairs.
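  // For example, if an i32 add is known to need only 8 bits, its operands are
  // truncated to <VF x i8>, the add is recreated on the narrower type, and the
  // result is zero-extended back to <VF x i32>.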
3334   SmallPtrSet<Value *, 4> Erased;
3335   for (const auto &KV : Cost->getMinimalBitwidths()) {
3336     // If the value wasn't vectorized, we must maintain the original scalar
3337     // type. The absence of the value from VectorLoopValueMap indicates that it
3338     // wasn't vectorized.
3339     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3340       continue;
3341     for (unsigned Part = 0; Part < UF; ++Part) {
3342       Value *I = getOrCreateVectorValue(KV.first, Part);
3343       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3344           !isa<Instruction>(I))
3345         continue;
3346       Type *OriginalTy = I->getType();
3347       Type *ScalarTruncatedTy =
3348           IntegerType::get(OriginalTy->getContext(), KV.second);
3349       Type *TruncatedTy = VectorType::get(
3350           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
3351       if (TruncatedTy == OriginalTy)
3352         continue;
3353 
3354       IRBuilder<> B(cast<Instruction>(I));
3355       auto ShrinkOperand = [&](Value *V) -> Value * {
3356         if (auto *ZI = dyn_cast<ZExtInst>(V))
3357           if (ZI->getSrcTy() == TruncatedTy)
3358             return ZI->getOperand(0);
3359         return B.CreateZExtOrTrunc(V, TruncatedTy);
3360       };
3361 
3362       // The actual instruction modification depends on the instruction type,
3363       // unfortunately.
3364       Value *NewI = nullptr;
3365       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3366         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3367                              ShrinkOperand(BO->getOperand(1)));
3368 
3369         // Any wrapping introduced by shrinking this operation shouldn't be
3370         // considered undefined behavior. So, we can't unconditionally copy
3371         // arithmetic wrapping flags to NewI.
3372         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3373       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3374         NewI =
3375             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3376                          ShrinkOperand(CI->getOperand(1)));
3377       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3378         NewI = B.CreateSelect(SI->getCondition(),
3379                               ShrinkOperand(SI->getTrueValue()),
3380                               ShrinkOperand(SI->getFalseValue()));
3381       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3382         switch (CI->getOpcode()) {
3383         default:
3384           llvm_unreachable("Unhandled cast!");
3385         case Instruction::Trunc:
3386           NewI = ShrinkOperand(CI->getOperand(0));
3387           break;
3388         case Instruction::SExt:
3389           NewI = B.CreateSExtOrTrunc(
3390               CI->getOperand(0),
3391               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3392           break;
3393         case Instruction::ZExt:
3394           NewI = B.CreateZExtOrTrunc(
3395               CI->getOperand(0),
3396               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3397           break;
3398         }
3399       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3400         auto Elements0 =
3401             cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
3402         auto *O0 = B.CreateZExtOrTrunc(
3403             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3404         auto Elements1 =
3405             cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
3406         auto *O1 = B.CreateZExtOrTrunc(
3407             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3408 
3409         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3410       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3411         // Don't do anything with the operands, just extend the result.
3412         continue;
3413       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3414         auto Elements =
3415             cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
3416         auto *O0 = B.CreateZExtOrTrunc(
3417             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3418         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3419         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3420       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3421         auto Elements =
3422             cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
3423         auto *O0 = B.CreateZExtOrTrunc(
3424             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3425         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3426       } else {
3427         // If we don't know what to do, be conservative and don't do anything.
3428         continue;
3429       }
3430 
3431       // Lastly, extend the result.
3432       NewI->takeName(cast<Instruction>(I));
3433       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3434       I->replaceAllUsesWith(Res);
3435       cast<Instruction>(I)->eraseFromParent();
3436       Erased.insert(I);
3437       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3438     }
3439   }
3440 
  // We'll have created a bunch of ZExts that may now be dead. Clean them up.
3442   for (const auto &KV : Cost->getMinimalBitwidths()) {
3443     // If the value wasn't vectorized, we must maintain the original scalar
3444     // type. The absence of the value from VectorLoopValueMap indicates that it
3445     // wasn't vectorized.
3446     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3447       continue;
3448     for (unsigned Part = 0; Part < UF; ++Part) {
3449       Value *I = getOrCreateVectorValue(KV.first, Part);
3450       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3451       if (Inst && Inst->use_empty()) {
3452         Value *NewI = Inst->getOperand(0);
3453         Inst->eraseFromParent();
3454         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3455       }
3456     }
3457   }
3458 }
3459 
3460 void InnerLoopVectorizer::fixVectorizedLoop() {
3461   // Insert truncates and extends for any truncated instructions as hints to
3462   // InstCombine.
3463   if (VF > 1)
3464     truncateToMinimalBitwidths();
3465 
3466   // Fix widened non-induction PHIs by setting up the PHI operands.
3467   if (OrigPHIsToFix.size()) {
3468     assert(EnableVPlanNativePath &&
3469            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3470     fixNonInductionPHIs();
3471   }
3472 
3473   // At this point every instruction in the original loop is widened to a
3474   // vector form. Now we need to fix the recurrences in the loop. These PHI
3475   // nodes are currently empty because we did not want to introduce cycles.
3476   // This is the second stage of vectorizing recurrences.
3477   fixCrossIterationPHIs();
3478 
3479   // Forget the original basic block.
3480   PSE.getSE()->forgetLoop(OrigLoop);
3481 
3482   // Fix-up external users of the induction variables.
3483   for (auto &Entry : Legal->getInductionVars())
3484     fixupIVUsers(Entry.first, Entry.second,
3485                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3486                  IVEndValues[Entry.first], LoopMiddleBlock);
3487 
3488   fixLCSSAPHIs();
3489   for (Instruction *PI : PredicatedInstructions)
3490     sinkScalarOperands(&*PI);
3491 
3492   // Remove redundant induction instructions.
3493   cse(LoopVectorBody);
3494 
3495   // Set/update profile weights for the vector and remainder loops as original
3496   // loop iterations are now distributed among them. Note that original loop
3497   // represented by LoopScalarBody becomes remainder loop after vectorization.
3498   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3504   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3505                                LI->getLoopFor(LoopVectorBody),
3506                                LI->getLoopFor(LoopScalarBody), VF * UF);
3507 }
3508 
3509 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3510   // In order to support recurrences we need to be able to vectorize Phi nodes.
3511   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3512   // stage #2: We now need to fix the recurrences by adding incoming edges to
3513   // the currently empty PHI nodes. At this point every instruction in the
3514   // original loop is widened to a vector form so we can use them to construct
3515   // the incoming edges.
3516   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3517     // Handle first-order recurrences and reductions that need to be fixed.
3518     if (Legal->isFirstOrderRecurrence(&Phi))
3519       fixFirstOrderRecurrence(&Phi);
3520     else if (Legal->isReductionVariable(&Phi))
3521       fixReduction(&Phi);
3522   }
3523 }
3524 
3525 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3526   // This is the second phase of vectorizing first-order recurrences. An
3527   // overview of the transformation is described below. Suppose we have the
3528   // following loop.
3529   //
3530   //   for (int i = 0; i < n; ++i)
3531   //     b[i] = a[i] - a[i - 1];
3532   //
3533   // There is a first-order recurrence on "a". For this loop, the shorthand
3534   // scalar IR looks like:
3535   //
3536   //   scalar.ph:
3537   //     s_init = a[-1]
3538   //     br scalar.body
3539   //
3540   //   scalar.body:
3541   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3542   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3543   //     s2 = a[i]
3544   //     b[i] = s2 - s1
3545   //     br cond, scalar.body, ...
3546   //
  // In this example, s1 is a recurrence because its value depends on the
3548   // previous iteration. In the first phase of vectorization, we created a
3549   // temporary value for s1. We now complete the vectorization and produce the
3550   // shorthand vector IR shown below (for VF = 4, UF = 1).
3551   //
3552   //   vector.ph:
3553   //     v_init = vector(..., ..., ..., a[-1])
3554   //     br vector.body
3555   //
3556   //   vector.body
3557   //     i = phi [0, vector.ph], [i+4, vector.body]
3558   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3559   //     v2 = a[i, i+1, i+2, i+3];
3560   //     v3 = vector(v1(3), v2(0, 1, 2))
3561   //     b[i, i+1, i+2, i+3] = v2 - v3
3562   //     br cond, vector.body, middle.block
3563   //
3564   //   middle.block:
3565   //     x = v2(3)
3566   //     br scalar.ph
3567   //
3568   //   scalar.ph:
3569   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3570   //     br scalar.body
3571   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3574 
3575   // Get the original loop preheader and single loop latch.
3576   auto *Preheader = OrigLoop->getLoopPreheader();
3577   auto *Latch = OrigLoop->getLoopLatch();
3578 
3579   // Get the initial and previous values of the scalar recurrence.
3580   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3581   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3582 
3583   // Create a vector from the initial value.
3584   auto *VectorInit = ScalarInit;
3585   if (VF > 1) {
3586     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3587     VectorInit = Builder.CreateInsertElement(
3588         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3589         Builder.getInt32(VF - 1), "vector.recur.init");
3590   }
3591 
3592   // We constructed a temporary phi node in the first phase of vectorization.
3593   // This phi node will eventually be deleted.
3594   Builder.SetInsertPoint(
3595       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3596 
3597   // Create a phi node for the new recurrence. The current value will either be
3598   // the initial value inserted into a vector or loop-varying vector value.
3599   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3600   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3601 
3602   // Get the vectorized previous value of the last part UF - 1. It appears last
3603   // among all unrolled iterations, due to the order of their construction.
3604   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3605 
3606   // Find and set the insertion point after the previous value if it is an
3607   // instruction.
3608   BasicBlock::iterator InsertPt;
3609   // Note that the previous value may have been constant-folded so it is not
3610   // guaranteed to be an instruction in the vector loop.
3611   // FIXME: Loop invariant values do not form recurrences. We should deal with
3612   //        them earlier.
3613   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3614     InsertPt = LoopVectorBody->getFirstInsertionPt();
3615   else {
3616     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3617     if (isa<PHINode>(PreviousLastPart))
3618       // If the previous value is a phi node, we should insert after all the phi
3619       // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different from
3621       // LoopVectorBody, in case we predicate the loop.
3622       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3623     else
3624       InsertPt = ++PreviousInst->getIterator();
3625   }
3626   Builder.SetInsertPoint(&*InsertPt);
3627 
3628   // We will construct a vector for the recurrence by combining the values for
3629   // the current and previous iterations. This is the required shuffle mask.
3630   SmallVector<int, 8> ShuffleMask(VF);
3631   ShuffleMask[0] = VF - 1;
3632   for (unsigned I = 1; I < VF; ++I)
3633     ShuffleMask[I] = I + VF - 1;
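  // For example (illustrative), with VF = 4 the mask is <3, 4, 5, 6>: lane 3
  // of the first shuffle operand (Incoming) followed by lanes 0-2 of the
  // second operand (the current part of Previous). This realizes the
  // "v3 = vector(v1(3), v2(0, 1, 2))" step from the sketch above.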
3634 
3635   // The vector from which to take the initial value for the current iteration
3636   // (actual or unrolled). Initially, this is the vector phi node.
3637   Value *Incoming = VecPhi;
3638 
3639   // Shuffle the current and previous vector and update the vector parts.
3640   for (unsigned Part = 0; Part < UF; ++Part) {
3641     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3642     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3643     auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3644                                                          ShuffleMask)
3645                            : Incoming;
3646     PhiPart->replaceAllUsesWith(Shuffle);
3647     cast<Instruction>(PhiPart)->eraseFromParent();
3648     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3649     Incoming = PreviousPart;
3650   }
3651 
3652   // Fix the latch value of the new recurrence in the vector loop.
3653   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3654 
3655   // Extract the last vector element in the middle block. This will be the
3656   // initial value for the recurrence when jumping to the scalar loop.
3657   auto *ExtractForScalar = Incoming;
3658   if (VF > 1) {
3659     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3660     ExtractForScalar = Builder.CreateExtractElement(
3661         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3662   }
  // Extract the second-to-last element in the middle block if the Phi is
  // used outside the loop. We need to extract the phi itself and not the last
  // element (the phi update in the current iteration). This will be the value
  // when jumping to the exit block from the LoopMiddleBlock, in case the
  // scalar loop does not run at all.
3668   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3669   if (VF > 1)
3670     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3671         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second-to-last element when VF > 1.
3676   else if (UF > 1)
3677     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
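  // For example (illustrative), with VF = 4 the vectorized path extracts lane
  // 2 (VF - 2) of the last `Incoming` vector: the value the recurrence phi
  // itself held in the final vector iteration, as opposed to lane 3, which
  // holds its update.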
3678 
3679   // Fix the initial value of the original recurrence in the scalar loop.
3680   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3681   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3682   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3683     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3684     Start->addIncoming(Incoming, BB);
3685   }
3686 
3687   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3688   Phi->setName("scalar.recur");
3689 
3690   // Finally, fix users of the recurrence outside the loop. The users will need
3691   // either the last value of the scalar recurrence or the last value of the
3692   // vector recurrence we extracted in the middle block. Since the loop is in
3693   // LCSSA form, we just need to find all the phi nodes for the original scalar
3694   // recurrence in the exit block, and then add an edge for the middle block.
3695   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3696     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3697       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3698     }
3699   }
3700 }
3701 
3702 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3703   Constant *Zero = Builder.getInt32(0);
3704 
  // Get its reduction variable descriptor.
3706   assert(Legal->isReductionVariable(Phi) &&
3707          "Unable to find the reduction variable");
3708   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3709 
3710   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3711   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3712   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3713   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3714     RdxDesc.getMinMaxRecurrenceKind();
3715   setDebugLocFromInst(Builder, ReductionStartValue);
3716 
3717   // We need to generate a reduction vector from the incoming scalar.
3718   // To do so, we need to generate the 'identity' vector and override
3719   // one of the elements with the incoming scalar reduction. We need
3720   // to do it in the vector-loop preheader.
3721   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3722 
3723   // This is the vector-clone of the value that leaves the loop.
3724   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3725 
  // Find the reduction identity value: zero for addition, or and xor; one for
  // multiplication; -1 for and.
3728   Value *Identity;
3729   Value *VectorStart;
3730   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3731       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3733     if (VF == 1) {
3734       VectorStart = Identity = ReductionStartValue;
3735     } else {
3736       VectorStart = Identity =
3737         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3738     }
3739   } else {
3740     // Handle other reduction kinds:
3741     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3742         RK, VecTy->getScalarType());
3743     if (VF == 1) {
3744       Identity = Iden;
      // In the scalar case, start directly from the incoming scalar reduction
      // value; there is no identity vector to insert it into.
3747       VectorStart = ReductionStartValue;
3748     } else {
3749       Identity = ConstantVector::getSplat({VF, false}, Iden);
3750 
3751       // This vector is the Identity vector where the first element is the
3752       // incoming scalar reduction.
3753       VectorStart =
3754         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3755     }
3756   }
3757 
3758   // Wrap flags are in general invalid after vectorization, clear them.
3759   clearReductionWrapFlags(RdxDesc);
3760 
3761   // Fix the vector-loop phi.
3762 
3763   // Reductions do not have to start at zero. They can start with
3764   // any loop invariant values.
3765   BasicBlock *Latch = OrigLoop->getLoopLatch();
3766   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3767 
3768   for (unsigned Part = 0; Part < UF; ++Part) {
3769     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3770     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3771     // Make sure to add the reduction start value only to the
3772     // first unroll part.
3773     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3774     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3775     cast<PHINode>(VecRdxPhi)
3776       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3777   }
3778 
3779   // Before each round, move the insertion point right between
3780   // the PHIs and the values we are going to write.
3781   // This allows us to write both PHINodes and the extractelement
3782   // instructions.
3783   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3784 
3785   setDebugLocFromInst(Builder, LoopExitInst);
3786 
3787   // If tail is folded by masking, the vector value to leave the loop should be
3788   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3789   // instead of the former.
3790   if (Cost->foldTailByMasking()) {
3791     for (unsigned Part = 0; Part < UF; ++Part) {
3792       Value *VecLoopExitInst =
3793           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3794       Value *Sel = nullptr;
3795       for (User *U : VecLoopExitInst->users()) {
3796         if (isa<SelectInst>(U)) {
3797           assert(!Sel && "Reduction exit feeding two selects");
3798           Sel = U;
3799         } else
3800           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3801       }
3802       assert(Sel && "Reduction exit feeds no select");
3803       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3804     }
3805   }
3806 
3807   // If the vector reduction can be performed in a smaller type, we truncate
3808   // then extend the loop exit value to enable InstCombine to evaluate the
3809   // entire expression in the smaller type.
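  //
  // For example (illustrative), if an i32 reduction phi can be evaluated in
  // i8 (RdxDesc.getRecurrenceType() == i8), each part is truncated to
  // <VF x i8> and sign/zero-extended back inside the loop, and the middle
  // block then operates on the truncated <VF x i8> values.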
3810   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3811     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3812     Builder.SetInsertPoint(
3813         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3814     VectorParts RdxParts(UF);
3815     for (unsigned Part = 0; Part < UF; ++Part) {
3816       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3817       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3818       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3819                                         : Builder.CreateZExt(Trunc, VecTy);
3820       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3821            UI != RdxParts[Part]->user_end();)
3822         if (*UI != Trunc) {
3823           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3824           RdxParts[Part] = Extnd;
3825         } else {
3826           ++UI;
3827         }
3828     }
3829     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3830     for (unsigned Part = 0; Part < UF; ++Part) {
3831       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3832       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3833     }
3834   }
3835 
3836   // Reduce all of the unrolled parts into a single vector.
3837   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3838   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3839 
3840   // The middle block terminator has already been assigned a DebugLoc here (the
3841   // OrigLoop's single latch terminator). We want the whole middle block to
3842   // appear to execute on this line because: (a) it is all compiler generated,
3843   // (b) these instructions are always executed after evaluating the latch
3844   // conditional branch, and (c) other passes may add new predecessors which
3845   // terminate on this line. This is the easiest way to ensure we don't
3846   // accidentally cause an extra step back into the loop while debugging.
3847   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3848   for (unsigned Part = 1; Part < UF; ++Part) {
3849     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3850     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3851       // Floating point operations had to be 'fast' to enable the reduction.
3852       ReducedPartRdx = addFastMathFlag(
3853           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3854                               ReducedPartRdx, "bin.rdx"),
3855           RdxDesc.getFastMathFlags());
3856     else
3857       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3858                                       RdxPart);
3859   }
3860 
3861   if (VF > 1) {
3862     bool NoNaN = Legal->hasFunNoNaNAttr();
3863     ReducedPartRdx =
3864         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3865     // If the reduction can be performed in a smaller type, we need to extend
3866     // the reduction to the wider type before we branch to the original loop.
3867     if (Phi->getType() != RdxDesc.getRecurrenceType())
3868       ReducedPartRdx =
3869         RdxDesc.isSigned()
3870         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3871         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3872   }
3873 
3874   // Create a phi node that merges control-flow from the backedge-taken check
3875   // block and the middle block.
3876   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3877                                         LoopScalarPreHeader->getTerminator());
3878   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3879     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3880   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3881 
3882   // Now, we need to fix the users of the reduction variable
3883   // inside and outside of the scalar remainder loop.
3884   // We know that the loop is in LCSSA form. We need to update the
3885   // PHI nodes in the exit blocks.
3886   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3887     // All PHINodes need to have a single entry edge, or two if
3888     // we already fixed them.
3889     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3890 
3891     // We found a reduction value exit-PHI. Update it with the
3892     // incoming bypass edge.
3893     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3894       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3895   } // end of the LCSSA phi scan.
3896 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3899   int IncomingEdgeBlockIdx =
3900     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3901   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3902   // Pick the other block.
3903   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3904   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3905   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3906 }
3907 
3908 void InnerLoopVectorizer::clearReductionWrapFlags(
3909     RecurrenceDescriptor &RdxDesc) {
3910   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3911   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3912       RK != RecurrenceDescriptor::RK_IntegerMult)
3913     return;
3914 
3915   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3916   assert(LoopExitInstr && "null loop exit instruction");
3917   SmallVector<Instruction *, 8> Worklist;
3918   SmallPtrSet<Instruction *, 8> Visited;
3919   Worklist.push_back(LoopExitInstr);
3920   Visited.insert(LoopExitInstr);
3921 
3922   while (!Worklist.empty()) {
3923     Instruction *Cur = Worklist.pop_back_val();
3924     if (isa<OverflowingBinaryOperator>(Cur))
3925       for (unsigned Part = 0; Part < UF; ++Part) {
3926         Value *V = getOrCreateVectorValue(Cur, Part);
3927         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3928       }
3929 
3930     for (User *U : Cur->users()) {
3931       Instruction *UI = cast<Instruction>(U);
3932       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3933           Visited.insert(UI).second)
3934         Worklist.push_back(UI);
3935     }
3936   }
3937 }
3938 
3939 void InnerLoopVectorizer::fixLCSSAPHIs() {
3940   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3941     if (LCSSAPhi.getNumIncomingValues() == 1) {
3942       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values only have a single, lane-zero value.
3944       unsigned LastLane = 0;
3945       if (isa<Instruction>(IncomingValue))
3946           LastLane = Cost->isUniformAfterVectorization(
3947                          cast<Instruction>(IncomingValue), VF)
3948                          ? 0
3949                          : VF - 1;
3950       // Can be a loop invariant incoming value or the last scalar value to be
3951       // extracted from the vectorized loop.
3952       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
      Value *LastIncomingValue =
          getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
      LCSSAPhi.addIncoming(LastIncomingValue, LoopMiddleBlock);
3956     }
3957   }
3958 }
3959 
3960 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3961   // The basic block and loop containing the predicated instruction.
3962   auto *PredBB = PredInst->getParent();
3963   auto *VectorLoop = LI->getLoopFor(PredBB);
3964 
3965   // Initialize a worklist with the operands of the predicated instruction.
3966   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3967 
3968   // Holds instructions that we need to analyze again. An instruction may be
3969   // reanalyzed if we don't yet know if we can sink it or not.
3970   SmallVector<Instruction *, 8> InstsToReanalyze;
3971 
3972   // Returns true if a given use occurs in the predicated block. Phi nodes use
3973   // their operands in their corresponding predecessor blocks.
3974   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3975     auto *I = cast<Instruction>(U.getUser());
3976     BasicBlock *BB = I->getParent();
3977     if (auto *Phi = dyn_cast<PHINode>(I))
3978       BB = Phi->getIncomingBlock(
3979           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3980     return BB == PredBB;
3981   };
3982 
3983   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
3985   // operands are then added to the worklist. The algorithm ends after one pass
3986   // through the worklist doesn't sink a single instruction.
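  //
  // For illustration only: if a scalarized address computation used solely by
  // instructions in the predicated block (e.g. a GEP feeding a predicated
  // store) is found, it is moved into that block so it executes only under
  // the predicate, and its operands are then reconsidered for sinking.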
3987   bool Changed;
3988   do {
3989     // Add the instructions that need to be reanalyzed to the worklist, and
3990     // reset the changed indicator.
3991     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3992     InstsToReanalyze.clear();
3993     Changed = false;
3994 
3995     while (!Worklist.empty()) {
3996       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3997 
3998       // We can't sink an instruction if it is a phi node, is already in the
3999       // predicated block, is not in the loop, or may have side effects.
4000       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4001           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4002         continue;
4003 
4004       // It's legal to sink the instruction if all its uses occur in the
4005       // predicated block. Otherwise, there's nothing to do yet, and we may
4006       // need to reanalyze the instruction.
4007       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4008         InstsToReanalyze.push_back(I);
4009         continue;
4010       }
4011 
4012       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4014       I->moveBefore(&*PredBB->getFirstInsertionPt());
4015       Worklist.insert(I->op_begin(), I->op_end());
4016 
4017       // The sinking may have enabled other instructions to be sunk, so we will
4018       // need to iterate.
4019       Changed = true;
4020     }
4021   } while (Changed);
4022 }
4023 
4024 void InnerLoopVectorizer::fixNonInductionPHIs() {
4025   for (PHINode *OrigPhi : OrigPHIsToFix) {
4026     PHINode *NewPhi =
4027         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4028     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4029 
4030     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4031         predecessors(OrigPhi->getParent()));
4032     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4033         predecessors(NewPhi->getParent()));
4034     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4035            "Scalar and Vector BB should have the same number of predecessors");
4036 
4037     // The insertion point in Builder may be invalidated by the time we get
4038     // here. Force the Builder insertion point to something valid so that we do
4039     // not run into issues during insertion point restore in
4040     // getOrCreateVectorValue calls below.
4041     Builder.SetInsertPoint(NewPhi);
4042 
4043     // The predecessor order is preserved and we can rely on mapping between
4044     // scalar and vector block predecessors.
4045     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4046       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4047 
4048       // When looking up the new scalar/vector values to fix up, use incoming
4049       // values from original phi.
4050       Value *ScIncV =
4051           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4052 
4053       // Scalar incoming value may need a broadcast
4054       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4055       NewPhi->addIncoming(NewIncV, NewPredBB);
4056     }
4057   }
4058 }
4059 
4060 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4061                                    unsigned VF, bool IsPtrLoopInvariant,
4062                                    SmallBitVector &IsIndexLoopInvariant) {
4063   // Construct a vector GEP by widening the operands of the scalar GEP as
4064   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4065   // results in a vector of pointers when at least one operand of the GEP
4066   // is vector-typed. Thus, to keep the representation compact, we only use
4067   // vector-typed operands for loop-varying values.
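  //
  // Illustrative example (assumed IR, not generated verbatim): for VF = 4,
  //   %gep = getelementptr i32, i32* %base, i64 %i
  // with a loop-varying %i becomes
  //   %gep.vec = getelementptr i32, i32* %base, <4 x i64> %i.vec
  // which produces a <4 x i32*> vector of pointers.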
4068 
4069   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4070     // If we are vectorizing, but the GEP has only loop-invariant operands,
4071     // the GEP we build (by only using vector-typed operands for
4072     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4073     // produce a vector of pointers, we need to either arbitrarily pick an
4074     // operand to broadcast, or broadcast a clone of the original GEP.
4075     // Here, we broadcast a clone of the original.
4076     //
4077     // TODO: If at some point we decide to scalarize instructions having
4078     //       loop-invariant operands, this special case will no longer be
4079     //       required. We would add the scalarization decision to
4080     //       collectLoopScalars() and teach getVectorValue() to broadcast
4081     //       the lane-zero scalar value.
4082     auto *Clone = Builder.Insert(GEP->clone());
4083     for (unsigned Part = 0; Part < UF; ++Part) {
4084       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4085       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4086       addMetadata(EntryPart, GEP);
4087     }
4088   } else {
4089     // If the GEP has at least one loop-varying operand, we are sure to
4090     // produce a vector of pointers. But if we are only unrolling, we want
4091     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4092     // produce with the code below will be scalar (if VF == 1) or vector
4093     // (otherwise). Note that for the unroll-only case, we still maintain
4094     // values in the vector mapping with initVector, as we do for other
4095     // instructions.
4096     for (unsigned Part = 0; Part < UF; ++Part) {
4097       // The pointer operand of the new GEP. If it's loop-invariant, we
4098       // won't broadcast it.
4099       auto *Ptr = IsPtrLoopInvariant
4100                       ? GEP->getPointerOperand()
4101                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4102 
4103       // Collect all the indices for the new GEP. If any index is
4104       // loop-invariant, we won't broadcast it.
4105       SmallVector<Value *, 4> Indices;
4106       for (auto Index : enumerate(GEP->indices())) {
4107         Value *User = Index.value().get();
4108         if (IsIndexLoopInvariant[Index.index()])
4109           Indices.push_back(User);
4110         else
4111           Indices.push_back(getOrCreateVectorValue(User, Part));
4112       }
4113 
4114       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4115       // but it should be a vector, otherwise.
4116       auto *NewGEP =
4117           GEP->isInBounds()
4118               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4119                                           Indices)
4120               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4121       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4122              "NewGEP is not a pointer vector");
4123       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4124       addMetadata(NewGEP, GEP);
4125     }
4126   }
4127 }
4128 
4129 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4130                                               unsigned VF) {
4131   PHINode *P = cast<PHINode>(PN);
4132   if (EnableVPlanNativePath) {
4133     // Currently we enter here in the VPlan-native path for non-induction
4134     // PHIs where all control flow is uniform. We simply widen these PHIs.
4135     // Create a vector phi with no operands - the vector phi operands will be
4136     // set at the end of vector code generation.
4137     Type *VecTy =
4138         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4139     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4140     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4141     OrigPHIsToFix.push_back(P);
4142 
4143     return;
4144   }
4145 
4146   assert(PN->getParent() == OrigLoop->getHeader() &&
4147          "Non-header phis should have been handled elsewhere");
4148 
4149   // In order to support recurrences we need to be able to vectorize Phi nodes.
4150   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4151   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4152   // this value when we vectorize all of the instructions that use the PHI.
4153   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4154     for (unsigned Part = 0; Part < UF; ++Part) {
4155       // This is phase one of vectorizing PHIs.
4156       Type *VecTy =
4157           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4158       Value *EntryPart = PHINode::Create(
4159           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4160       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4161     }
4162     return;
4163   }
4164 
4165   setDebugLocFromInst(Builder, P);
4166 
4167   // This PHINode must be an induction variable.
4168   // Make sure that we know about it.
4169   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4170 
4171   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4172   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4173 
4174   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4175   // which can be found from the original scalar operations.
4176   switch (II.getKind()) {
4177   case InductionDescriptor::IK_NoInduction:
4178     llvm_unreachable("Unknown induction");
4179   case InductionDescriptor::IK_IntInduction:
4180   case InductionDescriptor::IK_FpInduction:
4181     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4182   case InductionDescriptor::IK_PtrInduction: {
4183     // Handle the pointer induction variable case.
4184     assert(P->getType()->isPointerTy() && "Unexpected type.");
4185     // This is the normalized GEP that starts counting at zero.
4186     Value *PtrInd = Induction;
4187     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4188     // Determine the number of scalars we need to generate for each unroll
4189     // iteration. If the instruction is uniform, we only need to generate the
4190     // first lane. Otherwise, we generate all VF values.
4191     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4192     // These are the scalar results. Notice that we don't generate vector GEPs
4193     // because scalar GEPs result in better code.
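    //
    // For example (illustrative), with VF = 4, UF = 1 and a non-uniform
    // pointer induction, this emits four scalar "next.gep" values at offsets
    // PtrInd + 0 .. PtrInd + 3 rather than a single wide GEP.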
4194     for (unsigned Part = 0; Part < UF; ++Part) {
4195       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4196         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4197         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4198         Value *SclrGep =
4199             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4200         SclrGep->setName("next.gep");
4201         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4202       }
4203     }
4204     return;
4205   }
4206   }
4207 }
4208 
4209 /// A helper function for checking whether an integer division-related
4210 /// instruction may divide by zero (in which case it must be predicated if
4211 /// executed conditionally in the scalar code).
4212 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
4216 static bool mayDivideByZero(Instruction &I) {
4217   assert((I.getOpcode() == Instruction::UDiv ||
4218           I.getOpcode() == Instruction::SDiv ||
4219           I.getOpcode() == Instruction::URem ||
4220           I.getOpcode() == Instruction::SRem) &&
4221          "Unexpected instruction");
4222   Value *Divisor = I.getOperand(1);
4223   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4224   return !CInt || CInt->isZero();
4225 }
4226 
4227 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4228   switch (I.getOpcode()) {
4229   case Instruction::Call:
4230   case Instruction::Br:
4231   case Instruction::PHI:
4232   case Instruction::GetElementPtr:
4233   case Instruction::Select:
4234     llvm_unreachable("This instruction is handled by a different recipe.");
4235   case Instruction::UDiv:
4236   case Instruction::SDiv:
4237   case Instruction::SRem:
4238   case Instruction::URem:
4239   case Instruction::Add:
4240   case Instruction::FAdd:
4241   case Instruction::Sub:
4242   case Instruction::FSub:
4243   case Instruction::FNeg:
4244   case Instruction::Mul:
4245   case Instruction::FMul:
4246   case Instruction::FDiv:
4247   case Instruction::FRem:
4248   case Instruction::Shl:
4249   case Instruction::LShr:
4250   case Instruction::AShr:
4251   case Instruction::And:
4252   case Instruction::Or:
4253   case Instruction::Xor: {
4254     // Just widen unops and binops.
4255     setDebugLocFromInst(Builder, &I);
4256 
4257     for (unsigned Part = 0; Part < UF; ++Part) {
4258       SmallVector<Value *, 2> Ops;
4259       for (Value *Op : I.operands())
4260         Ops.push_back(getOrCreateVectorValue(Op, Part));
4261 
4262       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4263 
4264       if (auto *VecOp = dyn_cast<Instruction>(V))
4265         VecOp->copyIRFlags(&I);
4266 
4267       // Use this vector value for all users of the original instruction.
4268       VectorLoopValueMap.setVectorValue(&I, Part, V);
4269       addMetadata(V, &I);
4270     }
4271 
4272     break;
4273   }
4274   case Instruction::ICmp:
4275   case Instruction::FCmp: {
4276     // Widen compares. Generate vector compares.
4277     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4278     auto *Cmp = cast<CmpInst>(&I);
4279     setDebugLocFromInst(Builder, Cmp);
4280     for (unsigned Part = 0; Part < UF; ++Part) {
4281       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4282       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4283       Value *C = nullptr;
4284       if (FCmp) {
4285         // Propagate fast math flags.
4286         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4287         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4288         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4289       } else {
4290         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4291       }
4292       VectorLoopValueMap.setVectorValue(&I, Part, C);
4293       addMetadata(C, &I);
4294     }
4295 
4296     break;
4297   }
4298 
4299   case Instruction::ZExt:
4300   case Instruction::SExt:
4301   case Instruction::FPToUI:
4302   case Instruction::FPToSI:
4303   case Instruction::FPExt:
4304   case Instruction::PtrToInt:
4305   case Instruction::IntToPtr:
4306   case Instruction::SIToFP:
4307   case Instruction::UIToFP:
4308   case Instruction::Trunc:
4309   case Instruction::FPTrunc:
4310   case Instruction::BitCast: {
4311     auto *CI = cast<CastInst>(&I);
4312     setDebugLocFromInst(Builder, CI);
4313 
4314     /// Vectorize casts.
4315     Type *DestTy =
4316         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4317 
4318     for (unsigned Part = 0; Part < UF; ++Part) {
4319       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4320       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4321       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4322       addMetadata(Cast, &I);
4323     }
4324     break;
4325   }
4326   default:
4327     // This instruction is not vectorized by simple widening.
4328     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4329     llvm_unreachable("Unhandled instruction!");
4330   } // end of switch.
4331 }
4332 
4333 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4334                                                VPTransformState &State) {
4335   assert(!isa<DbgInfoIntrinsic>(I) &&
4336          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4337   setDebugLocFromInst(Builder, &I);
4338 
4339   Module *M = I.getParent()->getParent()->getParent();
4340   auto *CI = cast<CallInst>(&I);
4341 
4342   SmallVector<Type *, 4> Tys;
4343   for (Value *ArgOperand : CI->arg_operands())
4344     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4345 
4346   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4347 
  // The flag shows whether we use an intrinsic or a regular call for the
  // vectorized version of the instruction, i.e., whether it is beneficial to
  // vectorize with an intrinsic call compared to a library call.
4351   bool NeedToScalarize = false;
4352   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4353   bool UseVectorIntrinsic =
4354       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4355   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4356          "Instruction should be scalarized elsewhere.");
4357 
4358   for (unsigned Part = 0; Part < UF; ++Part) {
4359     SmallVector<Value *, 4> Args;
4360     for (auto &I : enumerate(ArgOperands.operands())) {
4361       // Some intrinsics have a scalar argument - don't replace it with a
4362       // vector.
4363       Value *Arg;
4364       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4365         Arg = State.get(I.value(), Part);
4366       else
4367         Arg = State.get(I.value(), {0, 0});
4368       Args.push_back(Arg);
4369     }
4370 
4371     Function *VectorF;
4372     if (UseVectorIntrinsic) {
4373       // Use vector version of the intrinsic.
4374       Type *TysForDecl[] = {CI->getType()};
4375       if (VF > 1)
4376         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4377       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4378     } else {
4379       // Use vector version of the function call.
4380       const VFShape Shape =
4381           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4382 #ifndef NDEBUG
      const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
      assert(std::find_if(Infos.begin(), Infos.end(),
                          [&Shape](const VFInfo &Info) {
                            return Info.Shape == Shape;
                          }) != Infos.end() &&
             "Vector function shape is missing from the database.");
#endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    assert(VectorF && "Can't create vector function.");

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
4403   }
4404 }
4405 
4406 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4407                                                  bool InvariantCond) {
4408   setDebugLocFromInst(Builder, &I);
4409 
  // The condition can be loop invariant but still defined inside the
4411   // loop. This means that we can't just use the original 'cond' value.
4412   // We have to take the 'vectorized' value and pick the first lane.
4413   // Instcombine will make this a no-op.
4414 
4415   auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4416 
4417   for (unsigned Part = 0; Part < UF; ++Part) {
4418     Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4419     Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4420     Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4421     Value *Sel =
4422         Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4423     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4424     addMetadata(Sel, &I);
4425   }
4426 }
4427 
4428 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4429   // We should not collect Scalars more than once per VF. Right now, this
4430   // function is called from collectUniformsAndScalars(), which already does
4431   // this check. Collecting Scalars for VF=1 does not make any sense.
4432   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4433          "This function should not be visited twice for the same VF");
4434 
4435   SmallSetVector<Instruction *, 8> Worklist;
4436 
4437   // These sets are used to seed the analysis with pointers used by memory
4438   // accesses that will remain scalar.
4439   SmallSetVector<Instruction *, 8> ScalarPtrs;
4440   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4441 
4442   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4443   // The pointer operands of loads and stores will be scalar as long as the
4444   // memory access is not a gather or scatter operation. The value operand of a
4445   // store will remain scalar if the store is scalarized.
4446   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4447     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4448     assert(WideningDecision != CM_Unknown &&
4449            "Widening decision should be ready at this moment");
4450     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4451       if (Ptr == Store->getValueOperand())
4452         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4455     return WideningDecision != CM_GatherScatter;
4456   };
4457 
4458   // A helper that returns true if the given value is a bitcast or
4459   // getelementptr instruction contained in the loop.
4460   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4461     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4462             isa<GetElementPtrInst>(V)) &&
4463            !TheLoop->isLoopInvariant(V);
4464   };
4465 
4466   // A helper that evaluates a memory access's use of a pointer. If the use
4467   // will be a scalar use, and the pointer is only used by memory accesses, we
4468   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4469   // PossibleNonScalarPtrs.
4470   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4471     // We only care about bitcast and getelementptr instructions contained in
4472     // the loop.
4473     if (!isLoopVaryingBitCastOrGEP(Ptr))
4474       return;
4475 
4476     // If the pointer has already been identified as scalar (e.g., if it was
4477     // also identified as uniform), there's nothing to do.
4478     auto *I = cast<Instruction>(Ptr);
4479     if (Worklist.count(I))
4480       return;
4481 
4482     // If the use of the pointer will be a scalar use, and all users of the
4483     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4484     // place the pointer in PossibleNonScalarPtrs.
4485     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4486           return isa<LoadInst>(U) || isa<StoreInst>(U);
4487         }))
4488       ScalarPtrs.insert(I);
4489     else
4490       PossibleNonScalarPtrs.insert(I);
4491   };
4492 
4493   // We seed the scalars analysis with three classes of instructions: (1)
4494   // instructions marked uniform-after-vectorization, (2) bitcast and
4495   // getelementptr instructions used by memory accesses requiring a scalar use,
4496   // and (3) pointer induction variables and their update instructions (we
4497   // currently only scalarize these).
4498   //
4499   // (1) Add to the worklist all instructions that have been identified as
4500   // uniform-after-vectorization.
4501   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4502 
4503   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4504   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4506   // scatter operation. The value operand of a store will remain scalar if the
4507   // store is scalarized.
4508   for (auto *BB : TheLoop->blocks())
4509     for (auto &I : *BB) {
4510       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4511         evaluatePtrUse(Load, Load->getPointerOperand());
4512       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4513         evaluatePtrUse(Store, Store->getPointerOperand());
4514         evaluatePtrUse(Store, Store->getValueOperand());
4515       }
4516     }
4517   for (auto *I : ScalarPtrs)
4518     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4519       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4520       Worklist.insert(I);
4521     }
4522 
4523   // (3) Add to the worklist all pointer induction variables and their update
4524   // instructions.
4525   //
4526   // TODO: Once we are able to vectorize pointer induction variables we should
4527   //       no longer insert them into the worklist here.
4528   auto *Latch = TheLoop->getLoopLatch();
4529   for (auto &Induction : Legal->getInductionVars()) {
4530     auto *Ind = Induction.first;
4531     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4532     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4533       continue;
4534     Worklist.insert(Ind);
4535     Worklist.insert(IndUpdate);
4536     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4537     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4538                       << "\n");
4539   }
4540 
4541   // Insert the forced scalars.
4542   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4543   // induction variable when the PHI user is scalarized.
4544   auto ForcedScalar = ForcedScalars.find(VF);
4545   if (ForcedScalar != ForcedScalars.end())
4546     for (auto *I : ForcedScalar->second)
4547       Worklist.insert(I);
4548 
4549   // Expand the worklist by looking through any bitcasts and getelementptr
4550   // instructions we've already identified as scalar. This is similar to the
4551   // expansion step in collectLoopUniforms(); however, here we're only
4552   // expanding to include additional bitcasts and getelementptr instructions.
4553   unsigned Idx = 0;
4554   while (Idx != Worklist.size()) {
4555     Instruction *Dst = Worklist[Idx++];
4556     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4557       continue;
4558     auto *Src = cast<Instruction>(Dst->getOperand(0));
4559     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4560           auto *J = cast<Instruction>(U);
4561           return !TheLoop->contains(J) || Worklist.count(J) ||
4562                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4563                   isScalarUse(J, Src));
4564         })) {
4565       Worklist.insert(Src);
4566       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4567     }
4568   }
4569 
4570   // An induction variable will remain scalar if all users of the induction
4571   // variable and induction variable update remain scalar.
4572   for (auto &Induction : Legal->getInductionVars()) {
4573     auto *Ind = Induction.first;
4574     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4575 
4576     // We already considered pointer induction variables, so there's no reason
4577     // to look at their users again.
4578     //
4579     // TODO: Once we are able to vectorize pointer induction variables we
4580     //       should no longer skip over them here.
4581     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4582       continue;
4583 
4584     // Determine if all users of the induction variable are scalar after
4585     // vectorization.
4586     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4587       auto *I = cast<Instruction>(U);
4588       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4589     });
4590     if (!ScalarInd)
4591       continue;
4592 
4593     // Determine if all users of the induction variable update instruction are
4594     // scalar after vectorization.
4595     auto ScalarIndUpdate =
4596         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4597           auto *I = cast<Instruction>(U);
4598           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4599         });
4600     if (!ScalarIndUpdate)
4601       continue;
4602 
4603     // The induction variable and its update instruction will remain scalar.
4604     Worklist.insert(Ind);
4605     Worklist.insert(IndUpdate);
4606     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4607     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4608                       << "\n");
4609   }
4610 
4611   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4612 }
4613 
4614 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4615   if (!blockNeedsPredication(I->getParent()))
4616     return false;
4617   switch(I->getOpcode()) {
4618   default:
4619     break;
4620   case Instruction::Load:
4621   case Instruction::Store: {
4622     if (!Legal->isMaskRequired(I))
4623       return false;
4624     auto *Ptr = getLoadStorePointerOperand(I);
4625     auto *Ty = getMemInstValueType(I);
4626     // We have already decided how to vectorize this instruction, get that
4627     // result.
4628     if (VF > 1) {
4629       InstWidening WideningDecision = getWideningDecision(I, VF);
4630       assert(WideningDecision != CM_Unknown &&
4631              "Widening decision should be ready at this moment");
4632       return WideningDecision == CM_Scalarize;
4633     }
4634     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4635     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4636                                 isLegalMaskedGather(Ty, Alignment))
4637                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4638                                 isLegalMaskedScatter(Ty, Alignment));
4639   }
4640   case Instruction::UDiv:
4641   case Instruction::SDiv:
4642   case Instruction::SRem:
4643   case Instruction::URem:
4644     return mayDivideByZero(*I);
4645   }
4646   return false;
4647 }
4648 
4649 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4650                                                                unsigned VF) {
4651   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4652   assert(getWideningDecision(I, VF) == CM_Unknown &&
4653          "Decision should not be set yet.");
4654   auto *Group = getInterleavedAccessGroup(I);
4655   assert(Group && "Must have a group.");
4656 
  // If the instruction's allocated size doesn't equal its type size, it
4658   // requires padding and will be scalarized.
4659   auto &DL = I->getModule()->getDataLayout();
4660   auto *ScalarTy = getMemInstValueType(I);
4661   if (hasIrregularType(ScalarTy, DL, VF))
4662     return false;
4663 
4664   // Check if masking is required.
4665   // A Group may need masking for one of two reasons: it resides in a block that
4666   // needs predication, or it was decided to use masking to deal with gaps.
4667   bool PredicatedAccessRequiresMasking =
4668       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4669   bool AccessWithGapsRequiresMasking =
4670       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4671   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4672     return true;
4673 
4674   // If masked interleaving is required, we expect that the user/target had
4675   // enabled it, because otherwise it either wouldn't have been created or
4676   // it should have been invalidated by the CostModel.
4677   assert(useMaskedInterleavedAccesses(TTI) &&
4678          "Masked interleave-groups for predicated accesses are not enabled.");
4679 
4680   auto *Ty = getMemInstValueType(I);
4681   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4682   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4683                           : TTI.isLegalMaskedStore(Ty, Alignment);
4684 }
4685 
4686 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4687                                                                unsigned VF) {
4688   // Get and ensure we have a valid memory instruction.
4689   LoadInst *LI = dyn_cast<LoadInst>(I);
4690   StoreInst *SI = dyn_cast<StoreInst>(I);
4691   assert((LI || SI) && "Invalid memory instruction");
4692 
4693   auto *Ptr = getLoadStorePointerOperand(I);
4694 
4695   // In order to be widened, the pointer should be consecutive, first of all.
4696   if (!Legal->isConsecutivePtr(Ptr))
4697     return false;
4698 
4699   // If the instruction is a store located in a predicated block, it will be
4700   // scalarized.
4701   if (isScalarWithPredication(I))
4702     return false;
4703 
  // If the instruction's allocated size doesn't equal its type size, it
4705   // requires padding and will be scalarized.
4706   auto &DL = I->getModule()->getDataLayout();
4707   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4708   if (hasIrregularType(ScalarTy, DL, VF))
4709     return false;
4710 
4711   return true;
4712 }
4713 
4714 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4715   // We should not collect Uniforms more than once per VF. Right now,
4716   // this function is called from collectUniformsAndScalars(), which
4717   // already does this check. Collecting Uniforms for VF=1 does not make any
4718   // sense.
4719 
4720   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4721          "This function should not be visited twice for the same VF");
4722 
  // Visit the list of Uniforms. Even if we do not find any uniform value, we
  // will not analyze this VF again: Uniforms.count(VF) will return 1.
4725   Uniforms[VF].clear();
4726 
4727   // We now know that the loop is vectorizable!
4728   // Collect instructions inside the loop that will remain uniform after
4729   // vectorization.
4730 
  // Global values, params, and instructions outside of the current loop are
  // out of scope.
4733   auto isOutOfScope = [&](Value *V) -> bool {
4734     Instruction *I = dyn_cast<Instruction>(V);
4735     return (!I || !TheLoop->contains(I));
4736   };
4737 
4738   SetVector<Instruction *> Worklist;
4739   BasicBlock *Latch = TheLoop->getLoopLatch();
4740 
4741   // Instructions that are scalar with predication must not be considered
4742   // uniform after vectorization, because that would create an erroneous
4743   // replicating region where only a single instance out of VF should be formed.
4744   // TODO: optimize such seldom cases if found important, see PR40816.
4745   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4746     if (isScalarWithPredication(I, VF)) {
4747       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4748                         << *I << "\n");
4749       return;
4750     }
4751     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4752     Worklist.insert(I);
4753   };
4754 
4755   // Start with the conditional branch. If the branch condition is an
4756   // instruction contained in the loop that is only used by the branch, it is
4757   // uniform.
4758   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4759   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4760     addToWorklistIfAllowed(Cmp);
4761 
4762   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4763   // are pointers that are treated like consecutive pointers during
4764   // vectorization. The pointer operands of interleaved accesses are an
4765   // example.
4766   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4767 
4768   // Holds pointer operands of instructions that are possibly non-uniform.
4769   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4770 
4771   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4772     InstWidening WideningDecision = getWideningDecision(I, VF);
4773     assert(WideningDecision != CM_Unknown &&
4774            "Widening decision should be ready at this moment");
4775 
4776     return (WideningDecision == CM_Widen ||
4777             WideningDecision == CM_Widen_Reverse ||
4778             WideningDecision == CM_Interleave);
4779   };
4780   // Iterate over the instructions in the loop, and collect all
4781   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4782   // that a consecutive-like pointer operand will be scalarized, we collect it
4783   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4784   // getelementptr instruction can be used by both vectorized and scalarized
4785   // memory instructions. For example, if a loop loads and stores from the same
4786   // location, but the store is conditional, the store will be scalarized, and
4787   // the getelementptr won't remain uniform.
4788   for (auto *BB : TheLoop->blocks())
4789     for (auto &I : *BB) {
4790       // If there's no pointer operand, there's nothing to do.
4791       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4792       if (!Ptr)
4793         continue;
4794 
4795       // True if all users of Ptr are memory accesses that have Ptr as their
4796       // pointer operand.
4797       auto UsersAreMemAccesses =
4798           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4799             return getLoadStorePointerOperand(U) == Ptr;
4800           });
4801 
4802       // Ensure the memory instruction will not be scalarized or used by
4803       // gather/scatter, making its pointer operand non-uniform. If the pointer
4804       // operand is used by any instruction other than a memory access, we
4805       // conservatively assume the pointer operand may be non-uniform.
4806       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4807         PossibleNonUniformPtrs.insert(Ptr);
4808 
4809       // If the memory instruction will be vectorized and its pointer operand
4810       // is consecutive-like, or interleaving - the pointer operand should
4811       // remain uniform.
4812       else
4813         ConsecutiveLikePtrs.insert(Ptr);
4814     }
4815 
4816   // Add to the Worklist all consecutive and consecutive-like pointers that
4817   // aren't also identified as possibly non-uniform.
4818   for (auto *V : ConsecutiveLikePtrs)
4819     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4820       addToWorklistIfAllowed(V);
4821 
  // Expand Worklist in topological order: whenever a new instruction is
  // added, its users should already be inside Worklist. This ensures that a
  // uniform instruction will only be used by uniform instructions.
4825   unsigned idx = 0;
4826   while (idx != Worklist.size()) {
4827     Instruction *I = Worklist[idx++];
4828 
4829     for (auto OV : I->operand_values()) {
4830       // isOutOfScope operands cannot be uniform instructions.
4831       if (isOutOfScope(OV))
4832         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4835       auto *OP = dyn_cast<PHINode>(OV);
4836       if (OP && Legal->isFirstOrderRecurrence(OP))
4837         continue;
4838       // If all the users of the operand are uniform, then add the
4839       // operand into the uniform worklist.
4840       auto *OI = cast<Instruction>(OV);
4841       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4842             auto *J = cast<Instruction>(U);
4843             return Worklist.count(J) ||
4844                    (OI == getLoadStorePointerOperand(J) &&
4845                     isUniformDecision(J, VF));
4846           }))
4847         addToWorklistIfAllowed(OI);
4848     }
4849   }
4850 
4851   // Returns true if Ptr is the pointer operand of a memory access instruction
4852   // I, and I is known to not require scalarization.
4853   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4854     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4855   };
4856 
4857   // For an instruction to be added into Worklist above, all its users inside
4858   // the loop should also be in Worklist. However, this condition cannot be
4859   // true for phi nodes that form a cyclic dependence. We must process phi
4860   // nodes separately. An induction variable will remain uniform if all users
4861   // of the induction variable and induction variable update remain uniform.
4862   // The code below handles both pointer and non-pointer induction variables.
4863   for (auto &Induction : Legal->getInductionVars()) {
4864     auto *Ind = Induction.first;
4865     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4866 
4867     // Determine if all users of the induction variable are uniform after
4868     // vectorization.
4869     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4870       auto *I = cast<Instruction>(U);
4871       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4872              isVectorizedMemAccessUse(I, Ind);
4873     });
4874     if (!UniformInd)
4875       continue;
4876 
4877     // Determine if all users of the induction variable update instruction are
4878     // uniform after vectorization.
4879     auto UniformIndUpdate =
4880         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4881           auto *I = cast<Instruction>(U);
4882           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4883                  isVectorizedMemAccessUse(I, IndUpdate);
4884         });
4885     if (!UniformIndUpdate)
4886       continue;
4887 
4888     // The induction variable and its update instruction will remain uniform.
4889     addToWorklistIfAllowed(Ind);
4890     addToWorklistIfAllowed(IndUpdate);
4891   }
4892 
4893   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4894 }
4895 
4896 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4897   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4898 
4899   if (Legal->getRuntimePointerChecking()->Need) {
4900     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4901         "runtime pointer checks needed. Enable vectorization of this "
4902         "loop with '#pragma clang loop vectorize(enable)' when "
4903         "compiling with -Os/-Oz",
4904         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4905     return true;
4906   }
4907 
4908   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4909     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4910         "runtime SCEV checks needed. Enable vectorization of this "
4911         "loop with '#pragma clang loop vectorize(enable)' when "
4912         "compiling with -Os/-Oz",
4913         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4914     return true;
4915   }
4916 
4917   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4918   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4919     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4920         "runtime stride == 1 checks needed. Enable vectorization of "
4921         "this loop with '#pragma clang loop vectorize(enable)' when "
4922         "compiling with -Os/-Oz",
4923         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4924     return true;
4925   }
4926 
4927   return false;
4928 }
4929 
4930 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4931   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the condition is still
    // likely to be dynamically uniform if the target can skip it.
4934     reportVectorizationFailure(
4935         "Not inserting runtime ptr check for divergent target",
4936         "runtime pointer checks needed. Not enabled for divergent target",
4937         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4938     return None;
4939   }
4940 
4941   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4942   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4943   if (TC == 1) {
4944     reportVectorizationFailure("Single iteration (non) loop",
4945         "loop trip count is one, irrelevant for vectorization",
4946         "SingleIterationLoop", ORE, TheLoop);
4947     return None;
4948   }
4949 
4950   switch (ScalarEpilogueStatus) {
4951   case CM_ScalarEpilogueAllowed:
4952     return computeFeasibleMaxVF(TC);
4953   case CM_ScalarEpilogueNotNeededUsePredicate:
4954     LLVM_DEBUG(
4955         dbgs() << "LV: vector predicate hint/switch found.\n"
4956                << "LV: Not allowing scalar epilogue, creating predicated "
4957                << "vector loop.\n");
4958     break;
4959   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4960     // fallthrough as a special case of OptForSize
4961   case CM_ScalarEpilogueNotAllowedOptSize:
4962     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4963       LLVM_DEBUG(
4964           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4965     else
4966       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4967                         << "count.\n");
4968 
    // Bail out if runtime checks are required; they are not desirable when
    // optimizing for size.
4971     if (runtimeChecksRequired())
4972       return None;
4973     break;
4974   }
4975 
  // Now try folding the tail by masking.
4977 
4978   // Invalidate interleave groups that require an epilogue if we can't mask
4979   // the interleave-group.
4980   if (!useMaskedInterleavedAccesses(TTI))
4981     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4982 
4983   unsigned MaxVF = computeFeasibleMaxVF(TC);
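  // For example (illustrative numbers): a known trip count of 64 with a
  // MaxVF of 8 divides evenly, so no scalar tail remains.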
4984   if (TC > 0 && TC % MaxVF == 0) {
4985     // Accept MaxVF if we do not have a tail.
4986     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4987     return MaxVF;
4988   }
4989 
4990   // If we don't know the precise trip count, or if the trip count that we
4991   // found modulo the vectorization factor is not zero, try to fold the tail
4992   // by masking.
4993   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4994   if (Legal->prepareToFoldTailByMasking()) {
4995     FoldTailByMasking = true;
4996     return MaxVF;
4997   }
4998 
4999   if (TC == 0) {
5000     reportVectorizationFailure(
5001         "Unable to calculate the loop count due to complex control flow",
5002         "unable to calculate the loop count due to complex control flow",
5003         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5004     return None;
5005   }
5006 
5007   reportVectorizationFailure(
5008       "Cannot optimize for size and vectorize at the same time.",
5009       "cannot optimize for size and vectorize at the same time. "
5010       "Enable vectorization of this loop with '#pragma clang loop "
5011       "vectorize(enable)' when compiling with -Os/-Oz",
5012       "NoTailLoopWithOptForSize", ORE, TheLoop);
5013   return None;
5014 }
5015 
5016 unsigned
5017 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5018   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5019   unsigned SmallestType, WidestType;
5020   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5021   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5022 
  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
5027   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5028 
5029   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5030 
5031   unsigned MaxVectorSize = WidestRegister / WidestType;
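  // For example (illustrative numbers): with a 128-bit widest register and a
  // widest type of 32 bits, MaxVectorSize is 128 / 32 = 4 lanes.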
5032 
5033   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5034                     << " / " << WidestType << " bits.\n");
5035   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5036                     << WidestRegister << " bits.\n");
5037 
5038   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5039                                  " into one vector!");
5040   if (MaxVectorSize == 0) {
5041     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5042     MaxVectorSize = 1;
5043     return MaxVectorSize;
5044   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5045              isPowerOf2_32(ConstTripCount)) {
5046     // We need to clamp the VF to be the ConstTripCount. There is no point in
5047     // choosing a higher viable VF as done in the loop below.
5048     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5049                       << ConstTripCount << "\n");
5050     MaxVectorSize = ConstTripCount;
5051     return MaxVectorSize;
5052   }
5053 
5054   unsigned MaxVF = MaxVectorSize;
5055   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5056       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5057     // Collect all viable vectorization factors larger than the default MaxVF
5058     // (i.e. MaxVectorSize).
5059     SmallVector<unsigned, 8> VFs;
5060     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5061     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5062       VFs.push_back(VS);
5063 
5064     // For each VF calculate its register usage.
5065     auto RUs = calculateRegisterUsage(VFs);
5066 
5067     // Select the largest VF which doesn't require more registers than existing
5068     // ones.
5069     for (int i = RUs.size() - 1; i >= 0; --i) {
5070       bool Selected = true;
5071       for (auto& pair : RUs[i].MaxLocalUsers) {
5072         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5073         if (pair.second > TargetNumRegisters)
5074           Selected = false;
5075       }
5076       if (Selected) {
5077         MaxVF = VFs[i];
5078         break;
5079       }
5080     }
5081     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5082       if (MaxVF < MinVF) {
5083         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5084                           << ") with target's minimum: " << MinVF << '\n');
5085         MaxVF = MinVF;
5086       }
5087     }
5088   }
5089   return MaxVF;
5090 }
5091 
5092 VectorizationFactor
5093 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5094   float Cost = expectedCost(1).first;
5095   const float ScalarCost = Cost;
5096   unsigned Width = 1;
5097   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5098 
5099   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5100   if (ForceVectorization && MaxVF > 1) {
5101     // Ignore scalar width, because the user explicitly wants vectorization.
5102     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5103     // evaluation.
5104     Cost = std::numeric_limits<float>::max();
5105   }
5106 
5107   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by its width, i.e. the
    // number of original iterations it covers.
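    // For example (illustrative numbers): a vector loop cost of 8 at width 4
    // amounts to 2 per original iteration, which beats a scalar cost of 3.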
5111     VectorizationCostTy C = expectedCost(i);
5112     float VectorCost = C.first / (float)i;
5113     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5114                       << " costs: " << (int)VectorCost << ".\n");
5115     if (!C.second && !ForceVectorization) {
5116       LLVM_DEBUG(
5117           dbgs() << "LV: Not considering vector loop of width " << i
5118                  << " because it will not generate any vector instructions.\n");
5119       continue;
5120     }
5121     if (VectorCost < Cost) {
5122       Cost = VectorCost;
5123       Width = i;
5124     }
5125   }
5126 
5127   if (!EnableCondStoresVectorization && NumPredStores) {
5128     reportVectorizationFailure("There are conditional stores.",
5129         "store that is conditionally executed prevents vectorization",
5130         "ConditionalStore", ORE, TheLoop);
5131     Width = 1;
5132     Cost = ScalarCost;
5133   }
5134 
5135   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5136              << "LV: Vectorization seems to be not beneficial, "
5137              << "but was forced by a user.\n");
5138   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5139   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5140   return Factor;
5141 }
5142 
5143 std::pair<unsigned, unsigned>
5144 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5145   unsigned MinWidth = -1U;
5146   unsigned MaxWidth = 8;
5147   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5148 
5149   // For each block.
5150   for (BasicBlock *BB : TheLoop->blocks()) {
5151     // For each instruction in the loop.
5152     for (Instruction &I : BB->instructionsWithoutDebug()) {
5153       Type *T = I.getType();
5154 
5155       // Skip ignored values.
5156       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5157         continue;
5158 
5159       // Only examine Loads, Stores and PHINodes.
5160       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5161         continue;
5162 
5163       // Examine PHI nodes that are reduction variables. Update the type to
5164       // account for the recurrence type.
5165       if (auto *PN = dyn_cast<PHINode>(&I)) {
5166         if (!Legal->isReductionVariable(PN))
5167           continue;
5168         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5169         T = RdxDesc.getRecurrenceType();
5170       }
5171 
5172       // Examine the stored values.
5173       if (auto *ST = dyn_cast<StoreInst>(&I))
5174         T = ST->getValueOperand()->getType();
5175 
5176       // Ignore loaded pointer types and stored pointer types that are not
5177       // vectorizable.
5178       //
5179       // FIXME: The check here attempts to predict whether a load or store will
5180       //        be vectorized. We only know this for certain after a VF has
5181       //        been selected. Here, we assume that if an access can be
5182       //        vectorized, it will be. We should also look at extending this
5183       //        optimization to non-pointer types.
5184       //
5185       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5186           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5187         continue;
5188 
5189       MinWidth = std::min(MinWidth,
5190                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5191       MaxWidth = std::max(MaxWidth,
5192                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5193     }
5194   }
5195 
5196   return {MinWidth, MaxWidth};
5197 }
5198 
5199 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5200                                                            unsigned LoopCost) {
5201   // -- The interleave heuristics --
5202   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5203   // There are many micro-architectural considerations that we can't predict
5204   // at this level. For example, frontend pressure (on decode or fetch) due to
5205   // code size, or the number and capabilities of the execution ports.
5206   //
5207   // We use the following heuristics to select the interleave count:
5208   // 1. If the code has reductions, then we interleave to break the cross
5209   // iteration dependency.
5210   // 2. If the loop is really small, then we interleave to reduce the loop
5211   // overhead.
5212   // 3. We don't interleave if we think that we will spill registers to memory
5213   // due to the increased register pressure.
5214 
5215   if (!isScalarEpilogueAllowed())
5216     return 1;
5217 
  // A finite maximum safe dependence distance was already used to limit the
  // VF; do not interleave in that case.
5219   if (Legal->getMaxSafeDepDistBytes() != -1U)
5220     return 1;
5221 
5222   // Do not interleave loops with a relatively small known or estimated trip
5223   // count.
5224   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5225   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5226     return 1;
5227 
5228   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so make sure each is at least one to
  // avoid dividing by zero (i.e. assume that at least one instruction uses at
  // least one register of each class).
5231   for (auto& pair : R.MaxLocalUsers) {
5232     pair.second = std::max(pair.second, 1U);
5233   }
5234 
5235   // We calculate the interleave count using the following formula.
5236   // Subtract the number of loop invariants from the number of available
5237   // registers. These registers are used by all of the interleaved instances.
5238   // Next, divide the remaining registers by the number of registers that is
5239   // required by the loop, in order to estimate how many parallel instances
5240   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case an
  // interleave count of 1 has already been returned above.
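  // As an illustrative example (hypothetical numbers): with 16 available
  // registers, 2 loop-invariant values and a maximum local usage of 4
  // registers, the estimate below is PowerOf2Floor((16 - 2) / 4) =
  // PowerOf2Floor(3) = 2 interleaved instances.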
5246   unsigned IC = UINT_MAX;
5247 
5248   for (auto& pair : R.MaxLocalUsers) {
5249     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5250     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5251                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5253     if (VF == 1) {
5254       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5255         TargetNumRegisters = ForceTargetNumScalarRegs;
5256     } else {
5257       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5258         TargetNumRegisters = ForceTargetNumVectorRegs;
5259     }
5260     unsigned MaxLocalUsers = pair.second;
5261     unsigned LoopInvariantRegs = 0;
5262     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5263       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5264 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5266     // Don't count the induction variable as interleaved.
5267     if (EnableIndVarRegisterHeur) {
5268       TmpIC =
5269           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5270                         std::max(1U, (MaxLocalUsers - 1)));
5271     }
5272 
5273     IC = std::min(IC, TmpIC);
5274   }
5275 
5276   // Clamp the interleave ranges to reasonable counts.
5277   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5278 
5279   // Check if the user has overridden the max.
5280   if (VF == 1) {
5281     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5282       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5283   } else {
5284     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5285       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5286   }
5287 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF.
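  // For example (illustrative numbers): with an estimated trip count of 64 and
  // VF = 4, the interleave count is capped at 64 / 4 = 16.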
5290   if (BestKnownTC) {
5291     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5292   }
5293 
5294   // If we did not calculate the cost for VF (because the user selected the VF)
5295   // then we calculate the cost of VF here.
5296   if (LoopCost == 0)
5297     LoopCost = expectedCost(VF).first;
5298 
5299   assert(LoopCost && "Non-zero loop cost expected");
5300 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5303   if (IC > MaxInterleaveCount)
5304     IC = MaxInterleaveCount;
5305   else if (IC < 1)
5306     IC = 1;
5307 
5308   // Interleave if we vectorized this loop and there is a reduction that could
5309   // benefit from interleaving.
5310   if (VF > 1 && !Legal->getReductionVars().empty()) {
5311     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5312     return IC;
5313   }
5314 
5315   // Note that if we've already vectorized the loop we will have done the
5316   // runtime check and so interleaving won't require further checks.
5317   bool InterleavingRequiresRuntimePointerCheck =
5318       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5319 
5320   // We want to interleave small loops in order to reduce the loop overhead and
5321   // potentially expose ILP opportunities.
5322   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5323   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5324     // We assume that the cost overhead is 1 and we use the cost model
5325     // to estimate the cost of the loop and interleave until the cost of the
5326     // loop overhead is about 5% of the cost of the loop.
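    // For example (illustrative numbers): with SmallLoopCost = 20 and a loop
    // cost of 3, SmallIC is at most PowerOf2Floor(20 / 3) = 4.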
5327     unsigned SmallIC =
5328         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5329 
5330     // Interleave until store/load ports (estimated by max interleave count) are
5331     // saturated.
5332     unsigned NumStores = Legal->getNumStores();
5333     unsigned NumLoads = Legal->getNumLoads();
5334     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5335     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5336 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit the count, by default
    // to 2, so that the critical path only gets increased by one reduction
    // operation.
5341     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5342       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5343       SmallIC = std::min(SmallIC, F);
5344       StoresIC = std::min(StoresIC, F);
5345       LoadsIC = std::min(LoadsIC, F);
5346     }
5347 
5348     if (EnableLoadStoreRuntimeInterleave &&
5349         std::max(StoresIC, LoadsIC) > SmallIC) {
5350       LLVM_DEBUG(
5351           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5352       return std::max(StoresIC, LoadsIC);
5353     }
5354 
5355     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5356     return SmallIC;
5357   }
5358 
5359   // Interleave if this is a large loop (small loops are already dealt with by
5360   // this point) that could benefit from interleaving.
5361   bool HasReductions = !Legal->getReductionVars().empty();
5362   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5363     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5364     return IC;
5365   }
5366 
5367   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5368   return 1;
5369 }
5370 
5371 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5372 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are met before
  // their users. We assume that each instruction that has in-loop users starts
  // an interval. We record every time that an in-loop value is used, so we
  // have a list of the first and last occurrences of each instruction. Next,
  // we transpose this data structure into a multi-map that holds the list of
  // intervals that *end* at a specific location. This multi-map allows us to
  // perform a linear search. We scan the instructions linearly and record each
  // time that a new interval starts, by placing it in a set. If we find this
  // value in the multi-map then we remove it from the set. The max register
  // usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because, when we unroll, loop-invariant values do not take
  // more registers.
5390   LoopBlocksDFS DFS(TheLoop);
5391   DFS.perform(LI);
5392 
5393   RegisterUsage RU;
5394 
5395   // Each 'key' in the map opens a new interval. The values
5396   // of the map are the index of the 'last seen' usage of the
5397   // instruction that is the key.
5398   using IntervalMap = DenseMap<Instruction *, unsigned>;
5399 
5400   // Maps instruction to its index.
5401   SmallVector<Instruction *, 64> IdxToInstr;
5402   // Marks the end of each interval.
5403   IntervalMap EndPoint;
  // Saves the set of instructions that are used inside the loop.
5405   SmallPtrSet<Instruction *, 8> Ends;
5406   // Saves the list of values that are used in the loop but are
5407   // defined outside the loop, such as arguments and constants.
5408   SmallPtrSet<Value *, 8> LoopInvariants;
5409 
5410   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5411     for (Instruction &I : BB->instructionsWithoutDebug()) {
5412       IdxToInstr.push_back(&I);
5413 
5414       // Save the end location of each USE.
5415       for (Value *U : I.operands()) {
5416         auto *Instr = dyn_cast<Instruction>(U);
5417 
5418         // Ignore non-instruction values such as arguments, constants, etc.
5419         if (!Instr)
5420           continue;
5421 
5422         // If this instruction is outside the loop then record it and continue.
5423         if (!TheLoop->contains(Instr)) {
5424           LoopInvariants.insert(Instr);
5425           continue;
5426         }
5427 
5428         // Overwrite previous end points.
5429         EndPoint[Instr] = IdxToInstr.size();
5430         Ends.insert(Instr);
5431       }
5432     }
5433   }
5434 
5435   // Saves the list of intervals that end with the index in 'key'.
5436   using InstrList = SmallVector<Instruction *, 2>;
5437   DenseMap<unsigned, InstrList> TransposeEnds;
5438 
5439   // Transpose the EndPoints to a list of values that end at each index.
5440   for (auto &Interval : EndPoint)
5441     TransposeEnds[Interval.second].push_back(Interval.first);
5442 
5443   SmallPtrSet<Instruction *, 8> OpenIntervals;
5444 
5445   // Get the size of the widest register.
5446   unsigned MaxSafeDepDist = -1U;
5447   if (Legal->getMaxSafeDepDistBytes() != -1U)
5448     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5449   unsigned WidestRegister =
5450       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5451   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5452 
5453   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5454   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5455 
5456   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5457 
5458   // A lambda that gets the register usage for the given type and VF.
5459   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5460     if (Ty->isTokenTy())
5461       return 0U;
5462     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5463     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5464   };
5465 
5466   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5467     Instruction *I = IdxToInstr[i];
5468 
5469     // Remove all of the instructions that end at this location.
5470     InstrList &List = TransposeEnds[i];
5471     for (Instruction *ToRemove : List)
5472       OpenIntervals.erase(ToRemove);
5473 
5474     // Ignore instructions that are never used within the loop.
5475     if (Ends.find(I) == Ends.end())
5476       continue;
5477 
5478     // Skip ignored values.
5479     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5480       continue;
5481 
5482     // For each VF find the maximum usage of registers.
5483     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5484       // Count the number of live intervals.
5485       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5486 
5487       if (VFs[j] == 1) {
5488         for (auto Inst : OpenIntervals) {
5489           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5490           if (RegUsage.find(ClassID) == RegUsage.end())
5491             RegUsage[ClassID] = 1;
5492           else
5493             RegUsage[ClassID] += 1;
5494         }
5495       } else {
5496         collectUniformsAndScalars(VFs[j]);
5497         for (auto Inst : OpenIntervals) {
5498           // Skip ignored values for VF > 1.
5499           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5500             continue;
5501           if (isScalarAfterVectorization(Inst, VFs[j])) {
5502             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5503             if (RegUsage.find(ClassID) == RegUsage.end())
5504               RegUsage[ClassID] = 1;
5505             else
5506               RegUsage[ClassID] += 1;
5507           } else {
5508             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5509             if (RegUsage.find(ClassID) == RegUsage.end())
5510               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5511             else
5512               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5513           }
5514         }
5515       }
5516 
5517       for (auto& pair : RegUsage) {
5518         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5519           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5520         else
5521           MaxUsages[j][pair.first] = pair.second;
5522       }
5523     }
5524 
5525     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5526                       << OpenIntervals.size() << '\n');
5527 
5528     // Add the current instruction to the list of open intervals.
5529     OpenIntervals.insert(I);
5530   }
5531 
5532   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5533     SmallMapVector<unsigned, unsigned, 4> Invariant;
5534 
5535     for (auto Inst : LoopInvariants) {
5536       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5537       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5538       if (Invariant.find(ClassID) == Invariant.end())
5539         Invariant[ClassID] = Usage;
5540       else
5541         Invariant[ClassID] += Usage;
5542     }
5543 
5544     LLVM_DEBUG({
5545       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5546       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5547              << " item\n";
5548       for (const auto &pair : MaxUsages[i]) {
5549         dbgs() << "LV(REG): RegisterClass: "
5550                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5551                << " registers\n";
5552       }
5553       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5554              << " item\n";
5555       for (const auto &pair : Invariant) {
5556         dbgs() << "LV(REG): RegisterClass: "
5557                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5558                << " registers\n";
5559       }
5560     });
5561 
5562     RU.LoopInvariantRegs = Invariant;
5563     RU.MaxLocalUsers = MaxUsages[i];
5564     RUs[i] = RU;
5565   }
5566 
5567   return RUs;
5568 }
5569 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5571   // TODO: Cost model for emulated masked load/store is completely
5572   // broken. This hack guides the cost model to use an artificially
5573   // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked load/gather emulation was previously never allowed. Only a limited
  // amount of masked store/scatter emulation was allowed.
5579   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5580   return isa<LoadInst>(I) ||
5581          (isa<StoreInst>(I) &&
5582           NumPredStores > NumberOfStoresToPredicate);
5583 }
5584 
5585 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5586   // If we aren't vectorizing the loop, or if we've already collected the
5587   // instructions to scalarize, there's nothing to do. Collection may already
5588   // have occurred if we have a user-selected VF and are now computing the
5589   // expected cost for interleaving.
5590   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5591     return;
5592 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5594   // not profitable to scalarize any instructions, the presence of VF in the
5595   // map will indicate that we've analyzed it already.
5596   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5597 
  // Find all the instructions that are scalar with predication in the loop and
  // determine whether it would be better not to if-convert the blocks they are
  // in. If so, we also record the instructions to scalarize.
5601   for (BasicBlock *BB : TheLoop->blocks()) {
5602     if (!blockNeedsPredication(BB))
5603       continue;
5604     for (Instruction &I : *BB)
5605       if (isScalarWithPredication(&I)) {
5606         ScalarCostsTy ScalarCosts;
5607         // Do not apply discount logic if hacked cost is needed
5608         // for emulated masked memrefs.
5609         if (!useEmulatedMaskMemRefHack(&I) &&
5610             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5611           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5612         // Remember that BB will remain after vectorization.
5613         PredicatedBBsAfterVectorization.insert(BB);
5614       }
5615   }
5616 }
5617 
5618 int LoopVectorizationCostModel::computePredInstDiscount(
5619     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5620     unsigned VF) {
5621   assert(!isUniformAfterVectorization(PredInst, VF) &&
5622          "Instruction marked uniform-after-vectorization will be predicated");
5623 
5624   // Initialize the discount to zero, meaning that the scalar version and the
5625   // vector version cost the same.
5626   int Discount = 0;
5627 
5628   // Holds instructions to analyze. The instructions we visit are mapped in
5629   // ScalarCosts. Those instructions are the ones that would be scalarized if
5630   // we find that the scalar version costs less.
5631   SmallVector<Instruction *, 8> Worklist;
5632 
5633   // Returns true if the given instruction can be scalarized.
5634   auto canBeScalarized = [&](Instruction *I) -> bool {
5635     // We only attempt to scalarize instructions forming a single-use chain
5636     // from the original predicated block that would otherwise be vectorized.
5637     // Although not strictly necessary, we give up on instructions we know will
5638     // already be scalar to avoid traversing chains that are unlikely to be
5639     // beneficial.
5640     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5641         isScalarAfterVectorization(I, VF))
5642       return false;
5643 
5644     // If the instruction is scalar with predication, it will be analyzed
5645     // separately. We ignore it within the context of PredInst.
5646     if (isScalarWithPredication(I))
5647       return false;
5648 
5649     // If any of the instruction's operands are uniform after vectorization,
5650     // the instruction cannot be scalarized. This prevents, for example, a
5651     // masked load from being scalarized.
5652     //
5653     // We assume we will only emit a value for lane zero of an instruction
5654     // marked uniform after vectorization, rather than VF identical values.
5655     // Thus, if we scalarize an instruction that uses a uniform, we would
5656     // create uses of values corresponding to the lanes we aren't emitting code
5657     // for. This behavior can be changed by allowing getScalarValue to clone
5658     // the lane zero values for uniforms rather than asserting.
5659     for (Use &U : I->operands())
5660       if (auto *J = dyn_cast<Instruction>(U.get()))
5661         if (isUniformAfterVectorization(J, VF))
5662           return false;
5663 
5664     // Otherwise, we can scalarize the instruction.
5665     return true;
5666   };
5667 
5668   // Compute the expected cost discount from scalarizing the entire expression
5669   // feeding the predicated instruction. We currently only consider expressions
5670   // that are single-use instruction chains.
5671   Worklist.push_back(PredInst);
5672   while (!Worklist.empty()) {
5673     Instruction *I = Worklist.pop_back_val();
5674 
5675     // If we've already analyzed the instruction, there's nothing to do.
5676     if (ScalarCosts.find(I) != ScalarCosts.end())
5677       continue;
5678 
5679     // Compute the cost of the vector instruction. Note that this cost already
5680     // includes the scalarization overhead of the predicated instruction.
5681     unsigned VectorCost = getInstructionCost(I, VF).first;
5682 
5683     // Compute the cost of the scalarized instruction. This cost is the cost of
5684     // the instruction as if it wasn't if-converted and instead remained in the
5685     // predicated block. We will scale this cost by block probability after
5686     // computing the scalarization overhead.
5687     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5688 
5689     // Compute the scalarization overhead of needed insertelement instructions
5690     // and phi nodes.
5691     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5692       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5693                                                  true, false);
5694       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5695     }
5696 
5697     // Compute the scalarization overhead of needed extractelement
5698     // instructions. For each of the instruction's operands, if the operand can
5699     // be scalarized, add it to the worklist; otherwise, account for the
5700     // overhead.
5701     for (Use &U : I->operands())
5702       if (auto *J = dyn_cast<Instruction>(U.get())) {
5703         assert(VectorType::isValidElementType(J->getType()) &&
5704                "Instruction has non-scalar type");
5705         if (canBeScalarized(J))
5706           Worklist.push_back(J);
5707         else if (needsExtract(J, VF))
5708           ScalarCost += TTI.getScalarizationOverhead(
5709                               ToVectorTy(J->getType(),VF), false, true);
5710       }
5711 
5712     // Scale the total scalar cost by block probability.
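    // For example (illustrative numbers): with a reciprocal block probability
    // of 2 (the block is assumed to execute every other iteration), a raw
    // scalar cost of 8 becomes 4.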
5713     ScalarCost /= getReciprocalPredBlockProb();
5714 
5715     // Compute the discount. A non-negative discount means the vector version
5716     // of the instruction costs more, and scalarizing would be beneficial.
5717     Discount += VectorCost - ScalarCost;
5718     ScalarCosts[I] = ScalarCost;
5719   }
5720 
5721   return Discount;
5722 }
5723 
5724 LoopVectorizationCostModel::VectorizationCostTy
5725 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5726   VectorizationCostTy Cost;
5727 
5728   // For each block.
5729   for (BasicBlock *BB : TheLoop->blocks()) {
5730     VectorizationCostTy BlockCost;
5731 
5732     // For each instruction in the old loop.
5733     for (Instruction &I : BB->instructionsWithoutDebug()) {
5734       // Skip ignored values.
5735       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5736           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5737         continue;
5738 
5739       VectorizationCostTy C = getInstructionCost(&I, VF);
5740 
5741       // Check if we should override the cost.
5742       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5743         C.first = ForceTargetInstructionCost;
5744 
5745       BlockCost.first += C.first;
5746       BlockCost.second |= C.second;
5747       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5748                         << " for VF " << VF << " For instruction: " << I
5749                         << '\n');
5750     }
5751 
5752     // If we are vectorizing a predicated block, it will have been
5753     // if-converted. This means that the block's instructions (aside from
5754     // stores and instructions that may divide by zero) will now be
5755     // unconditionally executed. For the scalar case, we may not always execute
5756     // the predicated block. Thus, scale the block's cost by the probability of
5757     // executing it.
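    // For example (illustrative numbers): a predicated block whose
    // instructions cost 6 contributes only 3 to the scalar loop cost when the
    // reciprocal block probability is 2.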
5758     if (VF == 1 && blockNeedsPredication(BB))
5759       BlockCost.first /= getReciprocalPredBlockProb();
5760 
5761     Cost.first += BlockCost.first;
5762     Cost.second |= BlockCost.second;
5763   }
5764 
5765   return Cost;
5766 }
5767 
/// Gets the address access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
5770 ///
5771 /// This SCEV can be sent to the Target in order to estimate the address
5772 /// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
5779   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5780   if (!Gep)
5781     return nullptr;
5782 
5783   // We are looking for a gep with all loop invariant indices except for one
5784   // which should be an induction variable.
5785   auto SE = PSE.getSE();
5786   unsigned NumOperands = Gep->getNumOperands();
5787   for (unsigned i = 1; i < NumOperands; ++i) {
5788     Value *Opd = Gep->getOperand(i);
5789     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5790         !Legal->isInductionVariable(Opd))
5791       return nullptr;
5792   }
5793 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5795   return PSE.getSCEV(Ptr);
5796 }
5797 
5798 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5799   return Legal->hasStride(I->getOperand(0)) ||
5800          Legal->hasStride(I->getOperand(1));
5801 }
5802 
5803 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5804                                                                  unsigned VF) {
5805   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5806   Type *ValTy = getMemInstValueType(I);
5807   auto SE = PSE.getSE();
5808 
5809   unsigned AS = getLoadStoreAddressSpace(I);
5810   Value *Ptr = getLoadStorePointerOperand(I);
5811   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5812 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5815   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5816 
5817   // Get the cost of the scalar memory instruction and address computation.
5818   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5819 
5820   // Don't pass *I here, since it is scalar but will actually be part of a
5821   // vectorized loop where the user of it is a vectorized instruction.
5822   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5823   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5824                                    Alignment, AS);
5825 
5826   // Get the overhead of the extractelement and insertelement instructions
5827   // we might create due to scalarization.
5828   Cost += getScalarizationOverhead(I, VF);
5829 
5830   // If we have a predicated store, it may not be executed for each vector
5831   // lane. Scale the cost by the probability of executing the predicated
5832   // block.
5833   if (isPredicatedInst(I)) {
5834     Cost /= getReciprocalPredBlockProb();
5835 
5836     if (useEmulatedMaskMemRefHack(I))
5837       // Artificially setting to a high enough value to practically disable
5838       // vectorization with such operations.
5839       Cost = 3000000;
5840   }
5841 
5842   return Cost;
5843 }
5844 
5845 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5846                                                              unsigned VF) {
5847   Type *ValTy = getMemInstValueType(I);
5848   Type *VectorTy = ToVectorTy(ValTy, VF);
5849   Value *Ptr = getLoadStorePointerOperand(I);
5850   unsigned AS = getLoadStoreAddressSpace(I);
5851   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5852 
5853   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5854          "Stride should be 1 or -1 for consecutive memory access");
5855   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5856   unsigned Cost = 0;
5857   if (Legal->isMaskRequired(I))
5858     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5859                                       Alignment ? Alignment->value() : 0, AS);
5860   else
5861     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5862 
5863   bool Reverse = ConsecutiveStride < 0;
5864   if (Reverse)
5865     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5866   return Cost;
5867 }
5868 
5869 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5870                                                          unsigned VF) {
5871   Type *ValTy = getMemInstValueType(I);
5872   Type *VectorTy = ToVectorTy(ValTy, VF);
5873   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5874   unsigned AS = getLoadStoreAddressSpace(I);
5875   if (isa<LoadInst>(I)) {
5876     return TTI.getAddressComputationCost(ValTy) +
5877            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5878            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5879   }
5880   StoreInst *SI = cast<StoreInst>(I);
5881 
5882   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5883   return TTI.getAddressComputationCost(ValTy) +
5884          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5885          (isLoopInvariantStoreValue
5886               ? 0
5887               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5888                                        VF - 1));
5889 }
5890 
5891 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5892                                                           unsigned VF) {
5893   Type *ValTy = getMemInstValueType(I);
5894   Type *VectorTy = ToVectorTy(ValTy, VF);
5895   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5896   Value *Ptr = getLoadStorePointerOperand(I);
5897 
5898   return TTI.getAddressComputationCost(VectorTy) +
5899          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5900                                     Legal->isMaskRequired(I),
5901                                     Alignment ? Alignment->value() : 0, I);
5902 }
5903 
5904 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5905                                                             unsigned VF) {
5906   Type *ValTy = getMemInstValueType(I);
5907   Type *VectorTy = ToVectorTy(ValTy, VF);
5908   unsigned AS = getLoadStoreAddressSpace(I);
5909 
5910   auto Group = getInterleavedAccessGroup(I);
5911   assert(Group && "Fail to get an interleaved access group.");
5912 
5913   unsigned InterleaveFactor = Group->getFactor();
5914   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5915 
5916   // Holds the indices of existing members in an interleaved load group.
5917   // An interleaved store group doesn't need this as it doesn't allow gaps.
5918   SmallVector<unsigned, 4> Indices;
5919   if (isa<LoadInst>(I)) {
5920     for (unsigned i = 0; i < InterleaveFactor; i++)
5921       if (Group->getMember(i))
5922         Indices.push_back(i);
5923   }
5924 
5925   // Calculate the cost of the whole interleaved group.
5926   bool UseMaskForGaps =
5927       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5928   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5929       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5930       Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5931 
5932   if (Group->isReverse()) {
5933     // TODO: Add support for reversed masked interleaved access.
5934     assert(!Legal->isMaskRequired(I) &&
5935            "Reverse masked interleaved access not supported.");
5936     Cost += Group->getNumMembers() *
5937             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5938   }
5939   return Cost;
5940 }
5941 
5942 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5943                                                               unsigned VF) {
5944   // Calculate scalar cost only. Vectorization cost should be ready at this
5945   // moment.
5946   if (VF == 1) {
5947     Type *ValTy = getMemInstValueType(I);
5948     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5949     unsigned AS = getLoadStoreAddressSpace(I);
5950 
5951     return TTI.getAddressComputationCost(ValTy) +
5952            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5953   }
5954   return getWideningCost(I, VF);
5955 }
5956 
5957 LoopVectorizationCostModel::VectorizationCostTy
5958 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5959   // If we know that this instruction will remain uniform, check the cost of
5960   // the scalar version.
5961   if (isUniformAfterVectorization(I, VF))
5962     VF = 1;
5963 
5964   if (VF > 1 && isProfitableToScalarize(I, VF))
5965     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5966 
5967   // Forced scalars do not have any scalarization overhead.
5968   auto ForcedScalar = ForcedScalars.find(VF);
5969   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5970     auto InstSet = ForcedScalar->second;
5971     if (InstSet.find(I) != InstSet.end())
5972       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5973   }
5974 
5975   Type *VectorTy;
5976   unsigned C = getInstructionCost(I, VF, VectorTy);
5977 
5978   bool TypeNotScalarized =
5979       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
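  // For example (illustrative numbers): on a target with 128-bit registers, a
  // <4 x i32> value forms a single part (1 < VF = 4), so the type counts as
  // genuinely vectorized rather than scalarized.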
5980   return VectorizationCostTy(C, TypeNotScalarized);
5981 }
5982 
5983 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5984                                                               unsigned VF) {
5985 
5986   if (VF == 1)
5987     return 0;
5988 
5989   unsigned Cost = 0;
5990   Type *RetTy = ToVectorTy(I->getType(), VF);
5991   if (!RetTy->isVoidTy() &&
5992       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5993     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5994 
5995   // Some targets keep addresses scalar.
5996   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5997     return Cost;
5998 
5999   // Some targets support efficient element stores.
6000   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6001     return Cost;
6002 
6003   // Collect operands to consider.
6004   CallInst *CI = dyn_cast<CallInst>(I);
6005   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6006 
6007   // Skip operands that do not require extraction/scalarization and do not incur
6008   // any overhead.
6009   return Cost + TTI.getOperandsScalarizationOverhead(
6010                     filterExtractingOperands(Ops, VF), VF);
6011 }
6012 
6013 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6014   if (VF == 1)
6015     return;
6016   NumPredStores = 0;
6017   for (BasicBlock *BB : TheLoop->blocks()) {
6018     // For each instruction in the old loop.
6019     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6021       if (!Ptr)
6022         continue;
6023 
6024       // TODO: We should generate better code and update the cost model for
6025       // predicated uniform stores. Today they are treated as any other
6026       // predicated store (see added test cases in
6027       // invariant-store-vectorization.ll).
6028       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6029         NumPredStores++;
6030 
6031       if (Legal->isUniform(Ptr) &&
6032           // Conditional loads and stores should be scalarized and predicated.
6033           // isScalarWithPredication cannot be used here since masked
6034           // gather/scatters are not considered scalar with predication.
6035           !Legal->blockNeedsPredication(I.getParent())) {
6036         // TODO: Avoid replicating loads and stores instead of
6037         // relying on instcombine to remove them.
6038         // Load: Scalar load + broadcast
6039         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6040         unsigned Cost = getUniformMemOpCost(&I, VF);
6041         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6042         continue;
6043       }
6044 
6045       // We assume that widening is the best solution when possible.
6046       if (memoryInstructionCanBeWidened(&I, VF)) {
6047         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6050         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6051                "Expected consecutive stride.");
6052         InstWidening Decision =
6053             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6054         setWideningDecision(&I, VF, Decision, Cost);
6055         continue;
6056       }
6057 
6058       // Choose between Interleaving, Gather/Scatter or Scalarization.
6059       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6060       unsigned NumAccesses = 1;
6061       if (isAccessInterleaved(&I)) {
6062         auto Group = getInterleavedAccessGroup(&I);
6063         assert(Group && "Fail to get an interleaved access group.");
6064 
6065         // Make one decision for the whole group.
6066         if (getWideningDecision(&I, VF) != CM_Unknown)
6067           continue;
6068 
6069         NumAccesses = Group->getNumMembers();
6070         if (interleavedAccessCanBeWidened(&I, VF))
6071           InterleaveCost = getInterleaveGroupCost(&I, VF);
6072       }
6073 
6074       unsigned GatherScatterCost =
6075           isLegalGatherOrScatter(&I)
6076               ? getGatherScatterCost(&I, VF) * NumAccesses
6077               : std::numeric_limits<unsigned>::max();
6078 
6079       unsigned ScalarizationCost =
6080           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6081 
      // Choose the best solution for the current VF, record this decision,
      // and use it during vectorization.
6084       unsigned Cost;
6085       InstWidening Decision;
6086       if (InterleaveCost <= GatherScatterCost &&
6087           InterleaveCost < ScalarizationCost) {
6088         Decision = CM_Interleave;
6089         Cost = InterleaveCost;
6090       } else if (GatherScatterCost < ScalarizationCost) {
6091         Decision = CM_GatherScatter;
6092         Cost = GatherScatterCost;
6093       } else {
6094         Decision = CM_Scalarize;
6095         Cost = ScalarizationCost;
6096       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The cost is computed for the group as a
      // whole, but it will actually be assigned to a single instruction.
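      // For example, for an interleave group of two loads (say, A[2*i] and
      // A[2*i+1]) both members get the chosen decision, while the group's cost
      // is recorded on its insert position only, so it is not double-counted
      // when per-instruction costs are summed.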
6100       if (auto Group = getInterleavedAccessGroup(&I))
6101         setWideningDecision(Group, VF, Decision, Cost);
6102       else
6103         setWideningDecision(&I, VF, Decision, Cost);
6104     }
6105   }
6106 
  // Make sure that any load of an address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
6112   if (TTI.prefersVectorizedAddressing())
6113     return;
6114 
6115   // Start with all scalar pointer uses.
6116   SmallPtrSet<Instruction *, 8> AddrDefs;
6117   for (BasicBlock *BB : TheLoop->blocks())
6118     for (Instruction &I : *BB) {
6119       Instruction *PtrDef =
6120         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6121       if (PtrDef && TheLoop->contains(PtrDef) &&
6122           getWideningDecision(&I, VF) != CM_GatherScatter)
6123         AddrDefs.insert(PtrDef);
6124     }
6125 
6126   // Add all instructions used to generate the addresses.
6127   SmallVector<Instruction *, 4> Worklist;
6128   for (auto *I : AddrDefs)
6129     Worklist.push_back(I);
6130   while (!Worklist.empty()) {
6131     Instruction *I = Worklist.pop_back_val();
6132     for (auto &Op : I->operands())
6133       if (auto *InstOp = dyn_cast<Instruction>(Op))
6134         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6135             AddrDefs.insert(InstOp).second)
6136           Worklist.push_back(InstOp);
6137   }
6138 
6139   for (auto *I : AddrDefs) {
6140     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since that requires knowing whether the
      // loaded register is involved in an address computation, the decision
      // is instead changed here once we know this is the case.
6145       InstWidening Decision = getWideningDecision(I, VF);
6146       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6147         // Scalarize a widened load of address.
6148         setWideningDecision(I, VF, CM_Scalarize,
6149                             (VF * getMemoryInstructionCost(I, 1)));
6150       else if (auto Group = getInterleavedAccessGroup(I)) {
6151         // Scalarize an interleave group of address loads.
6152         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6153           if (Instruction *Member = Group->getMember(I))
6154             setWideningDecision(Member, VF, CM_Scalarize,
6155                                 (VF * getMemoryInstructionCost(Member, 1)));
6156         }
6157       }
6158     } else
6159       // Make sure I gets scalarized and a cost estimate without
6160       // scalarization overhead.
6161       ForcedScalars[VF].insert(I);
6162   }
6163 }
6164 
6165 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6166                                                         unsigned VF,
6167                                                         Type *&VectorTy) {
6168   Type *RetTy = I->getType();
6169   if (canTruncateToMinimalBitwidth(I, VF))
6170     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6171   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6172   auto SE = PSE.getSE();
6173 
6174   // TODO: We need to estimate the cost of intrinsic calls.
6175   switch (I->getOpcode()) {
6176   case Instruction::GetElementPtr:
6177     // We mark this instruction as zero-cost because the cost of GEPs in
6178     // vectorized code depends on whether the corresponding memory instruction
6179     // is scalarized or not. Therefore, we handle GEPs with the memory
6180     // instruction cost.
6181     return 0;
6182   case Instruction::Br: {
    // When instructions are scalarized and predicated, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6186     bool ScalarPredicatedBB = false;
6187     BranchInst *BI = cast<BranchInst>(I);
6188     if (VF > 1 && BI->isConditional() &&
6189         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6190              PredicatedBBsAfterVectorization.end() ||
6191          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6192              PredicatedBBsAfterVectorization.end()))
6193       ScalarPredicatedBB = true;
6194 
6195     if (ScalarPredicatedBB) {
6196       // Return cost for branches around scalarized and predicated blocks.
6197       Type *Vec_i1Ty =
6198           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6199       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6200               (TTI.getCFInstrCost(Instruction::Br) * VF));
6201     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6202       // The back-edge branch will remain, as will all scalar branches.
6203       return TTI.getCFInstrCost(Instruction::Br);
6204     else
6205       // This branch will be eliminated by if-conversion.
6206       return 0;
6207     // Note: We currently assume zero cost for an unconditional branch inside
6208     // a predicated block since it will become a fall-through, although we
6209     // may decide in the future to call TTI for all branches.
6210   }
6211   case Instruction::PHI: {
6212     auto *Phi = cast<PHINode>(I);
6213 
6214     // First-order recurrences are replaced by vector shuffles inside the loop.
6215     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6216     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6217       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6218                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6219 
6220     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6221     // converted into select instructions. We require N - 1 selects per phi
6222     // node, where N is the number of incoming values.
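    // For example, a phi with three incoming values in a non-header block
    // costs two vector selects.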
6223     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6224       return (Phi->getNumIncomingValues() - 1) *
6225              TTI.getCmpSelInstrCost(
6226                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6227                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6228 
6229     return TTI.getCFInstrCost(Instruction::PHI);
6230   }
6231   case Instruction::UDiv:
6232   case Instruction::SDiv:
6233   case Instruction::URem:
6234   case Instruction::SRem:
6235     // If we have a predicated instruction, it may not be executed for each
6236     // vector lane. Get the scalarization cost and scale this amount by the
6237     // probability of executing the predicated block. If the instruction is not
6238     // predicated, we fall through to the next case.
6239     if (VF > 1 && isScalarWithPredication(I)) {
6240       unsigned Cost = 0;
6241 
6242       // These instructions have a non-void type, so account for the phi nodes
6243       // that we will create. This cost is likely to be zero. The phi node
6244       // cost, if any, should be scaled by the block probability because it
6245       // models a copy at the end of each predicated block.
6246       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6247 
6248       // The cost of the non-predicated instruction.
6249       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6250 
6251       // The cost of insertelement and extractelement instructions needed for
6252       // scalarization.
6253       Cost += getScalarizationOverhead(I, VF);
6254 
6255       // Scale the cost by the probability of executing the predicated blocks.
6256       // This assumes the predicated block for each vector lane is equally
6257       // likely.
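      // For example, with VF = 4 the cost above covers four scalar copies
      // plus their phis and insert/extract overhead; the division below then
      // discounts it because each predicated block only executes part of the
      // time.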
6258       return Cost / getReciprocalPredBlockProb();
6259     }
6260     LLVM_FALLTHROUGH;
6261   case Instruction::Add:
6262   case Instruction::FAdd:
6263   case Instruction::Sub:
6264   case Instruction::FSub:
6265   case Instruction::Mul:
6266   case Instruction::FMul:
6267   case Instruction::FDiv:
6268   case Instruction::FRem:
6269   case Instruction::Shl:
6270   case Instruction::LShr:
6271   case Instruction::AShr:
6272   case Instruction::And:
6273   case Instruction::Or:
6274   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
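    // For example, once the loop is versioned so a symbolic stride is known to
    // be 1, a "mul i64 %i, %stride" feeding an address computation becomes a
    // multiply by 1 that later folds away.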
6276     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6277       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6280     Value *Op2 = I->getOperand(1);
6281     TargetTransformInfo::OperandValueProperties Op2VP;
6282     TargetTransformInfo::OperandValueKind Op2VK =
6283         TTI.getOperandInfo(Op2, Op2VP);
6284     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6285       Op2VK = TargetTransformInfo::OK_UniformValue;
6286 
6287     SmallVector<const Value *, 4> Operands(I->operand_values());
6288     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6289     return N * TTI.getArithmeticInstrCost(
6290                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6291                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6292   }
6293   case Instruction::FNeg: {
6294     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6295     return N * TTI.getArithmeticInstrCost(
6296                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6297                    TargetTransformInfo::OK_AnyValue,
6298                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6299                    I->getOperand(0), I);
6300   }
6301   case Instruction::Select: {
6302     SelectInst *SI = cast<SelectInst>(I);
6303     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6304     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6305     Type *CondTy = SI->getCondition()->getType();
6306     if (!ScalarCond)
6307       CondTy = VectorType::get(CondTy, VF);
6308 
6309     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6310   }
6311   case Instruction::ICmp:
6312   case Instruction::FCmp: {
6313     Type *ValTy = I->getOperand(0)->getType();
6314     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6315     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6316       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6317     VectorTy = ToVectorTy(ValTy, VF);
6318     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6319   }
6320   case Instruction::Store:
6321   case Instruction::Load: {
6322     unsigned Width = VF;
6323     if (Width > 1) {
6324       InstWidening Decision = getWideningDecision(I, Width);
6325       assert(Decision != CM_Unknown &&
6326              "CM decision should be taken at this point");
6327       if (Decision == CM_Scalarize)
6328         Width = 1;
6329     }
6330     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6331     return getMemoryInstructionCost(I, VF);
6332   }
6333   case Instruction::ZExt:
6334   case Instruction::SExt:
6335   case Instruction::FPToUI:
6336   case Instruction::FPToSI:
6337   case Instruction::FPExt:
6338   case Instruction::PtrToInt:
6339   case Instruction::IntToPtr:
6340   case Instruction::SIToFP:
6341   case Instruction::UIToFP:
6342   case Instruction::Trunc:
6343   case Instruction::FPTrunc:
6344   case Instruction::BitCast: {
6345     // We optimize the truncation of induction variables having constant
6346     // integer steps. The cost of these truncations is the same as the scalar
6347     // operation.
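    // For example, "trunc i64 %iv to i32", where %iv is an induction variable
    // with a constant integer step, is costed as a plain scalar trunc because
    // the truncated induction can be generated directly.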
6348     if (isOptimizableIVTruncate(I, VF)) {
6349       auto *Trunc = cast<TruncInst>(I);
6350       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6351                                   Trunc->getSrcTy(), Trunc);
6352     }
6353 
6354     Type *SrcScalarTy = I->getOperand(0)->getType();
6355     Type *SrcVecTy =
6356         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6357     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or turn it
      // into a slightly different cast. For example, if MinBW == 16,
6360       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6361       //
6362       // Calculate the modified src and dest types.
6363       Type *MinVecTy = VectorTy;
6364       if (I->getOpcode() == Instruction::Trunc) {
6365         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6366         VectorTy =
6367             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6368       } else if (I->getOpcode() == Instruction::ZExt ||
6369                  I->getOpcode() == Instruction::SExt) {
6370         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6371         VectorTy =
6372             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6373       }
6374     }
6375 
6376     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6377     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6378   }
6379   case Instruction::Call: {
6380     bool NeedToScalarize;
6381     CallInst *CI = cast<CallInst>(I);
6382     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6383     if (getVectorIntrinsicIDForCall(CI, TLI))
6384       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6385     return CallCost;
6386   }
6387   default:
6388     // The cost of executing VF copies of the scalar instruction. This opcode
6389     // is unknown. Assume that it is the same as 'mul'.
6390     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6391            getScalarizationOverhead(I, VF);
6392   } // end of switch.
6393 }
6394 
6395 char LoopVectorize::ID = 0;
6396 
6397 static const char lv_name[] = "Loop Vectorization";
6398 
6399 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6400 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6401 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6402 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6403 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6404 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6405 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6406 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6407 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6408 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6409 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6410 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6411 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6412 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6413 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6414 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6415 
6416 namespace llvm {
6417 
6418 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6419 
6420 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6421                               bool VectorizeOnlyWhenForced) {
6422   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6423 }
6424 
6425 } // end namespace llvm
6426 
6427 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6428   // Check if the pointer operand of a load or store instruction is
6429   // consecutive.
6430   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6431     return Legal->isConsecutivePtr(Ptr);
6432   return false;
6433 }
6434 
6435 void LoopVectorizationCostModel::collectValuesToIgnore() {
6436   // Ignore ephemeral values.
6437   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6438 
6439   // Ignore type-promoting instructions we identified during reduction
6440   // detection.
6441   for (auto &Reduction : Legal->getReductionVars()) {
6442     RecurrenceDescriptor &RedDes = Reduction.second;
6443     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6444     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6445   }
6446   // Ignore type-casting instructions we identified during induction
6447   // detection.
6448   for (auto &Induction : Legal->getInductionVars()) {
6449     InductionDescriptor &IndDes = Induction.second;
6450     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6451     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6452   }
6453 }
6454 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
6460 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6461                                  LoopVectorizationCostModel &CM) {
6462   unsigned WidestType;
6463   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6464   return WidestVectorRegBits / WidestType;
6465 }
6466 
6467 VectorizationFactor
6468 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6469   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
6472   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6473   // the vectorization pipeline.
6474   if (!OrigLoop->empty()) {
6475     // If the user doesn't provide a vectorization factor, determine a
6476     // reasonable one.
6477     if (!UserVF) {
6478       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6479       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6480 
6481       // Make sure we have a VF > 1 for stress testing.
6482       if (VPlanBuildStressTest && VF < 2) {
6483         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6484                           << "overriding computed VF.\n");
6485         VF = 4;
6486       }
6487     }
6488     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6489     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6490     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6491                       << " to build VPlans.\n");
6492     buildVPlans(VF, VF);
6493 
6494     // For VPlan build stress testing, we bail out after VPlan construction.
6495     if (VPlanBuildStressTest)
6496       return VectorizationFactor::Disabled();
6497 
6498     return {VF, 0};
6499   }
6500 
6501   LLVM_DEBUG(
6502       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6503                 "VPlan-native path.\n");
6504   return VectorizationFactor::Disabled();
6505 }
6506 
6507 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6508   assert(OrigLoop->empty() && "Inner loop expected.");
6509   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6511     return None;
6512 
6513   // Invalidate interleave groups if all blocks of loop will be predicated.
6514   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6515       !useMaskedInterleavedAccesses(*TTI)) {
6516     LLVM_DEBUG(
6517         dbgs()
6518         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6519            "which requires masked-interleaved support.\n");
6520     CM.InterleaveInfo.reset();
6521   }
6522 
6523   if (UserVF) {
6524     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6525     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6526     // Collect the instructions (and their associated costs) that will be more
6527     // profitable to scalarize.
6528     CM.selectUserVectorizationFactor(UserVF);
6529     buildVPlansWithVPRecipes(UserVF, UserVF);
6530     LLVM_DEBUG(printPlans(dbgs()));
6531     return {{UserVF, 0}};
6532   }
6533 
6534   unsigned MaxVF = MaybeMaxVF.getValue();
6535   assert(MaxVF != 0 && "MaxVF is zero.");
6536 
6537   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6538     // Collect Uniform and Scalar instructions after vectorization with VF.
6539     CM.collectUniformsAndScalars(VF);
6540 
6541     // Collect the instructions (and their associated costs) that will be more
6542     // profitable to scalarize.
6543     if (VF > 1)
6544       CM.collectInstsToScalarize(VF);
6545   }
6546 
6547   buildVPlansWithVPRecipes(1, MaxVF);
6548   LLVM_DEBUG(printPlans(dbgs()));
6549   if (MaxVF == 1)
6550     return VectorizationFactor::Disabled();
6551 
6552   // Select the optimal vectorization factor.
6553   return CM.selectVectorizationFactor(MaxVF);
6554 }
6555 
6556 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6557   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6558                     << '\n');
6559   BestVF = VF;
6560   BestUF = UF;
6561 
6562   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6563     return !Plan->hasVF(VF);
6564   });
6565   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6566 }
6567 
6568 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6569                                            DominatorTree *DT) {
6570   // Perform the actual loop transformation.
6571 
6572   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6573   VPCallbackILV CallbackILV(ILV);
6574 
6575   VPTransformState State{BestVF, BestUF,      LI,
6576                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6577                          &ILV,   CallbackILV};
6578   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6579   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6580   State.CanonicalIV = ILV.Induction;
6581 
6582   //===------------------------------------------------===//
6583   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
6586   // the cost-model.
6587   //
6588   //===------------------------------------------------===//
6589 
6590   // 2. Copy and widen instructions from the old loop into the new loop.
6591   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6592   VPlans.front()->execute(&State);
6593 
6594   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6595   //    predication, updating analyses.
6596   ILV.fixVectorizedLoop();
6597 }
6598 
6599 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6600     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6601   BasicBlock *Latch = OrigLoop->getLoopLatch();
6602 
6603   // We create new control-flow for the vectorized loop, so the original
6604   // condition will be dead after vectorization if it's only used by the
6605   // branch.
6606   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6607   if (Cmp && Cmp->hasOneUse())
6608     DeadInstructions.insert(Cmp);
6609 
6610   // We create new "steps" for induction variable updates to which the original
6611   // induction variables map. An original update instruction will be dead if
6612   // all its users except the induction variable are dead.
6613   for (auto &Induction : Legal->getInductionVars()) {
6614     PHINode *Ind = Induction.first;
6615     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6616     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6617           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6618                                  DeadInstructions.end();
6619         }))
6620       DeadInstructions.insert(IndUpdate);
6621 
6622     // We record as "Dead" also the type-casting instructions we had identified
6623     // during induction analysis. We don't need any handling for them in the
6624     // vectorized loop because we have proven that, under a proper runtime
6625     // test guarding the vectorized loop, the value of the phi, and the casted
6626     // value of the phi, are the same. The last instruction in this casting chain
6627     // will get its scalar/vector/widened def from the scalar/vector/widened def
6628     // of the respective phi node. Any other casts in the induction def-use chain
6629     // have no other uses outside the phi update chain, and will be ignored.
6630     InductionDescriptor &IndDes = Induction.second;
6631     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6632     DeadInstructions.insert(Casts.begin(), Casts.end());
6633   }
6634 }
6635 
6636 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6637 
6638 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6639 
6640 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6641                                         Instruction::BinaryOps BinOp) {
6642   // When unrolling and the VF is 1, we only need to add a simple scalar.
6643   Type *Ty = Val->getType();
6644   assert(!Ty->isVectorTy() && "Val must be a scalar");
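  // For example, for Val = %v, StartIdx = 3 and Step = %s this produces
  // %v + 3 * %s (for floating-point types, BinOp is applied to %v and
  // 3.0 * %s, with fast-math flags attached).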
6645 
6646   if (Ty->isFloatingPointTy()) {
6647     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6648 
6649     // Floating point operations had to be 'fast' to enable the unrolling.
6650     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6651     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6652   }
6653   Constant *C = ConstantInt::get(Ty, StartIdx);
6654   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6655 }
6656 
6657 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
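  // Add "llvm.loop.unroll.runtime.disable" to the loop's !llvm.loop metadata,
  // unless unroll-disable metadata is already present. For example, the
  // resulting IR typically looks like:
  //   br i1 %cond, label %header, label %exit, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.unroll.runtime.disable"}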
6658   SmallVector<Metadata *, 4> MDs;
6659   // Reserve first location for self reference to the LoopID metadata node.
6660   MDs.push_back(nullptr);
6661   bool IsUnrollMetadata = false;
6662   MDNode *LoopID = L->getLoopID();
6663   if (LoopID) {
6664     // First find existing loop unrolling disable metadata.
6665     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6666       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6667       if (MD) {
6668         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6669         IsUnrollMetadata =
6670             S && S->getString().startswith("llvm.loop.unroll.disable");
6671       }
6672       MDs.push_back(LoopID->getOperand(i));
6673     }
6674   }
6675 
6676   if (!IsUnrollMetadata) {
6677     // Add runtime unroll disable metadata.
6678     LLVMContext &Context = L->getHeader()->getContext();
6679     SmallVector<Metadata *, 1> DisableOperands;
6680     DisableOperands.push_back(
6681         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6682     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6683     MDs.push_back(DisableNode);
6684     MDNode *NewLoopID = MDNode::get(Context, MDs);
6685     // Set operand 0 to refer to the loop id itself.
6686     NewLoopID->replaceOperandWith(0, NewLoopID);
6687     L->setLoopID(NewLoopID);
6688   }
6689 }
6690 
6691 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6692     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6693   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6694   bool PredicateAtRangeStart = Predicate(Range.Start);
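  // For example, for Range = {1, 9}, if the predicate holds for VF = 1 and
  // VF = 2 but not for VF = 4, the loop below clamps Range.End to 4 and we
  // return the predicate's value at VF = 1.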
6695 
6696   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6697     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6698       Range.End = TmpVF;
6699       break;
6700     }
6701 
6702   return PredicateAtRangeStart;
6703 }
6704 
6705 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6706 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6707 /// of VF's starting at a given VF and extending it as much as possible. Each
6708 /// vectorization decision can potentially shorten this sub-range during
6709 /// buildVPlan().
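/// For example, with MinVF = 1 and MaxVF = 8, the first call may clamp its
/// sub-range from {1, 9} down to {1, 4}, yielding a VPlan that covers VF = 1
/// and VF = 2; the next iteration then starts at VF = 4 and builds a VPlan
/// covering VF = 4 and VF = 8.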
6710 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6711   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6712     VFRange SubRange = {VF, MaxVF + 1};
6713     VPlans.push_back(buildVPlan(SubRange));
6714     VF = SubRange.End;
6715   }
6716 }
6717 
6718 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6719                                          VPlanPtr &Plan) {
6720   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6721 
6722   // Look for cached value.
6723   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6724   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6725   if (ECEntryIt != EdgeMaskCache.end())
6726     return ECEntryIt->second;
6727 
6728   VPValue *SrcMask = createBlockInMask(Src, Plan);
6729 
6730   // The terminator has to be a branch inst!
6731   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6732   assert(BI && "Unexpected terminator found");
6733 
6734   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6735     return EdgeMaskCache[Edge] = SrcMask;
6736 
6737   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6738   assert(EdgeMask && "No Edge Mask found for condition");
6739 
6740   if (BI->getSuccessor(0) != Dst)
6741     EdgeMask = Builder.createNot(EdgeMask);
6742 
6743   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6744     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6745 
6746   return EdgeMaskCache[Edge] = EdgeMask;
6747 }
6748 
6749 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6750   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6751 
6752   // Look for cached value.
6753   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6754   if (BCEntryIt != BlockMaskCache.end())
6755     return BCEntryIt->second;
6756 
6757   // All-one mask is modelled as no-mask following the convention for masked
6758   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6759   VPValue *BlockMask = nullptr;
6760 
6761   if (OrigLoop->getHeader() == BB) {
6762     if (!CM.blockNeedsPredication(BB))
6763       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6764 
6765     // Introduce the early-exit compare IV <= BTC to form header block mask.
6766     // This is used instead of IV < TC because TC may wrap, unlike BTC.
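    // For example, with a trip count of 10 and VF = 4, BTC is 9; the lanes of
    // the third vector iteration carry IV values {8, 9, 10, 11} and get the
    // mask {1, 1, 0, 0}, disabling the two out-of-range lanes.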
6767     // Start by constructing the desired canonical IV.
6768     VPValue *IV = nullptr;
6769     if (Legal->getPrimaryInduction())
6770       IV = Plan->getVPValue(Legal->getPrimaryInduction());
6771     else {
6772       auto IVRecipe = new VPWidenCanonicalIVRecipe();
6773       Builder.getInsertBlock()->appendRecipe(IVRecipe);
6774       IV = IVRecipe->getVPValue();
6775     }
6776     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6777     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6778     return BlockMaskCache[BB] = BlockMask;
6779   }
6780 
6781   // This is the block mask. We OR all incoming edges.
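  // For example, a block with two predecessors Then and Else gets
  // mask(Then -> BB) | mask(Else -> BB) as its block-in mask.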
6782   for (auto *Predecessor : predecessors(BB)) {
6783     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6784     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6785       return BlockMaskCache[BB] = EdgeMask;
6786 
6787     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6788       BlockMask = EdgeMask;
6789       continue;
6790     }
6791 
6792     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6793   }
6794 
6795   return BlockMaskCache[BB] = BlockMask;
6796 }
6797 
6798 VPWidenMemoryInstructionRecipe *
6799 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6800                                   VPlanPtr &Plan) {
6801   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6802     return nullptr;
6803 
6804   auto willWiden = [&](unsigned VF) -> bool {
6805     if (VF == 1)
6806       return false;
6807     LoopVectorizationCostModel::InstWidening Decision =
6808         CM.getWideningDecision(I, VF);
6809     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6810            "CM decision should be taken at this point.");
6811     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6812       return true;
6813     if (CM.isScalarAfterVectorization(I, VF) ||
6814         CM.isProfitableToScalarize(I, VF))
6815       return false;
6816     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6817   };
6818 
6819   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6820     return nullptr;
6821 
6822   VPValue *Mask = nullptr;
6823   if (Legal->isMaskRequired(I))
6824     Mask = createBlockInMask(I->getParent(), Plan);
6825 
6826   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6827   if (LoadInst *Load = dyn_cast<LoadInst>(I))
6828     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
6829 
6830   StoreInst *Store = cast<StoreInst>(I);
6831   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
6832   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
6833 }
6834 
6835 VPWidenIntOrFpInductionRecipe *
6836 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6837   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6838     // Check if this is an integer or fp induction. If so, build the recipe that
6839     // produces its scalar and vector values.
6840     InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6841     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6842         II.getKind() == InductionDescriptor::IK_FpInduction)
6843       return new VPWidenIntOrFpInductionRecipe(Phi);
6844 
6845     return nullptr;
6846   }
6847 
6848   // Optimize the special case where the source is a constant integer
6849   // induction variable. Notice that we can only optimize the 'trunc' case
6850   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6851   // (c) other casts depend on pointer size.
6852 
6853   // Determine whether \p K is a truncation based on an induction variable that
6854   // can be optimized.
6855   auto isOptimizableIVTruncate =
6856       [&](Instruction *K) -> std::function<bool(unsigned)> {
6857     return
6858         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6859   };
6860 
6861   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6862                                isOptimizableIVTruncate(I), Range))
6863     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6864                                              cast<TruncInst>(I));
6865   return nullptr;
6866 }
6867 
6868 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6869   PHINode *Phi = dyn_cast<PHINode>(I);
6870   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6871     return nullptr;
6872 
6873   // We know that all PHIs in non-header blocks are converted into selects, so
6874   // we don't have to worry about the insertion order and we can just use the
6875   // builder. At this point we generate the predication tree. There may be
6876   // duplications since this is a simple recursive scan, but future
6877   // optimizations will clean it up.
6878 
6879   SmallVector<VPValue *, 2> Operands;
6880   unsigned NumIncoming = Phi->getNumIncomingValues();
6881   for (unsigned In = 0; In < NumIncoming; In++) {
6882     VPValue *EdgeMask =
6883       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6884     assert((EdgeMask || NumIncoming == 1) &&
6885            "Multiple predecessors with one having a full mask");
6886     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
6887     if (EdgeMask)
6888       Operands.push_back(EdgeMask);
6889   }
6890   return new VPBlendRecipe(Phi, Operands);
6891 }
6892 
6893 VPWidenCallRecipe *
6894 VPRecipeBuilder::tryToWidenCall(Instruction *I, VFRange &Range, VPlan &Plan) {
6895 
6896   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6897       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6898 
6899   CallInst *CI = dyn_cast<CallInst>(I);
6900   if (IsPredicated || !CI)
6901     return nullptr;
6902 
6903   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6904   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6905              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6906     return nullptr;
6907 
6908   auto willWiden = [&](unsigned VF) -> bool {
6909     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a regular call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform the intrinsic call compared to the library
    // call?
6914     bool NeedToScalarize = false;
6915     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6916     bool UseVectorIntrinsic =
6917         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6918     return UseVectorIntrinsic || !NeedToScalarize;
6919   };
6920 
6921   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6922     return nullptr;
6923 
6924   // Success: widen this call.
6925   auto VPValues = map_range(CI->arg_operands(), [&Plan](Value *Op) {
6926     return Plan.getOrAddVPValue(Op);
6927   });
6928 
6929   return new VPWidenCallRecipe(*CI, VPValues);
6930 }
6931 
6932 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
6933   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
6934          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
6935   // Instruction should be widened, unless it is scalar after vectorization,
6936   // scalarization is profitable or it is predicated.
6937   auto WillScalarize = [this, I](unsigned VF) -> bool {
6938     return CM.isScalarAfterVectorization(I, VF) ||
6939            CM.isProfitableToScalarize(I, VF) ||
6940            CM.isScalarWithPredication(I, VF);
6941   };
6942   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
6943                                                              Range);
6944 }
6945 
6946 VPWidenSelectRecipe *VPRecipeBuilder::tryToWidenSelect(Instruction *I) {
6947   auto *SI = dyn_cast<SelectInst>(I);
6948   if (!SI)
6949     return nullptr;
6950   auto *SE = PSE.getSE();
6951   bool InvariantCond =
6952       SE->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
6953   // Success: widen this instruction.
6954   return new VPWidenSelectRecipe(*SI, InvariantCond);
6955 }
6956 
6957 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) {
6958   auto IsVectorizableOpcode = [](unsigned Opcode) {
6959     switch (Opcode) {
6960     case Instruction::Add:
6961     case Instruction::And:
6962     case Instruction::AShr:
6963     case Instruction::BitCast:
6964     case Instruction::FAdd:
6965     case Instruction::FCmp:
6966     case Instruction::FDiv:
6967     case Instruction::FMul:
6968     case Instruction::FNeg:
6969     case Instruction::FPExt:
6970     case Instruction::FPToSI:
6971     case Instruction::FPToUI:
6972     case Instruction::FPTrunc:
6973     case Instruction::FRem:
6974     case Instruction::FSub:
6975     case Instruction::ICmp:
6976     case Instruction::IntToPtr:
6977     case Instruction::LShr:
6978     case Instruction::Mul:
6979     case Instruction::Or:
6980     case Instruction::PtrToInt:
6981     case Instruction::SDiv:
6982     case Instruction::Select:
6983     case Instruction::SExt:
6984     case Instruction::Shl:
6985     case Instruction::SIToFP:
6986     case Instruction::SRem:
6987     case Instruction::Sub:
6988     case Instruction::Trunc:
6989     case Instruction::UDiv:
6990     case Instruction::UIToFP:
6991     case Instruction::URem:
6992     case Instruction::Xor:
6993     case Instruction::ZExt:
6994       return true;
6995     }
6996     return false;
6997   };
6998 
6999   if (!IsVectorizableOpcode(I->getOpcode()))
7000     return nullptr;
7001 
7002   // Success: widen this instruction.
7003   return new VPWidenRecipe(*I);
7004 }
7005 
7006 VPBasicBlock *VPRecipeBuilder::handleReplication(
7007     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7008     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7009     VPlanPtr &Plan) {
7010   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7011       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7012       Range);
7013 
7014   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7015       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7016 
7017   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
7018   setRecipe(I, Recipe);
7019 
7020   // Find if I uses a predicated instruction. If so, it will use its scalar
7021   // value. Avoid hoisting the insert-element which packs the scalar value into
7022   // a vector value, as that happens iff all users use the vector value.
7023   for (auto &Op : I->operands())
7024     if (auto *PredInst = dyn_cast<Instruction>(Op))
7025       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7026         PredInst2Recipe[PredInst]->setAlsoPack(false);
7027 
7028   // Finalize the recipe for Instr, first if it is not predicated.
7029   if (!IsPredicated) {
7030     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7031     VPBB->appendRecipe(Recipe);
7032     return VPBB;
7033   }
7034   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7035   assert(VPBB->getSuccessors().empty() &&
7036          "VPBB has successors when handling predicated replication.");
7037   // Record predicated instructions for above packing optimizations.
7038   PredInst2Recipe[I] = Recipe;
7039   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7040   VPBlockUtils::insertBlockAfter(Region, VPBB);
7041   auto *RegSucc = new VPBasicBlock();
7042   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7043   return RegSucc;
7044 }
7045 
7046 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7047                                                       VPRecipeBase *PredRecipe,
7048                                                       VPlanPtr &Plan) {
7049   // Instructions marked for predication are replicated and placed under an
7050   // if-then construct to prevent side-effects.
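  // For example, replicating a predicated load yields the region
  //   pred.load.entry -> pred.load.if -> pred.load.continue
  // where the entry block branches on the mask (into pred.load.if or straight
  // to pred.load.continue), the "if" block holds the replicated load, and the
  // "continue" block holds the phi for the loaded value.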
7051 
7052   // Generate recipes to compute the block mask for this region.
7053   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7054 
7055   // Build the triangular if-then region.
7056   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7057   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7058   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7059   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7060   auto *PHIRecipe =
7061       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7062   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7063   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7064   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7065 
7066   // Note: first set Entry as region entry and then connect successors starting
7067   // from it in order, to propagate the "parent" of each VPBasicBlock.
7068   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7069   VPBlockUtils::connectBlocks(Pred, Exit);
7070 
7071   return Region;
7072 }
7073 
7074 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7075                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7076   VPRecipeBase *Recipe = nullptr;
7077 
7078   // First, check for specific widening recipes that deal with calls, memory
7079   // operations, inductions and Phi nodes.
7080   if ((Recipe = tryToWidenCall(Instr, Range, *Plan)) ||
7081       (Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7082       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7083       (Recipe = tryToBlend(Instr, Plan)) ||
7084       (isa<PHINode>(Instr) &&
7085        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7086     setRecipe(Instr, Recipe);
7087     VPBB->appendRecipe(Recipe);
7088     return true;
7089   }
7090 
7091   // Calls and memory instructions are widened by the specialized recipes above,
7092   // or scalarized.
7093   if (isa<CallInst>(Instr) || isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7094     return false;
7095 
7096   if (!shouldWiden(Instr, Range))
7097     return false;
7098 
7099   if ((Recipe = tryToWidenSelect(Instr)) ||
7100       (isa<GetElementPtrInst>(Instr) &&
7101        (Recipe =
7102             new VPWidenGEPRecipe(cast<GetElementPtrInst>(Instr), OrigLoop))) ||
7103       (Recipe = tryToWiden(Instr, *Plan))) {
7104     setRecipe(Instr, Recipe);
7105     VPBB->appendRecipe(Recipe);
7106     return true;
7107   }
7108 
7109   return false;
7110 }
7111 
7112 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7113                                                         unsigned MaxVF) {
7114   assert(OrigLoop->empty() && "Inner loop expected.");
7115 
7116   // Collect conditions feeding internal conditional branches; they need to be
7117   // represented in VPlan for it to model masking.
7118   SmallPtrSet<Value *, 1> NeedDef;
7119 
7120   auto *Latch = OrigLoop->getLoopLatch();
7121   for (BasicBlock *BB : OrigLoop->blocks()) {
7122     if (BB == Latch)
7123       continue;
7124     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7125     if (Branch && Branch->isConditional())
7126       NeedDef.insert(Branch->getCondition());
7127   }
7128 
  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for VPlan to model early-exit
  // masking.
7131   // Also, both the Phi and the live-out instruction of each reduction are
7132   // required in order to introduce a select between them in VPlan.
7133   if (CM.foldTailByMasking()) {
7134     if (Legal->getPrimaryInduction())
7135       NeedDef.insert(Legal->getPrimaryInduction());
7136     for (auto &Reduction : Legal->getReductionVars()) {
7137       NeedDef.insert(Reduction.first);
7138       NeedDef.insert(Reduction.second.getLoopExitInstr());
7139     }
7140   }
7141 
7142   // Collect instructions from the original loop that will become trivially dead
7143   // in the vectorized loop. We don't need to vectorize these instructions. For
7144   // example, original induction update instructions can become dead because we
7145   // separately emit induction "steps" when generating code for the new loop.
7146   // Similarly, we create a new latch condition when setting up the structure
7147   // of the new loop, so the old one can become dead.
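  // For example, the original update "%iv.next = add i64 %iv, 1" typically
  // becomes dead once its only non-IV user, the old latch compare, is itself
  // collected as dead.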
7148   SmallPtrSet<Instruction *, 4> DeadInstructions;
7149   collectTriviallyDeadInstructions(DeadInstructions);
7150 
7151   // Add assume instructions we need to drop to DeadInstructions, to prevent
7152   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7154   // control flow is preserved, we should keep them.
7155   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7156   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7157 
7158   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7159   // Dead instructions do not need sinking. Remove them from SinkAfter.
7160   for (Instruction *I : DeadInstructions)
7161     SinkAfter.erase(I);
7162 
7163   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7164     VFRange SubRange = {VF, MaxVF + 1};
7165     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7166                                              DeadInstructions, SinkAfter));
7167     VF = SubRange.End;
7168   }
7169 }
7170 
7171 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7172     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7173     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7174     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7175 
7176   // Hold a mapping from predicated instructions to their recipes, in order to
7177   // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7179   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7180 
7181   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7182 
7183   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7184 
7185   // ---------------------------------------------------------------------------
7186   // Pre-construction: record ingredients whose recipes we'll need to further
7187   // process after constructing the initial VPlan.
7188   // ---------------------------------------------------------------------------
7189 
7190   // Mark instructions we'll need to sink later and their targets as
7191   // ingredients whose recipe we'll need to record.
7192   for (auto &Entry : SinkAfter) {
7193     RecipeBuilder.recordRecipeOf(Entry.first);
7194     RecipeBuilder.recordRecipeOf(Entry.second);
7195   }
7196 
7197   // For each interleave group which is relevant for this (possibly trimmed)
7198   // Range, add it to the set of groups to be later applied to the VPlan and add
7199   // placeholders for its members' Recipes which we'll be replacing with a
7200   // single VPInterleaveRecipe.
7201   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7202     auto applyIG = [IG, this](unsigned VF) -> bool {
7203       return (VF >= 2 && // Query is illegal for VF == 1
7204               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7205                   LoopVectorizationCostModel::CM_Interleave);
7206     };
7207     if (!getDecisionAndClampRange(applyIG, Range))
7208       continue;
7209     InterleaveGroups.insert(IG);
7210     for (unsigned i = 0; i < IG->getFactor(); i++)
7211       if (Instruction *Member = IG->getMember(i))
7212         RecipeBuilder.recordRecipeOf(Member);
7213   };
7214 
7215   // ---------------------------------------------------------------------------
7216   // Build initial VPlan: Scan the body of the loop in a topological order to
7217   // visit each basic block after having visited its predecessor basic blocks.
7218   // ---------------------------------------------------------------------------
7219 
7220   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7221   auto Plan = std::make_unique<VPlan>();
7222   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7223   Plan->setEntry(VPBB);
7224 
7225   // Represent values that will have defs inside VPlan.
7226   for (Value *V : NeedDef)
7227     Plan->addVPValue(V);
7228 
7229   // Scan the body of the loop in a topological order to visit each basic block
7230   // after having visited its predecessor basic blocks.
7231   LoopBlocksDFS DFS(OrigLoop);
7232   DFS.perform(LI);
7233 
7234   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7235     // Relevant instructions from basic block BB will be grouped into VPRecipe
7236     // ingredients and fill a new VPBasicBlock.
7237     unsigned VPBBsForBB = 0;
7238     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7239     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7240     VPBB = FirstVPBBForBB;
7241     Builder.setInsertPoint(VPBB);
7242 
7243     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7245     for (Instruction &I : BB->instructionsWithoutDebug()) {
7246       Instruction *Instr = &I;
7247 
7248       // First filter out irrelevant instructions, to ensure no recipes are
7249       // built for them.
7250       if (isa<BranchInst>(Instr) ||
7251           DeadInstructions.find(Instr) != DeadInstructions.end())
7252         continue;
7253 
7254       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7255         continue;
7256 
7257       // Otherwise, if all widening options failed, Instruction is to be
7258       // replicated. This may create a successor for VPBB.
7259       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7260           Instr, Range, VPBB, PredInst2Recipe, Plan);
7261       if (NextVPBB != VPBB) {
7262         VPBB = NextVPBB;
7263         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7264                                     : "");
7265       }
7266     }
7267   }
7268 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7272   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7273   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7274   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7275   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7276   delete PreEntry;
7277 
7278   // ---------------------------------------------------------------------------
7279   // Transform initial VPlan: Apply previously taken decisions, in order, to
7280   // bring the VPlan to its final state.
7281   // ---------------------------------------------------------------------------
7282 
7283   // Apply Sink-After legal constraints.
7284   for (auto &Entry : SinkAfter) {
7285     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7286     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7287     Sink->moveAfter(Target);
7288   }
7289 
7290   // Interleave memory: for each Interleave Group we marked earlier as relevant
7291   // for this VPlan, replace the Recipes widening its memory instructions with a
7292   // single VPInterleaveRecipe at its insertion point.
7293   for (auto IG : InterleaveGroups) {
7294     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7295         RecipeBuilder.getRecipe(IG->getInsertPos()));
7296     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7297         ->insertBefore(Recipe);
7298 
7299     for (unsigned i = 0; i < IG->getFactor(); ++i)
7300       if (Instruction *Member = IG->getMember(i)) {
7301         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7302       }
7303   }
7304 
7305   // Finally, if tail is folded by masking, introduce selects between the phi
7306   // and the live-out instruction of each reduction, at the end of the latch.
7307   if (CM.foldTailByMasking()) {
7308     Builder.setInsertPoint(VPBB);
7309     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7310     for (auto &Reduction : Legal->getReductionVars()) {
7311       VPValue *Phi = Plan->getVPValue(Reduction.first);
7312       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7313       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7314     }
7315   }
7316 
7317   std::string PlanName;
7318   raw_string_ostream RSO(PlanName);
7319   unsigned VF = Range.Start;
7320   Plan->addVF(VF);
7321   RSO << "Initial VPlan for VF={" << VF;
7322   for (VF *= 2; VF < Range.End; VF *= 2) {
7323     Plan->addVF(VF);
7324     RSO << "," << VF;
7325   }
7326   RSO << "},UF>=1";
7327   RSO.flush();
7328   Plan->setName(PlanName);
7329 
7330   return Plan;
7331 }
7332 
7333 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7334   // Outer loop handling: They may require CFG and instruction level
7335   // transformations before even evaluating whether vectorization is profitable.
7336   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7337   // the vectorization pipeline.
  assert(!OrigLoop->empty() && "Expected outer loop.");
7339   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7340 
7341   // Create new empty VPlan
7342   auto Plan = std::make_unique<VPlan>();
7343 
7344   // Build hierarchical CFG
7345   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7346   HCFGBuilder.buildHierarchicalCFG();
7347 
7348   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7349     Plan->addVF(VF);
7350 
7351   if (EnableVPlanPredication) {
7352     VPlanPredicator VPP(*Plan);
7353     VPP.predicate();
7354 
7355     // Avoid running transformation to recipes until masked code generation in
7356     // VPlan-native path is in place.
7357     return Plan;
7358   }
7359 
7360   SmallPtrSet<Instruction *, 1> DeadInstructions;
7361   VPlanTransforms::VPInstructionsToVPRecipes(
7362       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7363   return Plan;
7364 }
7365 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7370 
7371 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7372     Value *V, const VPIteration &Instance) {
7373   return ILV.getOrCreateScalarValue(V, Instance);
7374 }
7375 
7376 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7377                                VPSlotTracker &SlotTracker) const {
7378   O << " +\n"
7379     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7380   IG->getInsertPos()->printAsOperand(O, false);
7381   O << ", ";
7382   getAddr()->printAsOperand(O, SlotTracker);
7383   VPValue *Mask = getMask();
7384   if (Mask) {
7385     O << ", ";
7386     Mask->printAsOperand(O, SlotTracker);
7387   }
7388   O << "\\l\"";
7389   for (unsigned i = 0; i < IG->getFactor(); ++i)
7390     if (Instruction *I = IG->getMember(i))
7391       O << " +\n"
7392         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7393 }
7394 
7395 void VPWidenCallRecipe::execute(VPTransformState &State) {
7396   State.ILV->widenCallInstruction(Ingredient, User, State);
7397 }
7398 
7399 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7400   State.ILV->widenSelectInstruction(Ingredient, InvariantCond);
7401 }
7402 
7403 void VPWidenRecipe::execute(VPTransformState &State) {
7404   State.ILV->widenInstruction(Ingredient);
7405 }
7406 
7407 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7408   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7409                       IsIndexLoopInvariant);
7410 }
7411 
7412 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7413   assert(!State.Instance && "Int or FP induction being replicated.");
7414   State.ILV->widenIntOrFpInduction(IV, Trunc);
7415 }
7416 
7417 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7418   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7419 }
7420 
7421 void VPBlendRecipe::execute(VPTransformState &State) {
7422   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7423   // We know that all PHIs in non-header blocks are converted into
7424   // selects, so we don't have to worry about the insertion order and we
7425   // can just use the builder.
7426   // At this point we generate the predication tree. There may be
7427   // duplications since this is a simple recursive scan, but future
7428   // optimizations will clean it up.
7429 
7430   unsigned NumIncoming = getNumIncomingValues();
7431 
7432   // Generate a sequence of selects of the form:
7433   // SELECT(Mask3, In3,
7434   //        SELECT(Mask2, In2,
7435   //               SELECT(Mask1, In1,
7436   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
7439   InnerLoopVectorizer::VectorParts Entry(State.UF);
7440   for (unsigned In = 0; In < NumIncoming; ++In) {
7441     for (unsigned Part = 0; Part < State.UF; ++Part) {
7442       // We might have single edge PHIs (blocks) - use an identity
7443       // 'select' for the first PHI operand.
7444       Value *In0 = State.get(getIncomingValue(In), Part);
7445       if (In == 0)
7446         Entry[Part] = In0; // Initialize with the first incoming value.
7447       else {
7448         // Select between the current value and the previous incoming edge
7449         // based on the incoming mask.
7450         Value *Cond = State.get(getMask(In), Part);
7451         Entry[Part] =
7452             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7453       }
7454     }
7455   }
7456   for (unsigned Part = 0; Part < State.UF; ++Part)
7457     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7458 }
7459 
7460 void VPInterleaveRecipe::execute(VPTransformState &State) {
7461   assert(!State.Instance && "Interleave group being replicated.");
7462   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7463 }
7464 
7465 void VPReplicateRecipe::execute(VPTransformState &State) {
7466   if (State.Instance) { // Generate a single instance.
7467     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7468     // Insert scalar instance packing it into a vector.
7469     if (AlsoPack && State.VF > 1) {
7470       // If we're constructing lane 0, initialize to start from undef.
7471       if (State.Instance->Lane == 0) {
7472         Value *Undef =
7473             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7474         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7475       }
7476       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7477     }
7478     return;
7479   }
7480 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane of each
  // UF part.
7484   unsigned EndLane = IsUniform ? 1 : State.VF;
7485   for (unsigned Part = 0; Part < State.UF; ++Part)
7486     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7487       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7488 }
7489 
7490 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7491   assert(State.Instance && "Branch on Mask works only on single instance.");
7492 
7493   unsigned Part = State.Instance->Part;
7494   unsigned Lane = State.Instance->Lane;
7495 
7496   Value *ConditionBit = nullptr;
7497   if (!User) // Block in mask is all-one.
7498     ConditionBit = State.Builder.getTrue();
7499   else {
7500     VPValue *BlockInMask = User->getOperand(0);
7501     ConditionBit = State.get(BlockInMask, Part);
7502     if (ConditionBit->getType()->isVectorTy())
7503       ConditionBit = State.Builder.CreateExtractElement(
7504           ConditionBit, State.Builder.getInt32(Lane));
7505   }
7506 
7507   // Replace the temporary unreachable terminator with a new conditional branch,
7508   // whose two destinations will be set later when they are created.
7509   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7510   assert(isa<UnreachableInst>(CurrentTerminator) &&
7511          "Expected to replace unreachable terminator with conditional branch.");
7512   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7513   CondBr->setSuccessor(0, nullptr);
7514   ReplaceInstWithInst(CurrentTerminator, CondBr);
7515 }
7516 
7517 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7518   assert(State.Instance && "Predicated instruction PHI works per instance.");
7519   Instruction *ScalarPredInst = cast<Instruction>(
7520       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7521   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7522   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7523   assert(PredicatingBB && "Predicated block has no single predecessor.");
7524 
7525   // By current pack/unpack logic we need to generate only a single phi node: if
7526   // a vector value for the predicated instruction exists at this point it means
7527   // the instruction has vector users only, and a phi for the vector value is
7528   // needed. In this case the recipe of the predicated instruction is marked to
7529   // also do that packing, thereby "hoisting" the insert-element sequence.
7530   // Otherwise, a phi node for the scalar value is needed.
7531   unsigned Part = State.Instance->Part;
7532   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7533     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7534     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7535     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7536     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7537     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7538     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7539   } else {
7540     Type *PredInstType = PredInst->getType();
7541     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7542     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7543     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7544     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7545   }
7546 }
7547 
7548 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
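  // Only stores have a value to store; pass nullptr for loads.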
7549   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7550   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7551                                         getMask());
7552 }
7553 
// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code-size, 2) predication compiler options, 3) loop hints
// forcing predication, and 4) a TTI hook that analyses whether the loop is
// suitable for predication.
7558 static ScalarEpilogueLowering getScalarEpilogueLowering(
7559     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7560     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7561     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7562     LoopVectorizationLegality &LVL) {
7563   bool OptSize =
7564       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7565                                                      PGSOQueryType::IRPass);
7566   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7567   // don't look at hints or options, and don't request a scalar epilogue.
7568   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7569     return CM_ScalarEpilogueNotAllowedOptSize;
7570 
7571   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7572                               !PreferPredicateOverEpilog;
7573 
7574   // 2) Next, if disabling predication is requested on the command line, honour
7575   // this and request a scalar epilogue.
7576   if (PredicateOptDisabled)
7577     return CM_ScalarEpilogueAllowed;
7578 
  // 3) and 4) Check if predication is requested on the command line or via a
  // loop hint, or if the TTI hook indicates it is profitable; if so, request
  // predication.
7582   if (PreferPredicateOverEpilog ||
7583       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7584       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7585                                         LVL.getLAI()) &&
7586        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7587     return CM_ScalarEpilogueNotNeededUsePredicate;
7588 
7589   return CM_ScalarEpilogueAllowed;
7590 }
7591 
7592 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
7594 // VPlan-to-VPlan transformations from the very beginning without modifying the
7595 // input LLVM IR.
7596 static bool processLoopInVPlanNativePath(
7597     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7598     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7599     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7600     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7601     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7602 
7603   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7604   Function *F = L->getHeader()->getParent();
7605   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7606 
7607   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7608       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7609 
7610   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7611                                 &Hints, IAI);
7612   // Use the planner for outer loop vectorization.
7613   // TODO: CM is not used at this point inside the planner. Turn CM into an
7614   // optional argument if we don't need it in the future.
7615   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7616 
7617   // Get user vectorization factor.
7618   const unsigned UserVF = Hints.getWidth();
7619 
7620   // Plan how to best vectorize, return the best VF and its cost.
7621   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7622 
7623   // If we are stress testing VPlan builds, do not attempt to generate vector
7624   // code. Masked vector code generation support will follow soon.
7625   // Also, do not attempt to vectorize if no vector code will be produced.
7626   if (VPlanBuildStressTest || EnableVPlanPredication ||
7627       VectorizationFactor::Disabled() == VF)
7628     return false;
7629 
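  // Only an interleave count of 1 is used on the VPlan-native path for now.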
7630   LVP.setBestPlan(VF.Width, 1);
7631 
7632   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7633                          &CM);
7634   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7635                     << L->getHeader()->getParent()->getName() << "\"\n");
7636   LVP.executePlan(LB, DT);
7637 
7638   // Mark the loop as already vectorized to avoid vectorizing again.
7639   Hints.setAlreadyVectorized();
7640 
7641   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7642   return true;
7643 }
7644 
7645 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
7646     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
7647                                !EnableLoopInterleaving),
7648       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
7649                               !EnableLoopVectorization) {}
7650 
7651 bool LoopVectorizePass::processLoop(Loop *L) {
7652   assert((EnableVPlanNativePath || L->empty()) &&
7653          "VPlan-native path is not enabled. Only process inner loops.");
7654 
7655 #ifndef NDEBUG
7656   const std::string DebugLocStr = getDebugLocString(L);
7657 #endif /* NDEBUG */
7658 
7659   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7660                     << L->getHeader()->getParent()->getName() << "\" from "
7661                     << DebugLocStr << "\n");
7662 
7663   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7664 
7665   LLVM_DEBUG(
7666       dbgs() << "LV: Loop hints:"
7667              << " force="
7668              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7669                      ? "disabled"
7670                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7671                             ? "enabled"
7672                             : "?"))
7673              << " width=" << Hints.getWidth()
7674              << " unroll=" << Hints.getInterleave() << "\n");
7675 
7676   // Function containing loop
7677   Function *F = L->getHeader()->getParent();
7678 
7679   // Looking at the diagnostic output is the only way to determine if a loop
7680   // was vectorized (other than looking at the IR or machine code), so it
7681   // is important to generate an optimization remark for each loop. Most of
7682   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7683   // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
7685   // benefit from vectorization, respectively.
7686 
7687   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7688     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7689     return false;
7690   }
7691 
7692   PredicatedScalarEvolution PSE(*SE, *L);
7693 
7694   // Check if it is legal to vectorize the loop.
7695   LoopVectorizationRequirements Requirements(*ORE);
7696   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7697                                 &Requirements, &Hints, DB, AC);
7698   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7699     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7700     Hints.emitRemarkWithHints();
7701     return false;
7702   }
7703 
7704   // Check the function attributes and profiles to find out if this function
7705   // should be optimized for size.
7706   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7707       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7708 
7709   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7710   // here. They may require CFG and instruction level transformations before
7711   // even evaluating whether vectorization is profitable. Since we cannot modify
7712   // the incoming IR, we need to build VPlan upfront in the vectorization
7713   // pipeline.
7714   if (!L->empty())
7715     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7716                                         ORE, BFI, PSI, Hints);
7717 
7718   assert(L->empty() && "Inner loop expected.");
7719 
7720   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7721   // count by optimizing for size, to minimize overheads.
7722   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7723   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7724     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7725                       << "This loop is worth vectorizing only if no scalar "
7726                       << "iteration overheads are incurred.");
7727     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7728       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7729     else {
7730       LLVM_DEBUG(dbgs() << "\n");
7731       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7732     }
7733   }
7734 
7735   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem correct -- what if the loop is
7737   // an integer loop and the vector instructions selected are purely integer
7738   // vector instructions?
7739   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7740     reportVectorizationFailure(
7741         "Can't vectorize when the NoImplicitFloat attribute is used",
7742         "loop not vectorized due to NoImplicitFloat attribute",
7743         "NoImplicitFloat", ORE, L);
7744     Hints.emitRemarkWithHints();
7745     return false;
7746   }
7747 
7748   // Check if the target supports potentially unsafe FP vectorization.
7749   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7750   // for the target we're vectorizing for, to make sure none of the
7751   // additional fp-math flags can help.
7752   if (Hints.isPotentiallyUnsafe() &&
7753       TTI->isFPVectorizationPotentiallyUnsafe()) {
7754     reportVectorizationFailure(
7755         "Potentially unsafe FP op prevents vectorization",
7756         "loop not vectorized due to unsafe FP support.",
7757         "UnsafeFP", ORE, L);
7758     Hints.emitRemarkWithHints();
7759     return false;
7760   }
7761 
7762   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7763   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7764 
7765   // If an override option has been passed in for interleaved accesses, use it.
7766   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7767     UseInterleaved = EnableInterleavedMemAccesses;
7768 
7769   // Analyze interleaved memory accesses.
7770   if (UseInterleaved) {
7771     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7772   }
7773 
7774   // Use the cost model.
7775   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7776                                 F, &Hints, IAI);
7777   CM.collectValuesToIgnore();
7778 
7779   // Use the planner for vectorization.
7780   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
7781 
7782   // Get user vectorization factor.
7783   unsigned UserVF = Hints.getWidth();
7784 
7785   // Plan how to best vectorize, return the best VF and its cost.
7786   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7787 
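  // plan() may fail to select a VF; in that case fall back to a disabled VF
  // and an interleave count of 1; diagnostics are emitted below.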
7788   VectorizationFactor VF = VectorizationFactor::Disabled();
7789   unsigned IC = 1;
7790   unsigned UserIC = Hints.getInterleave();
7791 
7792   if (MaybeVF) {
7793     VF = *MaybeVF;
7794     // Select the interleave count.
7795     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7796   }
7797 
7798   // Identify the diagnostic messages that should be produced.
7799   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7800   bool VectorizeLoop = true, InterleaveLoop = true;
7801   if (Requirements.doesNotMeet(F, L, Hints)) {
7802     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7803                          "requirements.\n");
7804     Hints.emitRemarkWithHints();
7805     return false;
7806   }
7807 
7808   if (VF.Width == 1) {
7809     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7810     VecDiagMsg = std::make_pair(
7811         "VectorizationNotBeneficial",
7812         "the cost-model indicates that vectorization is not beneficial");
7813     VectorizeLoop = false;
7814   }
7815 
7816   if (!MaybeVF && UserIC > 1) {
7817     // Tell the user interleaving was avoided up-front, despite being explicitly
7818     // requested.
7819     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7820                          "interleaving should be avoided up front\n");
7821     IntDiagMsg = std::make_pair(
7822         "InterleavingAvoided",
7823         "Ignoring UserIC, because interleaving was avoided up front");
7824     InterleaveLoop = false;
7825   } else if (IC == 1 && UserIC <= 1) {
7826     // Tell the user interleaving is not beneficial.
7827     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7828     IntDiagMsg = std::make_pair(
7829         "InterleavingNotBeneficial",
7830         "the cost-model indicates that interleaving is not beneficial");
7831     InterleaveLoop = false;
7832     if (UserIC == 1) {
7833       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7834       IntDiagMsg.second +=
7835           " and is explicitly disabled or interleave count is set to 1";
7836     }
7837   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but was explicitly disabled.
7839     LLVM_DEBUG(
7840         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7841     IntDiagMsg = std::make_pair(
7842         "InterleavingBeneficialButDisabled",
7843         "the cost-model indicates that interleaving is beneficial "
7844         "but is explicitly disabled or interleave count is set to 1");
7845     InterleaveLoop = false;
7846   }
7847 
7848   // Override IC if user provided an interleave count.
7849   IC = UserIC > 0 ? UserIC : IC;
7850 
7851   // Emit diagnostic messages, if any.
7852   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7853   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7855     ORE->emit([&]() {
7856       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7857                                       L->getStartLoc(), L->getHeader())
7858              << VecDiagMsg.second;
7859     });
7860     ORE->emit([&]() {
7861       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7862                                       L->getStartLoc(), L->getHeader())
7863              << IntDiagMsg.second;
7864     });
7865     return false;
7866   } else if (!VectorizeLoop && InterleaveLoop) {
7867     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7868     ORE->emit([&]() {
7869       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7870                                         L->getStartLoc(), L->getHeader())
7871              << VecDiagMsg.second;
7872     });
7873   } else if (VectorizeLoop && !InterleaveLoop) {
7874     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7875                       << ") in " << DebugLocStr << '\n');
7876     ORE->emit([&]() {
7877       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7878                                         L->getStartLoc(), L->getHeader())
7879              << IntDiagMsg.second;
7880     });
7881   } else if (VectorizeLoop && InterleaveLoop) {
7882     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7883                       << ") in " << DebugLocStr << '\n');
7884     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7885   }
7886 
7887   LVP.setBestPlan(VF.Width, IC);
7888 
7889   using namespace ore;
7890   bool DisableRuntimeUnroll = false;
7891   MDNode *OrigLoopID = L->getLoopID();
7892 
7893   if (!VectorizeLoop) {
7894     assert(IC > 1 && "interleave count should not be 1 or 0");
7895     // If we decided that it is not legal to vectorize the loop, then
7896     // interleave it.
7897     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7898                                &CM);
7899     LVP.executePlan(Unroller, DT);
7900 
7901     ORE->emit([&]() {
7902       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7903                                 L->getHeader())
7904              << "interleaved loop (interleaved count: "
7905              << NV("InterleaveCount", IC) << ")";
7906     });
7907   } else {
7908     // If we decided that it is *legal* to vectorize the loop, then do it.
7909     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7910                            &LVL, &CM);
7911     LVP.executePlan(LB, DT);
7912     ++LoopsVectorized;
7913 
    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks about strides and memory. A scalar loop that is
7916     // rarely used is not worth unrolling.
7917     if (!LB.areSafetyChecksAdded())
7918       DisableRuntimeUnroll = true;
7919 
7920     // Report the vectorization decision.
7921     ORE->emit([&]() {
7922       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7923                                 L->getHeader())
7924              << "vectorized loop (vectorization width: "
7925              << NV("VectorizationFactor", VF.Width)
7926              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7927     });
7928   }
7929 
7930   Optional<MDNode *> RemainderLoopID =
7931       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7932                                       LLVMLoopVectorizeFollowupEpilogue});
7933   if (RemainderLoopID.hasValue()) {
7934     L->setLoopID(RemainderLoopID.getValue());
7935   } else {
7936     if (DisableRuntimeUnroll)
7937       AddRuntimeUnrollDisableMetaData(L);
7938 
7939     // Mark the loop as already vectorized to avoid vectorizing again.
7940     Hints.setAlreadyVectorized();
7941   }
7942 
7943   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7944   return true;
7945 }
7946 
7947 bool LoopVectorizePass::runImpl(
7948     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7949     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7950     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7951     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7952     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7953   SE = &SE_;
7954   LI = &LI_;
7955   TTI = &TTI_;
7956   DT = &DT_;
7957   BFI = &BFI_;
7958   TLI = TLI_;
7959   AA = &AA_;
7960   AC = &AC_;
7961   GetLAA = &GetLAA_;
7962   DB = &DB_;
7963   ORE = &ORE_;
7964   PSI = PSI_;
7965 
7966   // Don't attempt if
7967   // 1. the target claims to have no vector registers, and
7968   // 2. interleaving won't help ILP.
7969   //
7970   // The second condition is necessary because, even if the target has no
7971   // vector registers, loop vectorization may still enable scalar
7972   // interleaving.
7973   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7974       TTI->getMaxInterleaveFactor(1) < 2)
7975     return false;
7976 
7977   bool Changed = false;
7978 
7979   // The vectorizer requires loops to be in simplified form.
7980   // Since simplification may add new inner loops, it has to run before the
7981   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7983   // vectorized.
7984   for (auto &L : *LI)
7985     Changed |=
7986         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7987 
7988   // Build up a worklist of inner-loops to vectorize. This is necessary as
7989   // the act of vectorizing or partially unrolling a loop creates new loops
7990   // and can invalidate iterators across the loops.
7991   SmallVector<Loop *, 8> Worklist;
7992 
7993   for (Loop *L : *LI)
7994     collectSupportedLoops(*L, LI, ORE, Worklist);
7995 
7996   LoopsAnalyzed += Worklist.size();
7997 
7998   // Now walk the identified inner loops.
7999   while (!Worklist.empty()) {
8000     Loop *L = Worklist.pop_back_val();
8001 
8002     // For the inner loops we actually process, form LCSSA to simplify the
8003     // transform.
8004     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8005 
8006     Changed |= processLoop(L);
8007   }
8008 
8009   // Process each loop nest in the function.
8010   return Changed;
8011 }
8012 
8013 PreservedAnalyses LoopVectorizePass::run(Function &F,
8014                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
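  // LoopAccessInfo is computed lazily, per loop, through the inner loop
  // analysis manager.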
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
8056 }
8057