1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
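//
// For example (illustrative pseudocode, not the exact output of this pass),
// a scalar loop such as
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 1;
// becomes, for a vectorization factor of 4,
//   for (int i = 0; i + 4 <= n; i += 4)
//     A[i:i+4] = B[i:i+4] + {1, 1, 1, 1};   // one wide iteration
// with any remaining iterations handled by a scalar epilogue loop.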
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
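
// These strings appear as the leading operand of attributes in a loop's
// !llvm.loop metadata, roughly (a sketch; see docs/TransformMetadata.rst for
// the authoritative format):
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.vectorize.followup_vectorized", !2}
// where !2 lists the attributes to attach to the vectorized loop produced by
// this pass.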
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
// Indicates that an epilogue is undesired; predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with a estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
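
// An illustrative 'opt' invocation combining these flags (the input file name
// is hypothetical):
//   opt -loop-vectorize -enable-vplan-native-path -vplan-build-stress-test
//       -vplan-verify-hcfg -S in.ll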
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function that returns the type of loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = VectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
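
// For example, under a typical DataLayout an i1 has an allocation size of one
// byte, so for VF = 8 the scalars occupy 8 bytes while the store size of
// <8 x i1> is a single byte; the check above therefore treats i1 as irregular.
// (Illustrative numbers; the exact sizes depend on the target's DataLayout.)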
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
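
// For example (an illustrative use; 'Builder', 'A' and 'B' are hypothetical):
//   Value *Sum = addFastMathFlag(Builder.CreateFAdd(A, B));
// marks the new fadd as 'fast'; non-FP values are returned unchanged.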
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors, or multiple
379 /// scalars. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
388 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM)
399       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
400         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
401         Builder(PSE.getSE()->getContext()),
402         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
403   virtual ~InnerLoopVectorizer() = default;
404 
405   /// Create a new empty loop. Unlink the old loop and connect the new one.
406   /// Return the pre-header block of the new loop.
407   BasicBlock *createVectorizedLoopSkeleton();
408 
409   /// Widen a single instruction within the innermost loop.
410   void widenInstruction(Instruction &I);
411 
412   /// Widen a single call instruction within the innermost loop.
413   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
414                             VPTransformState &State);
415 
416   /// Widen a single select instruction within the innermost loop.
417   void widenSelectInstruction(SelectInst &I, bool InvariantCond);
418 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
420   void fixVectorizedLoop();
421 
  /// Return true if any runtime check is added.
423   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
424 
425   /// A type for vectorized values in the new loop. Each value from the
426   /// original loop, when vectorized, is represented by UF vector values in the
427   /// new unrolled loop, where UF is the unroll factor.
428   using VectorParts = SmallVector<Value *, 2>;
429 
430   /// Vectorize a single GetElementPtrInst based on information gathered and
431   /// decisions taken during planning.
432   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
433                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
434 
435   /// Vectorize a single PHINode in a block. This method handles the induction
436   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
437   /// arbitrary length vectors.
438   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
439 
  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a scalar instance of the instruction for the unroll part
  /// and vector lane given by \p Instance.
444   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
445                             bool IfPredicateInstr);
446 
447   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
448   /// is provided, the integer induction variable will first be truncated to
449   /// the corresponding type.
450   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
451 
452   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
453   /// vector or scalar value on-demand if one is not yet available. When
454   /// vectorizing a loop, we visit the definition of an instruction before its
455   /// uses. When visiting the definition, we either vectorize or scalarize the
456   /// instruction, creating an entry for it in the corresponding map. (In some
457   /// cases, such as induction variables, we will create both vector and scalar
458   /// entries.) Then, as we encounter uses of the definition, we derive values
459   /// for each scalar or vector use unless such a value is already available.
460   /// For example, if we scalarize a definition and one of its uses is vector,
461   /// we build the required vector on-demand with an insertelement sequence
462   /// when visiting the use. Otherwise, if the use is scalar, we can use the
463   /// existing scalar definition.
464   ///
465   /// Return a value in the new loop corresponding to \p V from the original
466   /// loop at unroll index \p Part. If the value has already been vectorized,
467   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
468   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
469   /// a new vector value on-demand by inserting the scalar values into a vector
470   /// with an insertelement sequence. If the value has been neither vectorized
471   /// nor scalarized, it must be loop invariant, so we simply broadcast the
472   /// value into a vector.
473   Value *getOrCreateVectorValue(Value *V, unsigned Part);
474 
475   /// Return a value in the new loop corresponding to \p V from the original
476   /// loop at unroll and vector indices \p Instance. If the value has been
477   /// vectorized but not scalarized, the necessary extractelement instruction
478   /// will be generated.
479   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
480 
481   /// Construct the vector value of a scalarized value \p V one lane at a time.
482   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
483 
484   /// Try to vectorize the interleaved access group that \p Instr belongs to
485   /// with the base address given in \p Addr, optionally masking the vector
486   /// operations if \p BlockInMask is non-null. Use \p State to translate given
487   /// VPValues to IR values in the vectorized loop.
488   void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
489                                 VPValue *Addr, VPValue *BlockInMask = nullptr);
490 
491   /// Vectorize Load and Store instructions with the base address given in \p
492   /// Addr, optionally masking the vector operations if \p BlockInMask is
493   /// non-null. Use \p State to translate given VPValues to IR values in the
494   /// vectorized loop.
495   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
496                                   VPValue *Addr, VPValue *StoredValue,
497                                   VPValue *BlockInMask);
498 
499   /// Set the debug location in the builder using the debug location in
500   /// the instruction.
501   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
502 
503   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
504   void fixNonInductionPHIs(void);
505 
506 protected:
507   friend class LoopVectorizationPlanner;
508 
509   /// A small list of PHINodes.
510   using PhiVector = SmallVector<PHINode *, 4>;
511 
512   /// A type for scalarized values in the new loop. Each value from the
513   /// original loop, when scalarized, is represented by UF x VF scalar values
514   /// in the new unrolled loop, where UF is the unroll factor and VF is the
515   /// vectorization factor.
516   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
517 
518   /// Set up the values of the IVs correctly when exiting the vector loop.
519   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
520                     Value *CountRoundDown, Value *EndValue,
521                     BasicBlock *MiddleBlock);
522 
523   /// Create a new induction variable inside L.
524   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
525                                    Value *Step, Instruction *DL);
526 
527   /// Handle all cross-iteration phis in the header.
528   void fixCrossIterationPHIs();
529 
530   /// Fix a first-order recurrence. This is the second phase of vectorizing
531   /// this phi node.
532   void fixFirstOrderRecurrence(PHINode *Phi);
533 
534   /// Fix a reduction cross-iteration phi. This is the second phase of
535   /// vectorizing this phi node.
536   void fixReduction(PHINode *Phi);
537 
538   /// Clear NSW/NUW flags from reduction instructions if necessary.
539   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
540 
  /// The loop exit block may have single-value PHI nodes with some
  /// incoming value. While vectorizing we only handled real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
545   void fixLCSSAPHIs();
546 
547   /// Iteratively sink the scalarized operands of a predicated instruction into
548   /// the block that was created for it.
549   void sinkScalarOperands(Instruction *PredInst);
550 
551   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
552   /// represented as.
553   void truncateToMinimalBitwidths();
554 
555   /// Create a broadcast instruction. This method generates a broadcast
556   /// instruction (shuffle) for loop invariant values and for the induction
557   /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
559   /// element.
560   virtual Value *getBroadcastInstrs(Value *V);
561 
562   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
564   /// \p Opcode is relevant for FP induction variable.
565   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
566                                Instruction::BinaryOps Opcode =
567                                Instruction::BinaryOpsEnd);
568 
569   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
570   /// variable on which to base the steps, \p Step is the size of the step, and
571   /// \p EntryVal is the value from the original loop that maps to the steps.
572   /// Note that \p EntryVal doesn't have to be an induction variable - it
573   /// can also be a truncate instruction.
574   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
575                         const InductionDescriptor &ID);
576 
577   /// Create a vector induction phi node based on an existing scalar one. \p
578   /// EntryVal is the value from the original loop that maps to the vector phi
579   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
580   /// truncate instruction, instead of widening the original IV, we widen a
581   /// version of the IV truncated to \p EntryVal's type.
582   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
583                                        Value *Step, Instruction *EntryVal);
584 
585   /// Returns true if an instruction \p I should be scalarized instead of
586   /// vectorized for the chosen vectorization factor.
587   bool shouldScalarizeInstruction(Instruction *I) const;
588 
589   /// Returns true if we should generate a scalar version of \p IV.
590   bool needsScalarInduction(Instruction *IV) const;
591 
592   /// If there is a cast involved in the induction variable \p ID, which should
593   /// be ignored in the vectorized loop body, this function records the
594   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
595   /// cast. We had already proved that the casted Phi is equal to the uncasted
596   /// Phi in the vectorized loop (under a runtime guard), and therefore
597   /// there is no need to vectorize the cast - the same value can be used in the
598   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
601   ///
602   /// \p EntryVal is the value from the original loop that maps to the vector
603   /// phi node and is used to distinguish what is the IV currently being
604   /// processed - original one (if \p EntryVal is a phi corresponding to the
605   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
607   /// latter case \p EntryVal is a TruncInst and we must not record anything for
608   /// that IV, but it's error-prone to expect callers of this routine to care
609   /// about that, hence this explicit parameter.
610   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
611                                              const Instruction *EntryVal,
612                                              Value *VectorLoopValue,
613                                              unsigned Part,
614                                              unsigned Lane = UINT_MAX);
615 
616   /// Generate a shuffle sequence that will reverse the vector Vec.
617   virtual Value *reverseVector(Value *Vec);
618 
619   /// Returns (and creates if needed) the original loop trip count.
620   Value *getOrCreateTripCount(Loop *NewLoop);
621 
622   /// Returns (and creates if needed) the trip count of the widened loop.
623   Value *getOrCreateVectorTripCount(Loop *NewLoop);
624 
625   /// Returns a bitcasted value to the requested vector type.
626   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
627   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
628                                 const DataLayout &DL);
629 
630   /// Emit a bypass check to see if the vector trip count is zero, including if
631   /// it overflows.
632   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
633 
634   /// Emit a bypass check to see if all of the SCEV assumptions we've
635   /// had to make are correct.
636   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
637 
638   /// Emit bypass checks to check any memory assumptions we may have made.
639   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
640 
641   /// Compute the transformed value of Index at offset StartValue using step
642   /// StepValue.
643   /// For integer induction, returns StartValue + Index * StepValue.
644   /// For pointer induction, returns StartValue[Index * StepValue].
645   /// FIXME: The newly created binary instructions should contain nsw/nuw
646   /// flags, which can be found from the original scalar operations.
647   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
648                               const DataLayout &DL,
649                               const InductionDescriptor &ID) const;
650 
651   /// Add additional metadata to \p To that was not present on \p Orig.
652   ///
653   /// Currently this is used to add the noalias annotations based on the
654   /// inserted memchecks.  Use this for instructions that are *cloned* into the
655   /// vector loop.
656   void addNewMetadata(Instruction *To, const Instruction *Orig);
657 
658   /// Add metadata from one instruction to another.
659   ///
660   /// This includes both the original MDs from \p From and additional ones (\see
661   /// addNewMetadata).  Use this for *newly created* instructions in the vector
662   /// loop.
663   void addMetadata(Instruction *To, Instruction *From);
664 
665   /// Similar to the previous function but it adds the metadata to a
666   /// vector of instructions.
667   void addMetadata(ArrayRef<Value *> To, Instruction *From);
668 
669   /// The original loop.
670   Loop *OrigLoop;
671 
672   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
673   /// dynamic knowledge to simplify SCEV expressions and converts them to a
674   /// more usable form.
675   PredicatedScalarEvolution &PSE;
676 
677   /// Loop Info.
678   LoopInfo *LI;
679 
680   /// Dominator Tree.
681   DominatorTree *DT;
682 
683   /// Alias Analysis.
684   AliasAnalysis *AA;
685 
686   /// Target Library Info.
687   const TargetLibraryInfo *TLI;
688 
689   /// Target Transform Info.
690   const TargetTransformInfo *TTI;
691 
692   /// Assumption Cache.
693   AssumptionCache *AC;
694 
695   /// Interface to emit optimization remarks.
696   OptimizationRemarkEmitter *ORE;
697 
698   /// LoopVersioning.  It's only set up (non-null) if memchecks were
699   /// used.
700   ///
701   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
703   std::unique_ptr<LoopVersioning> LVer;
704 
705   /// The vectorization SIMD factor to use. Each vector will have this many
706   /// vector elements.
707   unsigned VF;
708 
709   /// The vectorization unroll factor to use. Each scalar is vectorized to this
710   /// many different vector instructions.
711   unsigned UF;
712 
  /// The builder that we use.
714   IRBuilder<> Builder;
715 
716   // --- Vectorization state ---
717 
718   /// The vector-loop preheader.
719   BasicBlock *LoopVectorPreHeader;
720 
721   /// The scalar-loop preheader.
722   BasicBlock *LoopScalarPreHeader;
723 
  /// Middle Block between the vector and the scalar loops.
725   BasicBlock *LoopMiddleBlock;
726 
727   /// The ExitBlock of the scalar loop.
728   BasicBlock *LoopExitBlock;
729 
730   /// The vector loop body.
731   BasicBlock *LoopVectorBody;
732 
733   /// The scalar loop body.
734   BasicBlock *LoopScalarBody;
735 
736   /// A list of all bypass blocks. The first block is the entry of the loop.
737   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
738 
739   /// The new Induction variable which was added to the new block.
740   PHINode *Induction = nullptr;
741 
742   /// The induction variable of the old basic block.
743   PHINode *OldInduction = nullptr;
744 
745   /// Maps values from the original loop to their corresponding values in the
746   /// vectorized loop. A key value can map to either vector values, scalar
747   /// values or both kinds of values, depending on whether the key was
748   /// vectorized and scalarized.
749   VectorizerValueMap VectorLoopValueMap;
750 
751   /// Store instructions that were predicated.
752   SmallVector<Instruction *, 4> PredicatedInstructions;
753 
754   /// Trip count of the original loop.
755   Value *TripCount = nullptr;
756 
757   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
758   Value *VectorTripCount = nullptr;
759 
760   /// The legality analysis.
761   LoopVectorizationLegality *Legal;
762 
  /// The profitability analysis.
764   LoopVectorizationCostModel *Cost;
765 
766   // Record whether runtime checks are added.
767   bool AddedSafetyChecks = false;
768 
769   // Holds the end values for each induction variable. We save the end values
770   // so we can later fix-up the external users of the induction variables.
771   DenseMap<PHINode *, Value *> IVEndValues;
772 
773   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
774   // fixed up at the end of vector code generation.
775   SmallVector<PHINode *, 8> OrigPHIsToFix;
776 };
777 
778 class InnerLoopUnroller : public InnerLoopVectorizer {
779 public:
780   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
781                     LoopInfo *LI, DominatorTree *DT,
782                     const TargetLibraryInfo *TLI,
783                     const TargetTransformInfo *TTI, AssumptionCache *AC,
784                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
785                     LoopVectorizationLegality *LVL,
786                     LoopVectorizationCostModel *CM)
787       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
788                             UnrollFactor, LVL, CM) {}
789 
790 private:
791   Value *getBroadcastInstrs(Value *V) override;
792   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
793                        Instruction::BinaryOps Opcode =
794                        Instruction::BinaryOpsEnd) override;
795   Value *reverseVector(Value *Vec) override;
796 };
797 
798 } // end namespace llvm
799 
/// Look for a meaningful debug location on the instruction or its
/// operands.
802 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
803   if (!I)
804     return I;
805 
806   DebugLoc Empty;
807   if (I->getDebugLoc() != Empty)
808     return I;
809 
  for (Value *Op : I->operands())
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
815 
816   return I;
817 }
818 
819 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
820   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
821     const DILocation *DIL = Inst->getDebugLoc();
822     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
823         !isa<DbgInfoIntrinsic>(Inst)) {
824       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
825       if (NewDIL)
826         B.SetCurrentDebugLocation(NewDIL.getValue());
827       else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine()
                   << '\n');
    } else
      B.SetCurrentDebugLocation(DIL);
834   } else
835     B.SetCurrentDebugLocation(DebugLoc());
836 }
837 
838 /// Write a record \p DebugMsg about vectorization failure to the debug
839 /// output stream. If \p I is passed, it is an instruction that prevents
840 /// vectorization.
841 #ifndef NDEBUG
842 static void debugVectorizationFailure(const StringRef DebugMsg,
843     Instruction *I) {
844   dbgs() << "LV: Not vectorizing: " << DebugMsg;
845   if (I != nullptr)
846     dbgs() << " " << *I;
847   else
848     dbgs() << '.';
849   dbgs() << '\n';
850 }
851 #endif
852 
853 /// Create an analysis remark that explains why vectorization failed
854 ///
855 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
856 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
857 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
858 /// the location of the remark.  \return the remark object that can be
859 /// streamed to.
860 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
861     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
862   Value *CodeRegion = TheLoop->getHeader();
863   DebugLoc DL = TheLoop->getStartLoc();
864 
865   if (I) {
866     CodeRegion = I->getParent();
867     // If there is no debug location attached to the instruction, revert back to
868     // using the loop's.
869     if (I->getDebugLoc())
870       DL = I->getDebugLoc();
871   }
872 
873   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
874   R << "loop not vectorized: ";
875   return R;
876 }
877 
878 namespace llvm {
879 
880 void reportVectorizationFailure(const StringRef DebugMsg,
881     const StringRef OREMsg, const StringRef ORETag,
882     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
883   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
884   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
885   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
886                 ORETag, TheLoop, I) << OREMsg);
887 }
888 
889 } // end namespace llvm
890 
891 #ifndef NDEBUG
892 /// \return string containing a file name and a line # for the given loop.
893 static std::string getDebugLocString(const Loop *L) {
894   std::string Result;
895   if (L) {
896     raw_string_ostream OS(Result);
897     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
898       LoopDbgLoc.print(OS);
899     else
900       // Just print the module name.
901       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
902     OS.flush();
903   }
904   return Result;
905 }
906 #endif
907 
908 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
909                                          const Instruction *Orig) {
910   // If the loop was versioned with memchecks, add the corresponding no-alias
911   // metadata.
912   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
913     LVer->annotateInstWithNoAlias(To, Orig);
914 }
915 
916 void InnerLoopVectorizer::addMetadata(Instruction *To,
917                                       Instruction *From) {
918   propagateMetadata(To, From);
919   addNewMetadata(To, From);
920 }
921 
922 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
923                                       Instruction *From) {
924   for (Value *V : To) {
925     if (Instruction *I = dyn_cast<Instruction>(V))
926       addMetadata(I, From);
927   }
928 }
929 
930 namespace llvm {
931 
// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
934 enum ScalarEpilogueLowering {
935 
936   // The default: allowing scalar epilogues.
937   CM_ScalarEpilogueAllowed,
938 
939   // Vectorization with OptForSize: don't allow epilogues.
940   CM_ScalarEpilogueNotAllowedOptSize,
941 
  // A special case of vectorization with OptForSize: loops with a very small
943   // trip count are considered for vectorization under OptForSize, thereby
944   // making sure the cost of their loop body is dominant, free of runtime
945   // guards and scalar iteration overheads.
946   CM_ScalarEpilogueNotAllowedLowTripLoop,
947 
948   // Loop hint predicate indicating an epilogue is undesired.
949   CM_ScalarEpilogueNotNeededUsePredicate
950 };
951 
952 /// LoopVectorizationCostModel - estimates the expected speedups due to
953 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
956 /// expected speedup/slowdowns due to the supported instruction set. We use the
957 /// TargetTransformInfo to query the different backends for the cost of
958 /// different operations.
959 class LoopVectorizationCostModel {
960 public:
961   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
962                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
963                              LoopVectorizationLegality *Legal,
964                              const TargetTransformInfo &TTI,
965                              const TargetLibraryInfo *TLI, DemandedBits *DB,
966                              AssumptionCache *AC,
967                              OptimizationRemarkEmitter *ORE, const Function *F,
968                              const LoopVectorizeHints *Hints,
969                              InterleavedAccessInfo &IAI)
970       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
971         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
972         Hints(Hints), InterleaveInfo(IAI) {}
973 
974   /// \return An upper bound for the vectorization factor, or None if
975   /// vectorization and interleaving should be avoided up front.
976   Optional<unsigned> computeMaxVF();
977 
978   /// \return True if runtime checks are required for vectorization, and false
979   /// otherwise.
980   bool runtimeChecksRequired();
981 
982   /// \return The most profitable vectorization factor and the cost of that VF.
983   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
984   /// then this vectorization factor will be selected if vectorization is
985   /// possible.
986   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
987 
988   /// Setup cost-based decisions for user vectorization factor.
989   void selectUserVectorizationFactor(unsigned UserVF) {
990     collectUniformsAndScalars(UserVF);
991     collectInstsToScalarize(UserVF);
992   }
993 
994   /// \return The size (in bits) of the smallest and widest types in the code
995   /// that needs to be vectorized. We ignore values that remain scalar such as
996   /// 64 bit loop indices.
997   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
998 
999   /// \return The desired interleave count.
1000   /// If interleave count has been specified by metadata it will be returned.
1001   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1002   /// are the selected vectorization factor and the cost of the selected VF.
1003   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1004 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
1011   /// avoid redundant calculations.
1012   void setCostBasedWideningDecision(unsigned VF);
1013 
1014   /// A struct that represents some properties of the register usage
1015   /// of a loop.
1016   struct RegisterUsage {
1017     /// Holds the number of loop invariant values that are used in the loop.
1018     /// The key is ClassID of target-provided register class.
1019     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1020     /// Holds the maximum number of concurrent live intervals in the loop.
1021     /// The key is ClassID of target-provided register class.
1022     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1023   };
1024 
1025   /// \return Returns information about the register usages of the loop for the
1026   /// given vectorization factors.
1027   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1028 
1029   /// Collect values we want to ignore in the cost model.
1030   void collectValuesToIgnore();
1031 
1032   /// \returns The smallest bitwidth each instruction can be represented with.
1033   /// The vector equivalents of these instructions should be truncated to this
1034   /// type.
1035   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1036     return MinBWs;
1037   }
1038 
1039   /// \returns True if it is more profitable to scalarize instruction \p I for
1040   /// vectorization factor \p VF.
1041   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1042     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1043 
1044     // Cost model is not run in the VPlan-native path - return conservative
1045     // result until this changes.
1046     if (EnableVPlanNativePath)
1047       return false;
1048 
1049     auto Scalars = InstsToScalarize.find(VF);
1050     assert(Scalars != InstsToScalarize.end() &&
1051            "VF not yet analyzed for scalarization profitability");
1052     return Scalars->second.find(I) != Scalars->second.end();
1053   }
1054 
1055   /// Returns true if \p I is known to be uniform after vectorization.
1056   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1057     if (VF == 1)
1058       return true;
1059 
1060     // Cost model is not run in the VPlan-native path - return conservative
1061     // result until this changes.
1062     if (EnableVPlanNativePath)
1063       return false;
1064 
1065     auto UniformsPerVF = Uniforms.find(VF);
1066     assert(UniformsPerVF != Uniforms.end() &&
1067            "VF not yet analyzed for uniformity");
1068     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1069   }
1070 
1071   /// Returns true if \p I is known to be scalar after vectorization.
1072   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1073     if (VF == 1)
1074       return true;
1075 
1076     // Cost model is not run in the VPlan-native path - return conservative
1077     // result until this changes.
1078     if (EnableVPlanNativePath)
1079       return false;
1080 
1081     auto ScalarsPerVF = Scalars.find(VF);
1082     assert(ScalarsPerVF != Scalars.end() &&
1083            "Scalar values are not calculated for VF");
1084     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1085   }
1086 
1087   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1088   /// for vectorization factor \p VF.
1089   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1090     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1091            !isProfitableToScalarize(I, VF) &&
1092            !isScalarAfterVectorization(I, VF);
1093   }
1094 
1095   /// Decision that was taken during cost calculation for memory instruction.
1096   enum InstWidening {
1097     CM_Unknown,
1098     CM_Widen,         // For consecutive accesses with stride +1.
1099     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1100     CM_Interleave,
1101     CM_GatherScatter,
1102     CM_Scalarize
1103   };
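
  // Illustrative mapping (the real decision is cost-based and target
  // dependent): a unit-stride access such as A[i] is a candidate for CM_Widen,
  // the same access with a decreasing index for CM_Widen_Reverse, a member of
  // an interleave group for CM_Interleave, an indexed access such as A[B[i]]
  // for CM_GatherScatter when the target supports it, and anything else falls
  // back to CM_Scalarize.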
1104 
1105   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1106   /// instruction \p I and vector width \p VF.
1107   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1108                            unsigned Cost) {
1109     assert(VF >= 2 && "Expected VF >=2");
1110     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1111   }
1112 
1113   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1114   /// interleaving group \p Grp and vector width \p VF.
1115   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1116                            InstWidening W, unsigned Cost) {
1117     assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1120     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1121       if (auto *I = Grp->getMember(i)) {
1122         if (Grp->getInsertPos() == I)
1123           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1124         else
1125           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1126       }
1127     }
1128   }
1129 
1130   /// Return the cost model decision for the given instruction \p I and vector
1131   /// width \p VF. Return CM_Unknown if this instruction did not pass
1132   /// through the cost modeling.
1133   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1134     assert(VF >= 2 && "Expected VF >=2");
1135 
1136     // Cost model is not run in the VPlan-native path - return conservative
1137     // result until this changes.
1138     if (EnableVPlanNativePath)
1139       return CM_GatherScatter;
1140 
1141     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1142     auto Itr = WideningDecisions.find(InstOnVF);
1143     if (Itr == WideningDecisions.end())
1144       return CM_Unknown;
1145     return Itr->second.first;
1146   }
1147 
1148   /// Return the vectorization cost for the given instruction \p I and vector
1149   /// width \p VF.
1150   unsigned getWideningCost(Instruction *I, unsigned VF) {
1151     assert(VF >= 2 && "Expected VF >=2");
1152     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1153     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1154            "The cost is not calculated");
1155     return WideningDecisions[InstOnVF].second;
1156   }
1157 
1158   /// Return True if instruction \p I is an optimizable truncate whose operand
1159   /// is an induction variable. Such a truncate will be removed by adding a new
1160   /// induction variable with the destination type.
1161   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1162     // If the instruction is not a truncate, return false.
1163     auto *Trunc = dyn_cast<TruncInst>(I);
1164     if (!Trunc)
1165       return false;
1166 
1167     // Get the source and destination types of the truncate.
1168     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1169     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1170 
1171     // If the truncate is free for the given types, return false. Replacing a
1172     // free truncate with an induction variable would add an induction variable
1173     // update instruction to each iteration of the loop. We exclude from this
1174     // check the primary induction variable since it will need an update
1175     // instruction regardless.
1176     Value *Op = Trunc->getOperand(0);
1177     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1178       return false;
1179 
1180     // If the truncated value is not an induction variable, return false.
1181     return Legal->isInductionPhi(Op);
1182   }
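
  // For example (illustrative IR): given an induction variable %iv of type i64
  // and a use inside the loop such as
  //   %t = trunc i64 %iv to i32
  // the truncate can be removed by introducing an additional i32 induction
  // variable that produces %t's values directly, subject to the freeness and
  // induction checks above.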
1183 
1184   /// Collects the instructions to scalarize for each predicated instruction in
1185   /// the loop.
1186   void collectInstsToScalarize(unsigned VF);
1187 
1188   /// Collect Uniform and Scalar values for the given \p VF.
1189   /// The sets depend on CM decision for Load/Store instructions
1190   /// that may be vectorized as interleave, gather-scatter or scalarized.
1191   void collectUniformsAndScalars(unsigned VF) {
1192     // Do the analysis once.
1193     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1194       return;
1195     setCostBasedWideningDecision(VF);
1196     collectLoopUniforms(VF);
1197     collectLoopScalars(VF);
1198   }
1199 
1200   /// Returns true if the target machine supports masked store operation
1201   /// for the given \p DataType and kind of access to \p Ptr.
1202   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1203     return Legal->isConsecutivePtr(Ptr) &&
1204            TTI.isLegalMaskedStore(DataType, Alignment);
1205   }
1206 
1207   /// Returns true if the target machine supports masked load operation
1208   /// for the given \p DataType and kind of access to \p Ptr.
1209   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1210     return Legal->isConsecutivePtr(Ptr) &&
1211            TTI.isLegalMaskedLoad(DataType, Alignment);
1212   }
1213 
1214   /// Returns true if the target machine supports masked scatter operation
1215   /// for the given \p DataType.
1216   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1217     return TTI.isLegalMaskedScatter(DataType, Alignment);
1218   }
1219 
1220   /// Returns true if the target machine supports masked gather operation
1221   /// for the given \p DataType.
1222   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1223     return TTI.isLegalMaskedGather(DataType, Alignment);
1224   }
1225 
1226   /// Returns true if the target machine can represent \p V as a masked gather
1227   /// or scatter operation.
1228   bool isLegalGatherOrScatter(Value *V) {
1229     bool LI = isa<LoadInst>(V);
1230     bool SI = isa<StoreInst>(V);
1231     if (!LI && !SI)
1232       return false;
1233     auto *Ty = getMemInstValueType(V);
1234     MaybeAlign Align = getLoadStoreAlignment(V);
1235     return (LI && isLegalMaskedGather(Ty, Align)) ||
1236            (SI && isLegalMaskedScatter(Ty, Align));
1237   }
1238 
1239   /// Returns true if \p I is an instruction that will be scalarized with
1240   /// predication. Such instructions include conditional stores and
1241   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
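  /// For example (a hypothetical sketch), in
  ///   for (i = 0; i < n; ++i)
  ///     if (c[i] != 0)
  ///       a[i] = b[i] / c[i];
  /// the division must not execute for masked-off lanes (it could trap), so
  /// it is scalarized and emitted under a per-lane predicate.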
1244   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1245 
1246   // Returns true if \p I is an instruction that will be predicated either
1247   // through scalar predication or masked load/store or masked gather/scatter.
1248   // Superset of instructions that return true for isScalarWithPredication.
1249   bool isPredicatedInst(Instruction *I) {
1250     if (!blockNeedsPredication(I->getParent()))
1251       return false;
1252     // Loads and stores that need some form of masked operation are predicated
1253     // instructions.
1254     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1255       return Legal->isMaskRequired(I);
1256     return isScalarWithPredication(I);
1257   }
1258 
1259   /// Returns true if \p I is a memory instruction with consecutive memory
1260   /// access that can be widened.
1261   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1262 
1263   /// Returns true if \p I is a memory instruction in an interleaved-group
1264   /// of memory accesses that can be vectorized with wide vector loads/stores
1265   /// and shuffles.
1266   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1267 
1268   /// Check if \p Instr belongs to any interleaved access group.
1269   bool isAccessInterleaved(Instruction *Instr) {
1270     return InterleaveInfo.isInterleaved(Instr);
1271   }
1272 
1273   /// Get the interleaved access group that \p Instr belongs to.
1274   const InterleaveGroup<Instruction> *
1275   getInterleavedAccessGroup(Instruction *Instr) {
1276     return InterleaveInfo.getInterleaveGroup(Instr);
1277   }
1278 
1279   /// Returns true if an interleaved group requires a scalar iteration
1280   /// to handle accesses with gaps, and there is nothing preventing us from
1281   /// creating a scalar epilogue.
1282   bool requiresScalarEpilogue() const {
1283     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1284   }
1285 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1288   bool isScalarEpilogueAllowed() const {
1289     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1290   }
1291 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1293   bool foldTailByMasking() const { return FoldTailByMasking; }
1294 
1295   bool blockNeedsPredication(BasicBlock *BB) {
1296     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1297   }
1298 
1299   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1300   /// with factor VF.  Return the cost of the instruction, including
1301   /// scalarization overhead if it's needed.
1302   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1303 
1304   /// Estimate cost of a call instruction CI if it were vectorized with factor
1305   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1309   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1310 
1311 private:
1312   unsigned NumPredStores = 0;
1313 
1314   /// \return An upper bound for the vectorization factor, larger than zero.
1315   /// One is returned if vectorization should best be avoided due to cost.
1316   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1317 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1325   using VectorizationCostTy = std::pair<unsigned, bool>;
1326 
1327   /// Returns the expected execution cost. The unit of the cost does
1328   /// not matter because we use the 'cost' units to compare different
1329   /// vector widths. The cost that is returned is *not* normalized by
1330   /// the factor width.
1331   VectorizationCostTy expectedCost(unsigned VF);
1332 
1333   /// Returns the execution time cost of an instruction for a given vector
1334   /// width. Vector width of one means scalar.
1335   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1336 
1337   /// The cost-computation logic from getInstructionCost which provides
1338   /// the vector type as an output parameter.
1339   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1340 
1341   /// Calculate vectorization cost of memory instruction \p I.
1342   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1343 
1344   /// The cost computation for scalarized memory instruction.
1345   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1346 
1347   /// The cost computation for interleaving group of memory instructions.
1348   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1349 
1350   /// The cost computation for Gather/Scatter instruction.
1351   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1352 
1353   /// The cost computation for widening instruction \p I with consecutive
1354   /// memory access.
1355   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1356 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (extract of the last element, unless the stored
  /// value is loop invariant).
1361   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1362 
1363   /// Estimate the overhead of scalarizing an instruction. This is a
1364   /// convenience wrapper for the type-based getScalarizationOverhead API.
1365   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1366 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1369   bool isConsecutiveLoadOrStore(Instruction *I);
1370 
1371   /// Returns true if an artificially high cost for emulated masked memrefs
1372   /// should be used.
1373   bool useEmulatedMaskMemRefHack(Instruction *I);
1374 
1375   /// Map of scalar integer values to the smallest bitwidth they can be legally
1376   /// represented as. The vector equivalents of these values should be truncated
1377   /// to this type.
1378   MapVector<Instruction *, uint64_t> MinBWs;
1379 
1380   /// A type representing the costs for instructions if they were to be
1381   /// scalarized rather than vectorized. The entries are Instruction-Cost
1382   /// pairs.
1383   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1384 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1387   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1388 
1389   /// Records whether it is allowed to have the original scalar loop execute at
1390   /// least once. This may be needed as a fallback loop in case runtime
1391   /// aliasing/dependence checks fail, or to handle the tail/remainder
1392   /// iterations when the trip count is unknown or doesn't divide by the VF,
1393   /// or as a peel-loop to handle gaps in interleave-groups.
1394   /// Under optsize and when the trip count is very small we don't allow any
1395   /// iterations to execute in the scalar loop.
1396   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1397 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1399   bool FoldTailByMasking = false;
1400 
1401   /// A map holding scalar costs for different vectorization factors. The
1402   /// presence of a cost for an instruction in the mapping indicates that the
1403   /// instruction will be scalarized when vectorizing with the associated
1404   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1405   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1406 
1407   /// Holds the instructions known to be uniform after vectorization.
1408   /// The data is collected per VF.
1409   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1410 
1411   /// Holds the instructions known to be scalar after vectorization.
1412   /// The data is collected per VF.
1413   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1414 
1415   /// Holds the instructions (address computations) that are forced to be
1416   /// scalarized.
1417   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1418 
1419   /// Returns the expected difference in cost from scalarizing the expression
1420   /// feeding a predicated instruction \p PredInst. The instructions to
1421   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1422   /// non-negative return value implies the expression will be scalarized.
1423   /// Currently, only single-use chains are considered for scalarization.
1424   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1425                               unsigned VF);
1426 
1427   /// Collect the instructions that are uniform after vectorization. An
1428   /// instruction is uniform if we represent it with a single scalar value in
1429   /// the vectorized loop corresponding to each vector iteration. Examples of
1430   /// uniform instructions include pointer operands of consecutive or
1431   /// interleaved memory accesses. Note that although uniformity implies an
1432   /// instruction will be scalar, the reverse is not true. In general, a
1433   /// scalarized instruction will be represented by VF scalar values in the
1434   /// vectorized loop, each corresponding to an iteration of the original
1435   /// scalar loop.
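  /// For example (an illustrative sketch), for a consecutive access
  ///   %gep = getelementptr inbounds i32, i32* %a, i64 %iv
  ///   %val = load i32, i32* %gep
  /// the pointer %gep is uniform: only one scalar copy per vector iteration
  /// is needed to form the address of the wide load, whereas %val is widened.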
1436   void collectLoopUniforms(unsigned VF);
1437 
1438   /// Collect the instructions that are scalar after vectorization. An
1439   /// instruction is scalar if it is known to be uniform or will be scalarized
1440   /// during vectorization. Non-uniform scalarized instructions will be
1441   /// represented by VF values in the vectorized loop, each corresponding to an
1442   /// iteration of the original scalar loop.
1443   void collectLoopScalars(unsigned VF);
1444 
  /// Keeps the cost model's vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
1447   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1448                                 std::pair<InstWidening, unsigned>>;
1449 
1450   DecisionList WideningDecisions;
1451 
1452   /// Returns true if \p V is expected to be vectorized and it needs to be
1453   /// extracted.
1454   bool needsExtract(Value *V, unsigned VF) const {
1455     Instruction *I = dyn_cast<Instruction>(V);
1456     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1457       return false;
1458 
1459     // Assume we can vectorize V (and hence we need extraction) if the
1460     // scalars are not computed yet. This can happen, because it is called
1461     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1462     // the scalars are collected. That should be a safe assumption in most
1463     // cases, because we check if the operands have vectorizable types
1464     // beforehand in LoopVectorizationLegality.
1465     return Scalars.find(VF) == Scalars.end() ||
1466            !isScalarAfterVectorization(I, VF);
1467   };
1468 
1469   /// Returns a range containing only operands needing to be extracted.
1470   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1471                                                    unsigned VF) {
1472     return SmallVector<Value *, 4>(make_filter_range(
1473         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1474   }
1475 
1476 public:
1477   /// The loop that we evaluate.
1478   Loop *TheLoop;
1479 
1480   /// Predicated scalar evolution analysis.
1481   PredicatedScalarEvolution &PSE;
1482 
1483   /// Loop Info analysis.
1484   LoopInfo *LI;
1485 
1486   /// Vectorization legality.
1487   LoopVectorizationLegality *Legal;
1488 
1489   /// Vector target information.
1490   const TargetTransformInfo &TTI;
1491 
1492   /// Target Library Info.
1493   const TargetLibraryInfo *TLI;
1494 
1495   /// Demanded bits analysis.
1496   DemandedBits *DB;
1497 
1498   /// Assumption cache.
1499   AssumptionCache *AC;
1500 
1501   /// Interface to emit optimization remarks.
1502   OptimizationRemarkEmitter *ORE;
1503 
1504   const Function *TheFunction;
1505 
1506   /// Loop Vectorize Hint.
1507   const LoopVectorizeHints *Hints;
1508 
  /// The interleaved access information, which contains groups of interleaved
  /// accesses with the same stride that are close to each other.
1511   InterleavedAccessInfo &InterleaveInfo;
1512 
1513   /// Values to ignore in the cost model.
1514   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1515 
1516   /// Values to ignore in the cost model when VF > 1.
1517   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1518 };
1519 
1520 } // end namespace llvm
1521 
1522 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1523 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1529 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1530 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1531 // provides *explicit vectorization hints* (LV can bypass legal checks and
1532 // assume that vectorization is legal). However, both hints are implemented
1533 // using the same metadata (llvm.loop.vectorize, processed by
1534 // LoopVectorizeHints). This will be fixed in the future when the native IR
1535 // representation for pragma 'omp simd' is introduced.
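//
// For example (an illustrative sketch), an outer loop annotated for explicit
// vectorization could look like:
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)        // outer loop considered here
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[j];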
1536 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1537                                    OptimizationRemarkEmitter *ORE) {
1538   assert(!OuterLp->empty() && "This is not an outer loop");
1539   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1540 
1541   // Only outer loops with an explicit vectorization hint are supported.
1542   // Unannotated outer loops are ignored.
1543   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1544     return false;
1545 
1546   Function *Fn = OuterLp->getHeader()->getParent();
1547   if (!Hints.allowVectorization(Fn, OuterLp,
1548                                 true /*VectorizeOnlyWhenForced*/)) {
1549     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1550     return false;
1551   }
1552 
1553   if (Hints.getInterleave() > 1) {
1554     // TODO: Interleave support is future work.
1555     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1556                          "outer loops.\n");
1557     Hints.emitRemarkWithHints();
1558     return false;
1559   }
1560 
1561   return true;
1562 }
1563 
1564 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1565                                   OptimizationRemarkEmitter *ORE,
1566                                   SmallVectorImpl<Loop *> &V) {
1567   // Collect inner loops and outer loops without irreducible control flow. For
1568   // now, only collect outer loops that have explicit vectorization hints. If we
1569   // are stress testing the VPlan H-CFG construction, we collect the outermost
1570   // loop of every loop nest.
1571   if (L.empty() || VPlanBuildStressTest ||
1572       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1573     LoopBlocksRPO RPOT(&L);
1574     RPOT.perform(LI);
1575     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1576       V.push_back(&L);
1577       // TODO: Collect inner loops inside marked outer loops in case
1578       // vectorization fails for the outer loop. Do not invoke
1579       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1580       // already known to be reducible. We can use an inherited attribute for
1581       // that.
1582       return;
1583     }
1584   }
1585   for (Loop *InnerL : L)
1586     collectSupportedLoops(*InnerL, LI, ORE, V);
1587 }
1588 
1589 namespace {
1590 
1591 /// The LoopVectorize Pass.
1592 struct LoopVectorize : public FunctionPass {
1593   /// Pass identification, replacement for typeid
1594   static char ID;
1595 
1596   LoopVectorizePass Impl;
1597 
1598   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1599                          bool VectorizeOnlyWhenForced = false)
1600       : FunctionPass(ID) {
1601     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1602     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1603     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1604   }
1605 
1606   bool runOnFunction(Function &F) override {
1607     if (skipFunction(F))
1608       return false;
1609 
1610     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1611     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1612     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1613     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1614     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1615     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1616     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1617     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1618     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1619     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1620     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1621     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1622     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1623 
1624     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1625         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1626 
1627     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1628                         GetLAA, *ORE, PSI);
1629   }
1630 
1631   void getAnalysisUsage(AnalysisUsage &AU) const override {
1632     AU.addRequired<AssumptionCacheTracker>();
1633     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1634     AU.addRequired<DominatorTreeWrapperPass>();
1635     AU.addRequired<LoopInfoWrapperPass>();
1636     AU.addRequired<ScalarEvolutionWrapperPass>();
1637     AU.addRequired<TargetTransformInfoWrapperPass>();
1638     AU.addRequired<AAResultsWrapperPass>();
1639     AU.addRequired<LoopAccessLegacyAnalysis>();
1640     AU.addRequired<DemandedBitsWrapperPass>();
1641     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1642     AU.addRequired<InjectTLIMappingsLegacy>();
1643 
1644     // We currently do not preserve loopinfo/dominator analyses with outer loop
1645     // vectorization. Until this is addressed, mark these analyses as preserved
1646     // only for non-VPlan-native path.
1647     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1648     if (!EnableVPlanNativePath) {
1649       AU.addPreserved<LoopInfoWrapperPass>();
1650       AU.addPreserved<DominatorTreeWrapperPass>();
1651     }
1652 
1653     AU.addPreserved<BasicAAWrapperPass>();
1654     AU.addPreserved<GlobalsAAWrapperPass>();
1655     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1656   }
1657 };
1658 
1659 } // end anonymous namespace
1660 
1661 //===----------------------------------------------------------------------===//
1662 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1663 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1664 //===----------------------------------------------------------------------===//
1665 
1666 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1670   Instruction *Instr = dyn_cast<Instruction>(V);
1671   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1672                      (!Instr ||
1673                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1674   // Place the code for broadcasting invariant variables in the new preheader.
1675   IRBuilder<>::InsertPointGuard Guard(Builder);
1676   if (SafeToHoist)
1677     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1678 
1679   // Broadcast the scalar into all locations in the vector.
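  //
  // For example (a sketch of the typically generated IR), with VF = 4 and an
  // i32 scalar %v:
  //   %b.ins     = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %broadcast = shufflevector <4 x i32> %b.ins, <4 x i32> undef,
  //                              <4 x i32> zeroinitializer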
1680   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1681 
1682   return Shuf;
1683 }
1684 
1685 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1686     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1687   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1688          "Expected either an induction phi-node or a truncate of it!");
1689   Value *Start = II.getStartValue();
1690 
1691   // Construct the initial value of the vector IV in the vector loop preheader
1692   auto CurrIP = Builder.saveIP();
1693   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1694   if (isa<TruncInst>(EntryVal)) {
1695     assert(Start->getType()->isIntegerTy() &&
1696            "Truncation requires an integer type");
1697     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1698     Step = Builder.CreateTrunc(Step, TruncType);
1699     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1700   }
1701   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1702   Value *SteppedStart =
1703       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1704 
1705   // We create vector phi nodes for both integer and floating-point induction
1706   // variables. Here, we determine the kind of arithmetic we will perform.
1707   Instruction::BinaryOps AddOp;
1708   Instruction::BinaryOps MulOp;
1709   if (Step->getType()->isIntegerTy()) {
1710     AddOp = Instruction::Add;
1711     MulOp = Instruction::Mul;
1712   } else {
1713     AddOp = II.getInductionOpcode();
1714     MulOp = Instruction::FMul;
1715   }
1716 
1717   // Multiply the vectorization factor by the step using integer or
1718   // floating-point arithmetic as appropriate.
1719   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1720   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1721 
1722   // Create a vector splat to use in the induction update.
1723   //
1724   // FIXME: If the step is non-constant, we create the vector splat with
1725   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1726   //        handle a constant vector splat.
1727   Value *SplatVF =
1728       isa<Constant>(Mul)
1729           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1730           : Builder.CreateVectorSplat(VF, Mul);
1731   Builder.restoreIP(CurrIP);
1732 
1733   // We may need to add the step a number of times, depending on the unroll
1734   // factor. The last of those goes into the PHI.
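  //
  // For example (a sketch), with VF = 4, UF = 2 and an integer step s:
  //   vec.ind      = phi [ <start, start+s, start+2s, start+3s>, preheader ],
  //                      [ vec.ind.next, latch ]
  //   step.add     = vec.ind + splat(4 * s)   ; value used for unroll part 1
  //   vec.ind.next = step.add + splat(4 * s)  ; moved to the latch, feeds phi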
1735   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1736                                     &*LoopVectorBody->getFirstInsertionPt());
1737   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1738   Instruction *LastInduction = VecInd;
1739   for (unsigned Part = 0; Part < UF; ++Part) {
1740     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1741 
1742     if (isa<TruncInst>(EntryVal))
1743       addMetadata(LastInduction, EntryVal);
1744     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1745 
1746     LastInduction = cast<Instruction>(addFastMathFlag(
1747         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1748     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1749   }
1750 
1751   // Move the last step to the end of the latch block. This ensures consistent
1752   // placement of all induction updates.
1753   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1754   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1755   auto *ICmp = cast<Instruction>(Br->getCondition());
1756   LastInduction->moveBefore(ICmp);
1757   LastInduction->setName("vec.ind.next");
1758 
1759   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1760   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1761 }
1762 
1763 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1764   return Cost->isScalarAfterVectorization(I, VF) ||
1765          Cost->isProfitableToScalarize(I, VF);
1766 }
1767 
1768 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1769   if (shouldScalarizeInstruction(IV))
1770     return true;
1771   auto isScalarInst = [&](User *U) -> bool {
1772     auto *I = cast<Instruction>(U);
1773     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1774   };
1775   return llvm::any_of(IV->users(), isScalarInst);
1776 }
1777 
1778 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1779     const InductionDescriptor &ID, const Instruction *EntryVal,
1780     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1781   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1782          "Expected either an induction phi-node or a truncate of it!");
1783 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1790   if (isa<TruncInst>(EntryVal))
1791     return;
1792 
1793   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1794   if (Casts.empty())
1795     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
1799   Instruction *CastInst = *Casts.begin();
1800   if (Lane < UINT_MAX)
1801     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1802   else
1803     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1804 }
1805 
1806 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1807   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1808          "Primary induction variable must have an integer type");
1809 
1810   auto II = Legal->getInductionVars().find(IV);
1811   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1812 
1813   auto ID = II->second;
1814   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1815 
1816   // The value from the original loop to which we are mapping the new induction
1817   // variable.
1818   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1819 
1820   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1821 
1822   // Generate code for the induction step. Note that induction steps are
1823   // required to be loop-invariant
1824   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1825     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1826            "Induction step should be loop invariant");
1827     if (PSE.getSE()->isSCEVable(IV->getType())) {
1828       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1829       return Exp.expandCodeFor(Step, Step->getType(),
1830                                LoopVectorPreHeader->getTerminator());
1831     }
1832     return cast<SCEVUnknown>(Step)->getValue();
1833   };
1834 
1835   // The scalar value to broadcast. This is derived from the canonical
1836   // induction variable. If a truncation type is given, truncate the canonical
1837   // induction variable and step. Otherwise, derive these values from the
1838   // induction descriptor.
1839   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1840     Value *ScalarIV = Induction;
1841     if (IV != OldInduction) {
1842       ScalarIV = IV->getType()->isIntegerTy()
1843                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1844                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1845                                           IV->getType());
1846       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1847       ScalarIV->setName("offset.idx");
1848     }
1849     if (Trunc) {
1850       auto *TruncType = cast<IntegerType>(Trunc->getType());
1851       assert(Step->getType()->isIntegerTy() &&
1852              "Truncation requires an integer step");
1853       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1854       Step = Builder.CreateTrunc(Step, TruncType);
1855     }
1856     return ScalarIV;
1857   };
1858 
1859   // Create the vector values from the scalar IV, in the absence of creating a
1860   // vector IV.
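  //
  // For example (a sketch), with UF = 2 and VF = 4, unroll part 0 receives
  // ScalarIV + <0, 1, 2, 3> * Step and part 1 receives
  // ScalarIV + <4, 5, 6, 7> * Step.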
1861   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1862     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1863     for (unsigned Part = 0; Part < UF; ++Part) {
1864       Value *EntryPart =
1865           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1866       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1867       if (Trunc)
1868         addMetadata(EntryPart, Trunc);
1869       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1870     }
1871   };
1872 
1873   // Now do the actual transformations, and start with creating the step value.
1874   Value *Step = CreateStepValue(ID.getStep());
1875   if (VF <= 1) {
1876     Value *ScalarIV = CreateScalarIV(Step);
1877     CreateSplatIV(ScalarIV, Step);
1878     return;
1879   }
1880 
1881   // Determine if we want a scalar version of the induction variable. This is
1882   // true if the induction variable itself is not widened, or if it has at
1883   // least one user in the loop that is not widened.
1884   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1885   if (!NeedsScalarIV) {
1886     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1887     return;
1888   }
1889 
1890   // Try to create a new independent vector induction variable. If we can't
1891   // create the phi node, we will splat the scalar induction variable in each
1892   // loop iteration.
1893   if (!shouldScalarizeInstruction(EntryVal)) {
1894     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1895     Value *ScalarIV = CreateScalarIV(Step);
1896     // Create scalar steps that can be used by instructions we will later
1897     // scalarize. Note that the addition of the scalar steps will not increase
1898     // the number of instructions in the loop in the common case prior to
1899     // InstCombine. We will be trading one vector extract for each scalar step.
1900     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1901     return;
1902   }
1903 
1904   // If we haven't yet vectorized the induction variable, splat the scalar
1905   // induction variable, and build the necessary step vectors.
1906   // TODO: Don't do it unless the vectorized IV is really required.
1907   Value *ScalarIV = CreateScalarIV(Step);
1908   CreateSplatIV(ScalarIV, Step);
1909   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1910 }
1911 
1912 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1913                                           Instruction::BinaryOps BinOp) {
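  // The result is Val + <StartIdx, StartIdx+1, ...> * Step. For example (an
  // illustrative sketch), for a <4 x i32> Val with StartIdx = 4 and step %s:
  //   %cv        = <i32 4, i32 5, i32 6, i32 7>
  //   %step.vec  = splat(%s)
  //   %induction = add %Val, (mul %cv, %step.vec)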
1914   // Create and check the types.
1915   assert(Val->getType()->isVectorTy() && "Must be a vector");
1916   int VLen = Val->getType()->getVectorNumElements();
1917 
1918   Type *STy = Val->getType()->getScalarType();
1919   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1920          "Induction Step must be an integer or FP");
1921   assert(Step->getType() == STy && "Step has wrong type");
1922 
1923   SmallVector<Constant *, 8> Indices;
1924 
1925   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers from StartIdx to
    // StartIdx + VLen - 1.
1927     for (int i = 0; i < VLen; ++i)
1928       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1929 
1930     // Add the consecutive indices to the vector value.
1931     Constant *Cv = ConstantVector::get(Indices);
1932     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1933     Step = Builder.CreateVectorSplat(VLen, Step);
1934     assert(Step->getType() == Val->getType() && "Invalid step vec");
1935     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1936     // which can be found from the original scalar operations.
1937     Step = Builder.CreateMul(Cv, Step);
1938     return Builder.CreateAdd(Val, Step, "induction");
1939   }
1940 
1941   // Floating point induction.
1942   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1943          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers from StartIdx to
  // StartIdx + VLen - 1.
1945   for (int i = 0; i < VLen; ++i)
1946     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1947 
1948   // Add the consecutive indices to the vector value.
1949   Constant *Cv = ConstantVector::get(Indices);
1950 
1951   Step = Builder.CreateVectorSplat(VLen, Step);
1952 
1953   // Floating point operations had to be 'fast' to enable the induction.
1954   FastMathFlags Flags;
1955   Flags.setFast();
1956 
1957   Value *MulOp = Builder.CreateFMul(Cv, Step);
1958   if (isa<Instruction>(MulOp))
    // We have to check because MulOp may be folded to a constant.
1960     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1961 
1962   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1963   if (isa<Instruction>(BOp))
1964     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1965   return BOp;
1966 }
1967 
1968 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1969                                            Instruction *EntryVal,
1970                                            const InductionDescriptor &ID) {
1971   // We shouldn't have to build scalar steps if we aren't vectorizing.
1972   assert(VF > 1 && "VF should be greater than one");
1973 
  // Get the value type and ensure it and the step have the same type.
1975   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1976   assert(ScalarIVTy == Step->getType() &&
1977          "Val and Step should have the same type");
1978 
1979   // We build scalar steps for both integer and floating-point induction
1980   // variables. Here, we determine the kind of arithmetic we will perform.
1981   Instruction::BinaryOps AddOp;
1982   Instruction::BinaryOps MulOp;
1983   if (ScalarIVTy->isIntegerTy()) {
1984     AddOp = Instruction::Add;
1985     MulOp = Instruction::Mul;
1986   } else {
1987     AddOp = ID.getInductionOpcode();
1988     MulOp = Instruction::FMul;
1989   }
1990 
1991   // Determine the number of scalars we need to generate for each unroll
1992   // iteration. If EntryVal is uniform, we only need to generate the first
1993   // lane. Otherwise, we generate all VF values.
1994   unsigned Lanes =
1995       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1996                                                                          : VF;
1997   // Compute the scalar steps and save the results in VectorLoopValueMap.
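  // For example (a sketch), with UF = 2, VF = 4 and a non-uniform EntryVal,
  // lane L of unroll part P receives ScalarIV + (4 * P + L) * Step.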
1998   for (unsigned Part = 0; Part < UF; ++Part) {
1999     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2000       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2001       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2002       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2003       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2004       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2005     }
2006   }
2007 }
2008 
2009 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2010   assert(V != Induction && "The new induction variable should not be used.");
2011   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2012   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2013 
2014   // If we have a stride that is replaced by one, do it here. Defer this for
2015   // the VPlan-native path until we start running Legal checks in that path.
2016   if (!EnableVPlanNativePath && Legal->hasStride(V))
2017     V = ConstantInt::get(V->getType(), 1);
2018 
2019   // If we have a vector mapped to this value, return it.
2020   if (VectorLoopValueMap.hasVectorValue(V, Part))
2021     return VectorLoopValueMap.getVectorValue(V, Part);
2022 
2023   // If the value has not been vectorized, check if it has been scalarized
2024   // instead. If it has been scalarized, and we actually need the value in
2025   // vector form, we will construct the vector values on demand.
2026   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2027     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2028 
2029     // If we've scalarized a value, that value should be an instruction.
2030     auto *I = cast<Instruction>(V);
2031 
2032     // If we aren't vectorizing, we can just copy the scalar map values over to
2033     // the vector map.
2034     if (VF == 1) {
2035       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2036       return ScalarValue;
2037     }
2038 
2039     // Get the last scalar instruction we generated for V and Part. If the value
2040     // is known to be uniform after vectorization, this corresponds to lane zero
2041     // of the Part unroll iteration. Otherwise, the last instruction is the one
2042     // we created for the last vector lane of the Part unroll iteration.
2043     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2044     auto *LastInst = cast<Instruction>(
2045         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2046 
2047     // Set the insert point after the last scalarized instruction. This ensures
2048     // the insertelement sequence will directly follow the scalar definitions.
2049     auto OldIP = Builder.saveIP();
2050     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2051     Builder.SetInsertPoint(&*NewIP);
2052 
2053     // However, if we are vectorizing, we need to construct the vector values.
2054     // If the value is known to be uniform after vectorization, we can just
2055     // broadcast the scalar value corresponding to lane zero for each unroll
2056     // iteration. Otherwise, we construct the vector values using insertelement
2057     // instructions. Since the resulting vectors are stored in
2058     // VectorLoopValueMap, we will only generate the insertelements once.
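    //
    // For example (a sketch), packing a scalarized value with VF = 4 yields:
    //   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
    //   %v1 = insertelement <4 x i32> %v0,   i32 %s1, i32 1
    //   %v2 = insertelement <4 x i32> %v1,   i32 %s2, i32 2
    //   %v3 = insertelement <4 x i32> %v2,   i32 %s3, i32 3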
2059     Value *VectorValue = nullptr;
2060     if (Cost->isUniformAfterVectorization(I, VF)) {
2061       VectorValue = getBroadcastInstrs(ScalarValue);
2062       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2063     } else {
2064       // Initialize packing with insertelements to start from undef.
2065       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2066       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2067       for (unsigned Lane = 0; Lane < VF; ++Lane)
2068         packScalarIntoVectorValue(V, {Part, Lane});
2069       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2070     }
2071     Builder.restoreIP(OldIP);
2072     return VectorValue;
2073   }
2074 
2075   // If this scalar is unknown, assume that it is a constant or that it is
2076   // loop invariant. Broadcast V and save the value for future uses.
2077   Value *B = getBroadcastInstrs(V);
2078   VectorLoopValueMap.setVectorValue(V, Part, B);
2079   return B;
2080 }
2081 
2082 Value *
2083 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2084                                             const VPIteration &Instance) {
2085   // If the value is not an instruction contained in the loop, it should
2086   // already be scalar.
2087   if (OrigLoop->isLoopInvariant(V))
2088     return V;
2089 
2090   assert(Instance.Lane > 0
2091              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2092              : true && "Uniform values only have lane zero");
2093 
2094   // If the value from the original loop has not been vectorized, it is
2095   // represented by UF x VF scalar values in the new loop. Return the requested
2096   // scalar value.
2097   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2098     return VectorLoopValueMap.getScalarValue(V, Instance);
2099 
2100   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2101   // for the given unroll part. If this entry is not a vector type (i.e., the
2102   // vectorization factor is one), there is no need to generate an
2103   // extractelement instruction.
2104   auto *U = getOrCreateVectorValue(V, Instance.Part);
2105   if (!U->getType()->isVectorTy()) {
2106     assert(VF == 1 && "Value not scalarized has non-vector type");
2107     return U;
2108   }
2109 
2110   // Otherwise, the value from the original loop has been vectorized and is
2111   // represented by UF vector values. Extract and return the requested scalar
2112   // value from the appropriate vector lane.
2113   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2114 }
2115 
2116 void InnerLoopVectorizer::packScalarIntoVectorValue(
2117     Value *V, const VPIteration &Instance) {
2118   assert(V != Induction && "The new induction variable should not be used.");
2119   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2120   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2121 
2122   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2123   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2124   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2125                                             Builder.getInt32(Instance.Lane));
2126   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2127 }
2128 
2129 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2130   assert(Vec->getType()->isVectorTy() && "Invalid type");
2131   SmallVector<Constant *, 8> ShuffleMask;
2132   for (unsigned i = 0; i < VF; ++i)
2133     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2134 
2135   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2136                                      ConstantVector::get(ShuffleMask),
2137                                      "reverse");
2138 }
2139 
2140 // Return whether we allow using masked interleave-groups (for dealing with
2141 // strided loads/stores that reside in predicated blocks, or for dealing
2142 // with gaps).
2143 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2144   // If an override option has been passed in for interleaved accesses, use it.
2145   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2146     return EnableMaskedInterleavedMemAccesses;
2147 
2148   return TTI.enableMaskedInterleavedAccessVectorization();
2149 }
2150 
2151 // Try to vectorize the interleave group that \p Instr belongs to.
2152 //
2153 // E.g. Translate following interleaved load group (factor = 3):
2154 //   for (i = 0; i < N; i+=3) {
2155 //     R = Pic[i];             // Member of index 0
2156 //     G = Pic[i+1];           // Member of index 1
2157 //     B = Pic[i+2];           // Member of index 2
2158 //     ... // do something to R, G, B
2159 //   }
2160 // To:
2161 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2162 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2163 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2164 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2165 //
2166 // Or translate following interleaved store group (factor = 3):
2167 //   for (i = 0; i < N; i+=3) {
2168 //     ... do something to R, G, B
2169 //     Pic[i]   = R;           // Member of index 0
2170 //     Pic[i+1] = G;           // Member of index 1
2171 //     Pic[i+2] = B;           // Member of index 2
2172 //   }
2173 // To:
2174 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2175 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2176 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2177 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2178 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2179 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2180                                                    VPTransformState &State,
2181                                                    VPValue *Addr,
2182                                                    VPValue *BlockInMask) {
2183   const InterleaveGroup<Instruction> *Group =
2184       Cost->getInterleavedAccessGroup(Instr);
2185   assert(Group && "Fail to get an interleaved access group.");
2186 
  // Skip if the current instruction is not the insert position.
2188   if (Instr != Group->getInsertPos())
2189     return;
2190 
2191   const DataLayout &DL = Instr->getModule()->getDataLayout();
2192 
2193   // Prepare for the vector type of the interleaved load/store.
2194   Type *ScalarTy = getMemInstValueType(Instr);
2195   unsigned InterleaveFactor = Group->getFactor();
2196   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2197 
2198   // Prepare for the new pointers.
2199   SmallVector<Value *, 2> AddrParts;
2200   unsigned Index = Group->getIndex(Instr);
2201 
2202   // TODO: extend the masked interleaved-group support to reversed access.
2203   assert((!BlockInMask || !Group->isReverse()) &&
2204          "Reversed masked interleave-group not supported.");
2205 
2206   // If the group is reverse, adjust the index to refer to the last vector lane
2207   // instead of the first. We adjust the index from the first vector lane,
2208   // rather than directly getting the pointer for lane VF - 1, because the
2209   // pointer operand of the interleaved access is supposed to be uniform. For
2210   // uniform instructions, we're only required to generate a value for the
2211   // first vector lane in each unroll iteration.
2212   if (Group->isReverse())
2213     Index += (VF - 1) * Group->getFactor();
2214 
2215   for (unsigned Part = 0; Part < UF; Part++) {
2216     Value *AddrPart = State.get(Addr, {Part, 0});
2217     setDebugLocFromInst(Builder, AddrPart);
2218 
    // Note that the current instruction could be at any index within the
    // group. We need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2230 
2231     bool InBounds = false;
2232     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2233       InBounds = gep->isInBounds();
2234     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2235     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2236 
2237     // Cast to the vector pointer type.
2238     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2239     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2240     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2241   }
2242 
2243   setDebugLocFromInst(Builder, Instr);
2244   Value *UndefVec = UndefValue::get(VecTy);
2245 
2246   Value *MaskForGaps = nullptr;
2247   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2248     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2249     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2250   }
2251 
2252   // Vectorize the interleaved load group.
2253   if (isa<LoadInst>(Instr)) {
2254     // For each unroll part, create a wide load for the group.
2255     SmallVector<Value *, 2> NewLoads;
2256     for (unsigned Part = 0; Part < UF; Part++) {
2257       Instruction *NewLoad;
2258       if (BlockInMask || MaskForGaps) {
2259         assert(useMaskedInterleavedAccesses(*TTI) &&
2260                "masked interleaved groups are not allowed.");
2261         Value *GroupMask = MaskForGaps;
2262         if (BlockInMask) {
2263           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2264           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2265           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2266           Value *ShuffledMask = Builder.CreateShuffleVector(
2267               BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2268           GroupMask = MaskForGaps
2269                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2270                                                 MaskForGaps)
2271                           : ShuffledMask;
2272         }
2273         NewLoad =
2274             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2275                                      GroupMask, UndefVec, "wide.masked.vec");
2276       }
2277       else
2278         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2279                                             Group->getAlign(), "wide.vec");
2280       Group->addMetadata(NewLoad);
2281       NewLoads.push_back(NewLoad);
2282     }
2283 
2284     // For each member in the group, shuffle out the appropriate data from the
2285     // wide loads.
2286     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2287       Instruction *Member = Group->getMember(I);
2288 
2289       // Skip the gaps in the group.
2290       if (!Member)
2291         continue;
2292 
2293       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2294       for (unsigned Part = 0; Part < UF; Part++) {
2295         Value *StridedVec = Builder.CreateShuffleVector(
2296             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2297 
        // If this member has a different type, cast the result to that type.
2299         if (Member->getType() != ScalarTy) {
2300           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2301           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2302         }
2303 
2304         if (Group->isReverse())
2305           StridedVec = reverseVector(StridedVec);
2306 
2307         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2308       }
2309     }
2310     return;
2311   }
2312 
  // The subvector type for the current instruction.
2314   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2315 
2316   // Vectorize the interleaved store group.
2317   for (unsigned Part = 0; Part < UF; Part++) {
2318     // Collect the stored vector from each member.
2319     SmallVector<Value *, 4> StoredVecs;
2320     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2322       Instruction *Member = Group->getMember(i);
2323       assert(Member && "Fail to get a member from an interleaved store group");
2324 
2325       Value *StoredVec = getOrCreateVectorValue(
2326           cast<StoreInst>(Member)->getValueOperand(), Part);
2327       if (Group->isReverse())
2328         StoredVec = reverseVector(StoredVec);
2329 
      // If this member has a different type, cast it to the unified subvector
      // type.
      if (StoredVec->getType() != SubVT)
2333         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2334 
2335       StoredVecs.push_back(StoredVec);
2336     }
2337 
2338     // Concatenate all vectors into a wide vector.
2339     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2340 
2341     // Interleave the elements in the wide vector.
2342     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2343     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2344                                               "interleaved.vec");
2345 
2346     Instruction *NewStoreInstr;
2347     if (BlockInMask) {
2348       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2349       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2350       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2351       Value *ShuffledMask = Builder.CreateShuffleVector(
2352           BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2353       NewStoreInstr = Builder.CreateMaskedStore(
2354           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2355     }
2356     else
2357       NewStoreInstr =
2358           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2359 
2360     Group->addMetadata(NewStoreInstr);
2361   }
2362 }
2363 
2364 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2365                                                      VPTransformState &State,
2366                                                      VPValue *Addr,
2367                                                      VPValue *StoredValue,
2368                                                      VPValue *BlockInMask) {
2369   // Attempt to issue a wide load.
2370   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2371   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2372 
2373   assert((LI || SI) && "Invalid Load/Store instruction");
2374   assert((!SI || StoredValue) && "No stored value provided for widened store");
2375   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2376 
2377   LoopVectorizationCostModel::InstWidening Decision =
2378       Cost->getWideningDecision(Instr, VF);
2379   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2380          "CM decision should be taken at this point");
2381   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2382     return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2383 
2384   Type *ScalarDataTy = getMemInstValueType(Instr);
2385   Type *DataTy = VectorType::get(ScalarDataTy, VF);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2388   const DataLayout &DL = Instr->getModule()->getDataLayout();
2389   const Align Alignment =
2390       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2391 
2392   // Determine if the pointer operand of the access is either consecutive or
2393   // reverse consecutive.
2394   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2395   bool ConsecutiveStride =
2396       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2397   bool CreateGatherScatter =
2398       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2399 
  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise the Decision should have been to scalarize.
2402   assert((ConsecutiveStride || CreateGatherScatter) &&
2403          "The instruction should be scalarized");
2404   (void)ConsecutiveStride;
2405 
2406   VectorParts BlockInMaskParts(UF);
2407   bool isMaskRequired = BlockInMask;
2408   if (isMaskRequired)
2409     for (unsigned Part = 0; Part < UF; ++Part)
2410       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2411 
2412   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2413     // Calculate the pointer for the specific unroll-part.
2414     GetElementPtrInst *PartPtr = nullptr;
2415 
2416     bool InBounds = false;
2417     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2418       InBounds = gep->isInBounds();
2419 
2420     if (Reverse) {
2421       // If the address is consecutive but reversed, then the
2422       // wide store needs to start at the last vector element.
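           // For example, with VF = 4 and Part = 1 the two GEPs below compute
           // Ptr - 4 - 3 = Ptr - 7, so the wide access covers elements Ptr[-7..-4].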
2423       PartPtr = cast<GetElementPtrInst>(
2424           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2425       PartPtr->setIsInBounds(InBounds);
2426       PartPtr = cast<GetElementPtrInst>(
2427           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2428       PartPtr->setIsInBounds(InBounds);
2429       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2430         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2431     } else {
2432       PartPtr = cast<GetElementPtrInst>(
2433           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2434       PartPtr->setIsInBounds(InBounds);
2435     }
2436 
2437     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2438     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2439   };
2440 
2441   // Handle Stores:
2442   if (SI) {
2443     setDebugLocFromInst(Builder, SI);
2444 
2445     for (unsigned Part = 0; Part < UF; ++Part) {
2446       Instruction *NewSI = nullptr;
2447       Value *StoredVal = State.get(StoredValue, Part);
2448       if (CreateGatherScatter) {
2449         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2450         Value *VectorGep = State.get(Addr, Part);
2451         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2452                                             MaskPart);
2453       } else {
2454         if (Reverse) {
2455           // If we store to reverse consecutive memory locations, then we need
2456           // to reverse the order of elements in the stored value.
2457           StoredVal = reverseVector(StoredVal);
2458           // We don't want to update the value in the map as it might be used in
2459           // another expression. So don't call resetVectorValue(StoredVal).
2460         }
2461         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2462         if (isMaskRequired)
2463           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2464                                             BlockInMaskParts[Part]);
2465         else
2466           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2467       }
2468       addMetadata(NewSI, SI);
2469     }
2470     return;
2471   }
2472 
2473   // Handle loads.
2474   assert(LI && "Must have a load instruction");
2475   setDebugLocFromInst(Builder, LI);
2476   for (unsigned Part = 0; Part < UF; ++Part) {
2477     Value *NewLI;
2478     if (CreateGatherScatter) {
2479       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2480       Value *VectorGep = State.get(Addr, Part);
2481       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2482                                          nullptr, "wide.masked.gather");
2483       addMetadata(NewLI, LI);
2484     } else {
2485       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2486       if (isMaskRequired)
2487         NewLI = Builder.CreateMaskedLoad(
2488             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2489             "wide.masked.load");
2490       else
2491         NewLI =
2492             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2493 
2494       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2495       addMetadata(NewLI, LI);
2496       if (Reverse)
2497         NewLI = reverseVector(NewLI);
2498     }
2499     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2500   }
2501 }
2502 
2503 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2504                                                const VPIteration &Instance,
2505                                                bool IfPredicateInstr) {
2506   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2507 
2508   setDebugLocFromInst(Builder, Instr);
2509 
2510   // Does this instruction return a value?
2511   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2512 
2513   Instruction *Cloned = Instr->clone();
2514   if (!IsVoidRetTy)
2515     Cloned->setName(Instr->getName() + ".cloned");
2516 
2517   // Replace the operands of the cloned instructions with their scalar
2518   // equivalents in the new loop.
2519   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2520     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2521     Cloned->setOperand(op, NewOp);
2522   }
2523   addNewMetadata(Cloned, Instr);
2524 
2525   // Place the cloned scalar in the new loop.
2526   Builder.Insert(Cloned);
2527 
2528   // Add the cloned scalar to the scalar map entry.
2529   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2530 
2531   // If we just cloned a new assumption, add it to the assumption cache.
2532   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2533     if (II->getIntrinsicID() == Intrinsic::assume)
2534       AC->registerAssumption(II);
2535 
2536   // End if-block.
2537   if (IfPredicateInstr)
2538     PredicatedInstructions.push_back(Cloned);
2539 }
2540 
2541 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2542                                                       Value *End, Value *Step,
2543                                                       Instruction *DL) {
2544   BasicBlock *Header = L->getHeader();
2545   BasicBlock *Latch = L->getLoopLatch();
2546   // As we're just creating this loop, it's possible no latch exists
2547   // yet. If so, use the header as this will be a single block loop.
2548   if (!Latch)
2549     Latch = Header;
2550 
2551   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2552   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2553   setDebugLocFromInst(Builder, OldInst);
2554   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2555 
2556   Builder.SetInsertPoint(Latch->getTerminator());
2557   setDebugLocFromInst(Builder, OldInst);
2558 
2559   // Create i+1 and fill the PHINode.
2560   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2561   Induction->addIncoming(Start, L->getLoopPreheader());
2562   Induction->addIncoming(Next, Latch);
2563   // Create the compare.
2564   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2565   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2566 
2567   // Now we have two terminators. Remove the old one from the block.
2568   Latch->getTerminator()->eraseFromParent();
2569 
2570   return Induction;
2571 }
2572 
2573 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2574   if (TripCount)
2575     return TripCount;
2576 
2577   assert(L && "Create Trip Count for null loop.");
2578   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2579   // Find the loop boundaries.
2580   ScalarEvolution *SE = PSE.getSE();
2581   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2582   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2583          "Invalid loop count");
2584 
2585   Type *IdxTy = Legal->getWidestInductionType();
2586   assert(IdxTy && "No type for induction");
2587 
2588   // The exit count might have type i64 while the phi is i32. This can
2589   // happen if we have an induction variable that is sign extended before the
2590   // compare. The only way we can get a backedge-taken count here is if the
2591   // induction variable was signed and as such will not overflow. In such a
2592   // case truncation is legal.
2593   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2594       IdxTy->getPrimitiveSizeInBits())
2595     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2596   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2597 
2598   // Get the total trip count from the count by adding 1.
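       // For example, a loop 'for (i = 0; i < n; ++i)' has a backedge-taken count
       // of n - 1 and a trip count of n.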
2599   const SCEV *ExitCount = SE->getAddExpr(
2600       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2601 
2602   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2603 
2604   // Expand the trip count and place the new instructions in the preheader.
2605   // Notice that the pre-header does not change, only the loop body.
2606   SCEVExpander Exp(*SE, DL, "induction");
2607 
2608   // Count holds the overall loop count (N).
2609   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2610                                 L->getLoopPreheader()->getTerminator());
2611 
2612   if (TripCount->getType()->isPointerTy())
2613     TripCount =
2614         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2615                                     L->getLoopPreheader()->getTerminator());
2616 
2617   return TripCount;
2618 }
2619 
2620 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2621   if (VectorTripCount)
2622     return VectorTripCount;
2623 
2624   Value *TC = getOrCreateTripCount(L);
2625   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2626 
2627   Type *Ty = TC->getType();
2628   Constant *Step = ConstantInt::get(Ty, VF * UF);
2629 
2630   // If the tail is to be folded by masking, round the number of iterations N
2631   // up to a multiple of Step instead of rounding down. This is done by first
2632   // adding Step-1 and then rounding down. Note that it's ok if this addition
2633   // overflows: the vector induction variable will eventually wrap to zero given
2634   // that it starts at zero and its Step is a power of two; the loop will then
2635   // exit, with the last early-exit vector comparison also producing all-true.
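       // For example, with VF * UF = 8 and N = 10, N is first bumped to 17; the
       // remainder computed below is then 17 % 8 = 1 and the vector trip count
       // becomes 16, i.e. 10 rounded up to the next multiple of 8.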
2636   if (Cost->foldTailByMasking()) {
2637     assert(isPowerOf2_32(VF * UF) &&
2638            "VF*UF must be a power of 2 when folding tail by masking");
2639     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2640   }
2641 
2642   // Now we need to generate the expression for the part of the loop that the
2643   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2644   // iterations are not required for correctness, or N - Step, otherwise. Step
2645   // is equal to the vectorization factor (number of SIMD elements) times the
2646   // unroll factor (number of SIMD instructions).
2647   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2648 
2649   // If there is a non-reversed interleaved group that may speculatively access
2650   // memory out-of-bounds, we need to ensure that there will be at least one
2651   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2652   // the trip count, we set the remainder to be equal to the step. If the step
2653   // does not evenly divide the trip count, no adjustment is necessary since
2654   // there will already be scalar iterations. Note that the minimum iterations
2655   // check ensures that N >= Step.
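       // For example, with Step = 8 and a trip count of 16 the remainder would be
       // 0; we force it to 8 so that the scalar epilogue executes 8 iterations.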
2656   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2657     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2658     R = Builder.CreateSelect(IsZero, Step, R);
2659   }
2660 
2661   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2662 
2663   return VectorTripCount;
2664 }
2665 
2666 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2667                                                    const DataLayout &DL) {
2668   // Verify that V is a vector type with same number of elements as DstVTy.
2669   unsigned VF = DstVTy->getNumElements();
2670   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2671   assert((VF == SrcVecTy->getNumElements()) &&
              "Vector dimensions do not match");
2672   Type *SrcElemTy = SrcVecTy->getElementType();
2673   Type *DstElemTy = DstVTy->getElementType();
2674   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2675          "Vector elements must have same size");
2676 
2677   // Do a direct cast if element types are castable.
2678   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2679     return Builder.CreateBitOrPointerCast(V, DstVTy);
2680   }
2681   // V cannot be directly cast to the desired vector type.
2682   // This may happen when V is a floating point vector but DstVTy is a vector
2683   // of pointers, or vice-versa. Handle this with a two-step cast through an
2684   // intermediate integer type, i.e. Ptr <-> Int <-> Float.
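       // For example, <4 x double> would first be cast to <4 x i64> and then to a
       // vector of pointers via inttoptr, assuming the target uses 64-bit pointers.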
2685   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2686          "Only one type should be a pointer type");
2687   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2688          "Only one type should be a floating point type");
2689   Type *IntTy =
2690       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2691   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2692   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2693   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2694 }
2695 
2696 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2697                                                          BasicBlock *Bypass) {
2698   Value *Count = getOrCreateTripCount(L);
2699   // Reuse existing vector loop preheader for TC checks.
2700   // Note that new preheader block is generated for vector loop.
2701   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2702   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2703 
2704   // Generate code to check if the loop's trip count is less than VF * UF, or
2705   // equal to it in case a scalar epilogue is required; this implies that the
2706   // vector trip count is zero. This check also covers the case where adding one
2707   // to the backedge-taken count overflowed leading to an incorrect trip count
2708   // of zero. In this case we will also jump to the scalar loop.
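       // For example, with VF = 4 and UF = 2 we branch to the scalar loop whenever
       // the trip count is below 8 (or equal to 8 if a scalar epilogue is required).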
2709   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2710                                           : ICmpInst::ICMP_ULT;
2711 
2712   // If tail is to be folded, vector loop takes care of all iterations.
2713   Value *CheckMinIters = Builder.getFalse();
2714   if (!Cost->foldTailByMasking())
2715     CheckMinIters = Builder.CreateICmp(
2716         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2717         "min.iters.check");
2718 
2719   // Create new preheader for vector loop.
2720   LoopVectorPreHeader =
2721       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2722                  "vector.ph");
2723 
2724   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2725                                DT->getNode(Bypass)->getIDom()) &&
2726          "TC check is expected to dominate Bypass");
2727 
2728   // Update dominator for Bypass & LoopExit.
2729   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2730   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2731 
2732   ReplaceInstWithInst(
2733       TCCheckBlock->getTerminator(),
2734       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2735   LoopBypassBlocks.push_back(TCCheckBlock);
2736 }
2737 
2738 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2739   // Reuse existing vector loop preheader for SCEV checks.
2740   // Note that new preheader block is generated for vector loop.
2741   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2742 
2743   // Generate the code to check that the SCEV assumptions we made hold.
2744   // We want the new basic block to start at the first instruction in a
2745   // sequence of instructions that form a check.
2746   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2747                    "scev.check");
2748   Value *SCEVCheck = Exp.expandCodeForPredicate(
2749       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2750 
2751   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2752     if (C->isZero())
2753       return;
2754 
2755   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2756          "Cannot SCEV check stride or overflow when optimizing for size");
2757 
2758   SCEVCheckBlock->setName("vector.scevcheck");
2759   // Create new preheader for vector loop.
2760   LoopVectorPreHeader =
2761       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2762                  nullptr, "vector.ph");
2763 
2764   // Update dominator only if this is the first RT check.
2765   if (LoopBypassBlocks.empty()) {
2766     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2767     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2768   }
2769 
2770   ReplaceInstWithInst(
2771       SCEVCheckBlock->getTerminator(),
2772       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2773   LoopBypassBlocks.push_back(SCEVCheckBlock);
2774   AddedSafetyChecks = true;
2775 }
2776 
2777 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2778   // VPlan-native path does not do any analysis for runtime checks currently.
2779   if (EnableVPlanNativePath)
2780     return;
2781 
2782   // Reuse existing vector loop preheader for runtime memory checks.
2783   // Note that new preheader block is generated for vector loop.
2784   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2785 
2786   // Generate the code that checks at runtime whether arrays overlap. We put the
2787   // checks into a separate block to make the more common case of few elements
2788   // faster.
2789   Instruction *FirstCheckInst;
2790   Instruction *MemRuntimeCheck;
2791   std::tie(FirstCheckInst, MemRuntimeCheck) =
2792       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2793   if (!MemRuntimeCheck)
2794     return;
2795 
2796   if (MemCheckBlock->getParent()->hasOptSize()) {
2797     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2798            "Cannot emit memory checks when optimizing for size, unless forced "
2799            "to vectorize.");
2800     ORE->emit([&]() {
2801       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2802                                         L->getStartLoc(), L->getHeader())
2803              << "Code-size may be reduced by not forcing "
2804                 "vectorization, or by source-code modifications "
2805                 "eliminating the need for runtime checks "
2806                 "(e.g., adding 'restrict').";
2807     });
2808   }
2809 
2810   MemCheckBlock->setName("vector.memcheck");
2811   // Create new preheader for vector loop.
2812   LoopVectorPreHeader =
2813       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2814                  "vector.ph");
2815 
2816   // Update dominator only if this is the first RT check.
2817   if (LoopBypassBlocks.empty()) {
2818     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2819     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2820   }
2821 
2822   ReplaceInstWithInst(
2823       MemCheckBlock->getTerminator(),
2824       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2825   LoopBypassBlocks.push_back(MemCheckBlock);
2826   AddedSafetyChecks = true;
2827 
2828   // We currently don't use LoopVersioning for the actual loop cloning but we
2829   // still use it to add the noalias metadata.
2830   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2831                                           PSE.getSE());
2832   LVer->prepareNoAliasMetadata();
2833 }
2834 
2835 Value *InnerLoopVectorizer::emitTransformedIndex(
2836     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2837     const InductionDescriptor &ID) const {
2838 
2839   SCEVExpander Exp(*SE, DL, "induction");
2840   auto Step = ID.getStep();
2841   auto StartValue = ID.getStartValue();
2842   assert(Index->getType() == Step->getType() &&
2843          "Index type does not match StepValue type");
2844 
2845   // Note: the IR at this point is broken. We cannot use SE to create any new
2846   // SCEV and then expand it, hoping that SCEV's simplification will give us
2847   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2848   // lead to various SCEV crashes. So all we can do is use the builder and
2849   // rely on InstCombine for future simplifications. Here we handle some
2850   // trivial cases only.
2851   auto CreateAdd = [&B](Value *X, Value *Y) {
2852     assert(X->getType() == Y->getType() && "Types don't match!");
2853     if (auto *CX = dyn_cast<ConstantInt>(X))
2854       if (CX->isZero())
2855         return Y;
2856     if (auto *CY = dyn_cast<ConstantInt>(Y))
2857       if (CY->isZero())
2858         return X;
2859     return B.CreateAdd(X, Y);
2860   };
2861 
2862   auto CreateMul = [&B](Value *X, Value *Y) {
2863     assert(X->getType() == Y->getType() && "Types don't match!");
2864     if (auto *CX = dyn_cast<ConstantInt>(X))
2865       if (CX->isOne())
2866         return Y;
2867     if (auto *CY = dyn_cast<ConstantInt>(Y))
2868       if (CY->isOne())
2869         return X;
2870     return B.CreateMul(X, Y);
2871   };
2872 
2873   switch (ID.getKind()) {
2874   case InductionDescriptor::IK_IntInduction: {
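         // The transformed index is StartValue + Index * Step (or StartValue -
         // Index when the constant step is -1).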
2875     assert(Index->getType() == StartValue->getType() &&
2876            "Index type does not match StartValue type");
2877     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2878       return B.CreateSub(StartValue, Index);
2879     auto *Offset = CreateMul(
2880         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2881     return CreateAdd(StartValue, Offset);
2882   }
2883   case InductionDescriptor::IK_PtrInduction: {
2884     assert(isa<SCEVConstant>(Step) &&
2885            "Expected constant step for pointer induction");
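         // The transformed index is the address &StartValue[Index * Step],
         // computed with a GEP.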
2886     return B.CreateGEP(
2887         StartValue->getType()->getPointerElementType(), StartValue,
2888         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2889                                            &*B.GetInsertPoint())));
2890   }
2891   case InductionDescriptor::IK_FpInduction: {
2892     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2893     auto InductionBinOp = ID.getInductionBinOp();
2894     assert(InductionBinOp &&
2895            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2896             InductionBinOp->getOpcode() == Instruction::FSub) &&
2897            "Original bin op should be defined for FP induction");
2898 
2899     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2900 
2901     // Floating point operations had to be 'fast' to enable the induction.
2902     FastMathFlags Flags;
2903     Flags.setFast();
2904 
2905     Value *MulExp = B.CreateFMul(StepValue, Index);
2906     if (isa<Instruction>(MulExp))
2907       // We have to check because MulExp may be a constant.
2908       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2909 
2910     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2911                                "induction");
2912     if (isa<Instruction>(BOp))
2913       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2914 
2915     return BOp;
2916   }
2917   case InductionDescriptor::IK_NoInduction:
2918     return nullptr;
2919   }
2920   llvm_unreachable("invalid enum");
2921 }
2922 
2923 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2924   /*
2925    In this function we generate a new loop. The new loop will contain
2926    the vectorized instructions while the old loop will continue to run the
2927    scalar remainder.
2928 
2929        [ ] <-- loop iteration number check.
2930     /   |
2931    /    v
2932   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2933   |  /  |
2934   | /   v
2935   ||   [ ]     <-- vector pre header.
2936   |/    |
2937   |     v
2938   |    [  ] \
2939   |    [  ]_|   <-- vector loop.
2940   |     |
2941   |     v
2942   |   -[ ]   <--- middle-block.
2943   |  /  |
2944   | /   v
2945   -|- >[ ]     <--- new preheader.
2946    |    |
2947    |    v
2948    |   [ ] \
2949    |   [ ]_|   <-- old scalar loop to handle remainder.
2950     \   |
2951      \  v
2952       >[ ]     <-- exit block.
2953    ...
2954    */
2955 
2956   MDNode *OrigLoopID = OrigLoop->getLoopID();
2957 
2958   // Some loops have a single integer induction variable, while other loops
2959   // don't. One example is C++ iterators, which often have multiple pointer
2960   // induction variables. In the code below we also support a case where we
2961   // don't have a single induction variable.
2962   //
2963   // We try as hard as possible to obtain an induction variable from the
2964   // original loop. However, if we don't find one that:
2965   //   - is an integer
2966   //   - counts from zero, stepping by one
2967   //   - is the size of the widest induction variable type
2968   // then we create a new one.
2969   OldInduction = Legal->getPrimaryInduction();
2970   Type *IdxTy = Legal->getWidestInductionType();
2971 
2972   // Split the single block loop into the two loop structure described above.
2973   LoopScalarBody = OrigLoop->getHeader();
2974   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2975   LoopExitBlock = OrigLoop->getExitBlock();
2976   assert(LoopExitBlock && "Must have an exit block");
2977   assert(LoopVectorPreHeader && "Invalid loop structure");
2978 
2979   LoopMiddleBlock =
2980       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2981                  LI, nullptr, "middle.block");
2982   LoopScalarPreHeader =
2983       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2984                  nullptr, "scalar.ph");
2985   // We intentionally don't let SplitBlock update LoopInfo since
2986   // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
2987   // LoopVectorBody is explicitly added to the correct place a few lines later.
2988   LoopVectorBody =
2989       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2990                  nullptr, nullptr, "vector.body");
2991 
2992   // Update dominator for loop exit.
2993   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2994 
2995   // Create and register the new vector loop.
2996   Loop *Lp = LI->AllocateLoop();
2997   Loop *ParentLoop = OrigLoop->getParentLoop();
2998 
2999   // Insert the new loop into the loop nest and register the new basic blocks
3000   // before calling any utilities such as SCEV that require valid LoopInfo.
3001   if (ParentLoop) {
3002     ParentLoop->addChildLoop(Lp);
3003   } else {
3004     LI->addTopLevelLoop(Lp);
3005   }
3006   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3007 
3008   // Find the loop boundaries.
3009   Value *Count = getOrCreateTripCount(Lp);
3010 
3011   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3012 
3013   // Now, compare the new count to zero. If it is zero skip the vector loop and
3014   // jump to the scalar loop. This check also covers the case where the
3015   // backedge-taken count is uint##_max: adding one to it will overflow leading
3016   // to an incorrect trip count of zero. In this (rare) case we will also jump
3017   // to the scalar loop.
3018   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3019 
3020   // Generate the code to check any assumptions that we've made for SCEV
3021   // expressions.
3022   emitSCEVChecks(Lp, LoopScalarPreHeader);
3023 
3024   // Generate the code that checks at runtime whether arrays overlap. We put the
3025   // checks into a separate block to make the more common case of few elements
3026   // faster.
3027   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3028 
3029   // Generate the induction variable.
3030   // The loop step is equal to the vectorization factor (num of SIMD elements)
3031   // times the unroll factor (num of SIMD instructions).
3032   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3033   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3034   Induction =
3035       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3036                               getDebugLocFromInstOrOperands(OldInduction));
3037 
3038   // We are going to resume the execution of the scalar loop.
3039   // Go over all of the induction variables that we found and fix the
3040   // PHIs that are left in the scalar version of the loop.
3041   // The starting values of PHI nodes depend on the counter of the last
3042   // iteration in the vectorized loop.
3043   // If we come from a bypass edge then we need to start from the original
3044   // start value.
3045 
3046   // This variable saves the new starting index for the scalar loop. It is used
3047   // to test if there are any tail iterations left once the vector loop has
3048   // completed.
3049   for (auto &InductionEntry : Legal->getInductionVars()) {
3050     PHINode *OrigPhi = InductionEntry.first;
3051     InductionDescriptor II = InductionEntry.second;
3052 
3053     // Create phi nodes to merge values from the backedge-taken check block.
3054     PHINode *BCResumeVal =
3055         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3056                         LoopScalarPreHeader->getTerminator());
3057     // Copy original phi DL over to the new one.
3058     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3059     Value *&EndValue = IVEndValues[OrigPhi];
3060     if (OrigPhi == OldInduction) {
3061       // We know what the end value is.
3062       EndValue = CountRoundDown;
3063     } else {
3064       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3065       Type *StepType = II.getStep()->getType();
3066       Instruction::CastOps CastOp =
3067           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3068       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3069       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3070       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3071       EndValue->setName("ind.end");
3072     }
3073 
3074     // The new PHI merges the original incoming value, in case of a bypass,
3075     // or the value at the end of the vectorized loop.
3076     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3077 
3078     // Fix the scalar body counter (PHI node).
3079     // The old induction's phi node in the scalar body needs the truncated
3080     // value.
3081     for (BasicBlock *BB : LoopBypassBlocks)
3082       BCResumeVal->addIncoming(II.getStartValue(), BB);
3083     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3084   }
3085 
3086   // We need the OrigLoop (scalar loop part) latch terminator to help
3087   // produce correct debug info for the middle block BB instructions.
3088   // The legality check stage guarantees that the loop will have a single
3089   // latch.
3090   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3091          "Scalar loop latch terminator isn't a branch");
3092   BranchInst *ScalarLatchBr =
3093       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3094 
3095   // Add a check in the middle block to see if we have completed
3096   // all of the iterations in the first vector loop.
3097   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3098   // If tail is to be folded, we know we don't need to run the remainder.
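       // For example, if N = 20 and VF * UF = 8 then CountRoundDown = 16 != 20, so
       // the scalar remainder loop executes the last 4 iterations.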
3099   Value *CmpN = Builder.getTrue();
3100   if (!Cost->foldTailByMasking()) {
3101     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3102                            CountRoundDown, "cmp.n",
3103                            LoopMiddleBlock->getTerminator());
3104 
3105     // Here we use the same DebugLoc as the scalar loop latch branch instead
3106     // of the corresponding compare because they may have ended up with
3107     // different line numbers and we want to avoid awkward line stepping while
3108     // debugging. E.g., if the compare has a line number inside the loop.
3109     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3110   }
3111 
3112   BranchInst *BrInst =
3113       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3114   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3115   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3116 
3117   // Get ready to start creating new instructions into the vectorized body.
3118   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3119          "Inconsistent vector loop preheader");
3120   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3121 
3122   Optional<MDNode *> VectorizedLoopID =
3123       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3124                                       LLVMLoopVectorizeFollowupVectorized});
3125   if (VectorizedLoopID.hasValue()) {
3126     Lp->setLoopID(VectorizedLoopID.getValue());
3127 
3128     // Do not setAlreadyVectorized if loop attributes have been defined
3129     // explicitly.
3130     return LoopVectorPreHeader;
3131   }
3132 
3133   // Keep all loop hints from the original loop on the vector loop (we'll
3134   // replace the vectorizer-specific hints below).
3135   if (MDNode *LID = OrigLoop->getLoopID())
3136     Lp->setLoopID(LID);
3137 
3138   LoopVectorizeHints Hints(Lp, true, *ORE);
3139   Hints.setAlreadyVectorized();
3140 
3141 #ifdef EXPENSIVE_CHECKS
3142   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3143   LI->verify(*DT);
3144 #endif
3145 
3146   return LoopVectorPreHeader;
3147 }
3148 
3149 // Fix up external users of the induction variable. At this point, we are
3150 // in LCSSA form, with all external PHIs that use the IV having one input value,
3151 // coming from the remainder loop. We need those PHIs to also have a correct
3152 // value for the IV when arriving directly from the middle block.
3153 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3154                                        const InductionDescriptor &II,
3155                                        Value *CountRoundDown, Value *EndValue,
3156                                        BasicBlock *MiddleBlock) {
3157   // There are two kinds of external IV usages - those that use the value
3158   // computed in the last iteration (the PHI) and those that use the penultimate
3159   // value (the value that feeds into the phi from the loop latch).
3160   // We allow both, but they, obviously, have different values.
3161 
3162   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3163 
3164   DenseMap<Value *, Value *> MissingVals;
3165 
3166   // An external user of the last iteration's value should see the value that
3167   // the remainder loop uses to initialize its own IV.
3168   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3169   for (User *U : PostInc->users()) {
3170     Instruction *UI = cast<Instruction>(U);
3171     if (!OrigLoop->contains(UI)) {
3172       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3173       MissingVals[UI] = EndValue;
3174     }
3175   }
3176 
3177   // An external user of the penultimate value needs to see EndValue - Step.
3178   // The simplest way to get this is to recompute it from the constituent SCEVs,
3179   // that is Start + (Step * (CRD - 1)).
3180   for (User *U : OrigPhi->users()) {
3181     auto *UI = cast<Instruction>(U);
3182     if (!OrigLoop->contains(UI)) {
3183       const DataLayout &DL =
3184           OrigLoop->getHeader()->getModule()->getDataLayout();
3185       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3186 
3187       IRBuilder<> B(MiddleBlock->getTerminator());
3188       Value *CountMinusOne = B.CreateSub(
3189           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3190       Value *CMO =
3191           !II.getStep()->getType()->isIntegerTy()
3192               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3193                              II.getStep()->getType())
3194               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3195       CMO->setName("cast.cmo");
3196       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3197       Escape->setName("ind.escape");
3198       MissingVals[UI] = Escape;
3199     }
3200   }
3201 
3202   for (auto &I : MissingVals) {
3203     PHINode *PHI = cast<PHINode>(I.first);
3204     // One corner case we have to handle is two IVs "chasing" each other,
3205     // that is, %IV2 = phi [...], [ %IV1, %latch ]
3206     // In this case, if IV1 has an external use, we need to avoid adding both
3207     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3208     // don't already have an incoming value for the middle block.
3209     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3210       PHI->addIncoming(I.second, MiddleBlock);
3211   }
3212 }
3213 
3214 namespace {
3215 
3216 struct CSEDenseMapInfo {
3217   static bool canHandle(const Instruction *I) {
3218     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3219            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3220   }
3221 
3222   static inline Instruction *getEmptyKey() {
3223     return DenseMapInfo<Instruction *>::getEmptyKey();
3224   }
3225 
3226   static inline Instruction *getTombstoneKey() {
3227     return DenseMapInfo<Instruction *>::getTombstoneKey();
3228   }
3229 
3230   static unsigned getHashValue(const Instruction *I) {
3231     assert(canHandle(I) && "Unknown instruction!");
3232     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3233                                                            I->value_op_end()));
3234   }
3235 
3236   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3237     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3238         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3239       return LHS == RHS;
3240     return LHS->isIdenticalTo(RHS);
3241   }
3242 };
3243 
3244 } // end anonymous namespace
3245 
3246 /// Perform CSE of induction variable instructions.
3247 static void cse(BasicBlock *BB) {
3248   // Perform simple cse.
3249   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3250   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3251     Instruction *In = &*I++;
3252 
3253     if (!CSEDenseMapInfo::canHandle(In))
3254       continue;
3255 
3256     // Check if we can replace this instruction with any of the
3257     // visited instructions.
3258     if (Instruction *V = CSEMap.lookup(In)) {
3259       In->replaceAllUsesWith(V);
3260       In->eraseFromParent();
3261       continue;
3262     }
3263 
3264     CSEMap[In] = In;
3265   }
3266 }
3267 
3268 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3269                                                        unsigned VF,
3270                                                        bool &NeedToScalarize) {
3271   Function *F = CI->getCalledFunction();
3272   Type *ScalarRetTy = CI->getType();
3273   SmallVector<Type *, 4> Tys, ScalarTys;
3274   for (auto &ArgOp : CI->arg_operands())
3275     ScalarTys.push_back(ArgOp->getType());
3276 
3277   // Estimate cost of scalarized vector call. The source operands are assumed
3278   // to be vectors, so we need to extract individual elements from there,
3279   // execute VF scalar calls, and then gather the result into the vector return
3280   // value.
3281   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3282   if (VF == 1)
3283     return ScalarCallCost;
3284 
3285   // Compute corresponding vector type for return value and arguments.
3286   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3287   for (Type *ScalarTy : ScalarTys)
3288     Tys.push_back(ToVectorTy(ScalarTy, VF));
3289 
3290   // Compute costs of unpacking argument values for the scalar calls and
3291   // packing the return values to a vector.
3292   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3293 
3294   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3295 
3296   // If we can't emit a vector call for this function, then the currently found
3297   // cost is the cost we need to return.
3298   NeedToScalarize = true;
3299   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3300   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3301 
3302   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3303     return Cost;
3304 
3305   // If the corresponding vector cost is cheaper, return its cost.
3306   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3307   if (VectorCallCost < Cost) {
3308     NeedToScalarize = false;
3309     return VectorCallCost;
3310   }
3311   return Cost;
3312 }
3313 
3314 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3315                                                             unsigned VF) {
3316   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3317   assert(ID && "Expected intrinsic call!");
3318 
3319   FastMathFlags FMF;
3320   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3321     FMF = FPMO->getFastMathFlags();
3322 
3323   SmallVector<Value *, 4> Operands(CI->arg_operands());
3324   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI);
3325 }
3326 
3327 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3328   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3329   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3330   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3331 }
3332 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3333   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3334   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3335   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3336 }
3337 
3338 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3339   // For every instruction `I` in MinBWs, truncate the operands, create a
3340   // truncated version of `I` and reextend its result. InstCombine runs
3341   // later and will remove any ext/trunc pairs.
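       // For example, if only 8 bits of a widened i32 add are demanded, the add is
       // rewritten as a trunc of the operands to i8, an i8 add, and a zext of the
       // result back to i32.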
3342   SmallPtrSet<Value *, 4> Erased;
3343   for (const auto &KV : Cost->getMinimalBitwidths()) {
3344     // If the value wasn't vectorized, we must maintain the original scalar
3345     // type. The absence of the value from VectorLoopValueMap indicates that it
3346     // wasn't vectorized.
3347     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3348       continue;
3349     for (unsigned Part = 0; Part < UF; ++Part) {
3350       Value *I = getOrCreateVectorValue(KV.first, Part);
3351       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3352           !isa<Instruction>(I))
3353         continue;
3354       Type *OriginalTy = I->getType();
3355       Type *ScalarTruncatedTy =
3356           IntegerType::get(OriginalTy->getContext(), KV.second);
3357       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3358                                           OriginalTy->getVectorNumElements());
3359       if (TruncatedTy == OriginalTy)
3360         continue;
3361 
3362       IRBuilder<> B(cast<Instruction>(I));
3363       auto ShrinkOperand = [&](Value *V) -> Value * {
3364         if (auto *ZI = dyn_cast<ZExtInst>(V))
3365           if (ZI->getSrcTy() == TruncatedTy)
3366             return ZI->getOperand(0);
3367         return B.CreateZExtOrTrunc(V, TruncatedTy);
3368       };
3369 
3370       // The actual instruction modification depends on the instruction type,
3371       // unfortunately.
3372       Value *NewI = nullptr;
3373       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3374         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3375                              ShrinkOperand(BO->getOperand(1)));
3376 
3377         // Any wrapping introduced by shrinking this operation shouldn't be
3378         // considered undefined behavior. So, we can't unconditionally copy
3379         // arithmetic wrapping flags to NewI.
3380         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3381       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3382         NewI =
3383             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3384                          ShrinkOperand(CI->getOperand(1)));
3385       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3386         NewI = B.CreateSelect(SI->getCondition(),
3387                               ShrinkOperand(SI->getTrueValue()),
3388                               ShrinkOperand(SI->getFalseValue()));
3389       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3390         switch (CI->getOpcode()) {
3391         default:
3392           llvm_unreachable("Unhandled cast!");
3393         case Instruction::Trunc:
3394           NewI = ShrinkOperand(CI->getOperand(0));
3395           break;
3396         case Instruction::SExt:
3397           NewI = B.CreateSExtOrTrunc(
3398               CI->getOperand(0),
3399               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3400           break;
3401         case Instruction::ZExt:
3402           NewI = B.CreateZExtOrTrunc(
3403               CI->getOperand(0),
3404               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3405           break;
3406         }
3407       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3408         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3409         auto *O0 = B.CreateZExtOrTrunc(
3410             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3411         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3412         auto *O1 = B.CreateZExtOrTrunc(
3413             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3414 
3415         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3416       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3417         // Don't do anything with the operands, just extend the result.
3418         continue;
3419       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3420         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3421         auto *O0 = B.CreateZExtOrTrunc(
3422             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3423         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3424         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3425       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3426         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3427         auto *O0 = B.CreateZExtOrTrunc(
3428             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3429         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3430       } else {
3431         // If we don't know what to do, be conservative and don't do anything.
3432         continue;
3433       }
3434 
3435       // Lastly, extend the result.
3436       NewI->takeName(cast<Instruction>(I));
3437       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3438       I->replaceAllUsesWith(Res);
3439       cast<Instruction>(I)->eraseFromParent();
3440       Erased.insert(I);
3441       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3442     }
3443   }
3444 
3445   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3446   for (const auto &KV : Cost->getMinimalBitwidths()) {
3447     // If the value wasn't vectorized, we must maintain the original scalar
3448     // type. The absence of the value from VectorLoopValueMap indicates that it
3449     // wasn't vectorized.
3450     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3451       continue;
3452     for (unsigned Part = 0; Part < UF; ++Part) {
3453       Value *I = getOrCreateVectorValue(KV.first, Part);
3454       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3455       if (Inst && Inst->use_empty()) {
3456         Value *NewI = Inst->getOperand(0);
3457         Inst->eraseFromParent();
3458         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3459       }
3460     }
3461   }
3462 }
3463 
3464 void InnerLoopVectorizer::fixVectorizedLoop() {
3465   // Insert truncates and extends for any truncated instructions as hints to
3466   // InstCombine.
3467   if (VF > 1)
3468     truncateToMinimalBitwidths();
3469 
3470   // Fix widened non-induction PHIs by setting up the PHI operands.
3471   if (OrigPHIsToFix.size()) {
3472     assert(EnableVPlanNativePath &&
3473            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3474     fixNonInductionPHIs();
3475   }
3476 
3477   // At this point every instruction in the original loop is widened to a
3478   // vector form. Now we need to fix the recurrences in the loop. These PHI
3479   // nodes are currently empty because we did not want to introduce cycles.
3480   // This is the second stage of vectorizing recurrences.
3481   fixCrossIterationPHIs();
3482 
3483   // Forget the original basic block.
3484   PSE.getSE()->forgetLoop(OrigLoop);
3485 
3486   // Fix-up external users of the induction variables.
3487   for (auto &Entry : Legal->getInductionVars())
3488     fixupIVUsers(Entry.first, Entry.second,
3489                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3490                  IVEndValues[Entry.first], LoopMiddleBlock);
3491 
3492   fixLCSSAPHIs();
3493   for (Instruction *PI : PredicatedInstructions)
3494     sinkScalarOperands(&*PI);
3495 
3496   // Remove redundant induction instructions.
3497   cse(LoopVectorBody);
3498 
3499   // Set/update profile weights for the vector and remainder loops as original
3500   // loop iterations are now distributed among them. Note that original loop
3501   // represented by LoopScalarBody becomes remainder loop after vectorization.
3502   //
3503   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3504   // end up with a slightly less accurate result, but that should be OK since
3505   // profile is not inherently precise anyway. Note also possible bypass of
3506   // vector code caused by legality checks is ignored, assigning all the weight
3507   // to the vector loop, optimistically.
3508   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3509                                LI->getLoopFor(LoopVectorBody),
3510                                LI->getLoopFor(LoopScalarBody), VF * UF);
3511 }
3512 
3513 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3514   // In order to support recurrences we need to be able to vectorize Phi nodes.
3515   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3516   // stage #2: We now need to fix the recurrences by adding incoming edges to
3517   // the currently empty PHI nodes. At this point every instruction in the
3518   // original loop is widened to a vector form so we can use them to construct
3519   // the incoming edges.
3520   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3521     // Handle first-order recurrences and reductions that need to be fixed.
3522     if (Legal->isFirstOrderRecurrence(&Phi))
3523       fixFirstOrderRecurrence(&Phi);
3524     else if (Legal->isReductionVariable(&Phi))
3525       fixReduction(&Phi);
3526   }
3527 }
3528 
3529 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3530   // This is the second phase of vectorizing first-order recurrences. An
3531   // overview of the transformation is described below. Suppose we have the
3532   // following loop.
3533   //
3534   //   for (int i = 0; i < n; ++i)
3535   //     b[i] = a[i] - a[i - 1];
3536   //
3537   // There is a first-order recurrence on "a". For this loop, the shorthand
3538   // scalar IR looks like:
3539   //
3540   //   scalar.ph:
3541   //     s_init = a[-1]
3542   //     br scalar.body
3543   //
3544   //   scalar.body:
3545   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3546   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3547   //     s2 = a[i]
3548   //     b[i] = s2 - s1
3549   //     br cond, scalar.body, ...
3550   //
3551   // In this example, s1 is a recurrence because its value depends on the
3552   // previous iteration. In the first phase of vectorization, we created a
3553   // temporary value for s1. We now complete the vectorization and produce the
3554   // shorthand vector IR shown below (for VF = 4, UF = 1).
3555   //
3556   //   vector.ph:
3557   //     v_init = vector(..., ..., ..., a[-1])
3558   //     br vector.body
3559   //
3560   //   vector.body
3561   //     i = phi [0, vector.ph], [i+4, vector.body]
3562   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3563   //     v2 = a[i, i+1, i+2, i+3];
3564   //     v3 = vector(v1(3), v2(0, 1, 2))
3565   //     b[i, i+1, i+2, i+3] = v2 - v3
3566   //     br cond, vector.body, middle.block
3567   //
3568   //   middle.block:
3569   //     x = v2(3)
3570   //     br scalar.ph
3571   //
3572   //   scalar.ph:
3573   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3574   //     br scalar.body
3575   //
3576   // After execution completes the vector loop, we extract the next value of
3577   // the recurrence (x) to use as the initial value in the scalar loop.
3578 
3579   // Get the original loop preheader and single loop latch.
3580   auto *Preheader = OrigLoop->getLoopPreheader();
3581   auto *Latch = OrigLoop->getLoopLatch();
3582 
3583   // Get the initial and previous values of the scalar recurrence.
3584   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3585   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3586 
3587   // Create a vector from the initial value.
3588   auto *VectorInit = ScalarInit;
3589   if (VF > 1) {
3590     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3591     VectorInit = Builder.CreateInsertElement(
3592         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3593         Builder.getInt32(VF - 1), "vector.recur.init");
3594   }
3595 
3596   // We constructed a temporary phi node in the first phase of vectorization.
3597   // This phi node will eventually be deleted.
3598   Builder.SetInsertPoint(
3599       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3600 
3601   // Create a phi node for the new recurrence. The current value will either be
3602   // the initial value inserted into a vector or loop-varying vector value.
3603   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3604   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3605 
3606   // Get the vectorized previous value of the last part UF - 1. It appears last
3607   // among all unrolled iterations, due to the order of their construction.
3608   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3609 
3610   // Find and set the insertion point after the previous value if it is an
3611   // instruction.
3612   BasicBlock::iterator InsertPt;
3613   // Note that the previous value may have been constant-folded so it is not
3614   // guaranteed to be an instruction in the vector loop.
3615   // FIXME: Loop invariant values do not form recurrences. We should deal with
3616   //        them earlier.
3617   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3618     InsertPt = LoopVectorBody->getFirstInsertionPt();
3619   else {
3620     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3621     if (isa<PHINode>(PreviousLastPart))
3622       // If the previous value is a phi node, we should insert after all the phi
3623       // nodes in the block containing the PHI to avoid breaking basic block
3624       // verification. Note that the basic block may be different from
3625       // LoopVectorBody, in case we predicate the loop.
3626       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3627     else
3628       InsertPt = ++PreviousInst->getIterator();
3629   }
3630   Builder.SetInsertPoint(&*InsertPt);
3631 
3632   // We will construct a vector for the recurrence by combining the values for
3633   // the current and previous iterations. This is the required shuffle mask.
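  // For example, with VF = 4 the mask is <3, 4, 5, 6>: lane 3 of the incoming
  // vector followed by lanes 0, 1 and 2 of the previous value's vector.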
3634   SmallVector<Constant *, 8> ShuffleMask(VF);
3635   ShuffleMask[0] = Builder.getInt32(VF - 1);
3636   for (unsigned I = 1; I < VF; ++I)
3637     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3638 
3639   // The vector from which to take the initial value for the current iteration
3640   // (actual or unrolled). Initially, this is the vector phi node.
3641   Value *Incoming = VecPhi;
3642 
3643   // Shuffle the current and previous vector and update the vector parts.
3644   for (unsigned Part = 0; Part < UF; ++Part) {
3645     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3646     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3647     auto *Shuffle =
3648         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3649                                              ConstantVector::get(ShuffleMask))
3650                : Incoming;
3651     PhiPart->replaceAllUsesWith(Shuffle);
3652     cast<Instruction>(PhiPart)->eraseFromParent();
3653     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3654     Incoming = PreviousPart;
3655   }
3656 
3657   // Fix the latch value of the new recurrence in the vector loop.
3658   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3659 
3660   // Extract the last vector element in the middle block. This will be the
3661   // initial value for the recurrence when jumping to the scalar loop.
3662   auto *ExtractForScalar = Incoming;
3663   if (VF > 1) {
3664     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3665     ExtractForScalar = Builder.CreateExtractElement(
3666         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3667   }
3668   // Extract the second last element in the middle block if the
3669   // Phi is used outside the loop. We need to extract the phi itself
3670   // and not the last element (the phi update in the current iteration). This
3671   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3672   // when the scalar loop is not run at all.
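  // For VF = 4 this extracts lane 2 (the second-to-last element) of `Incoming`.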
3673   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3674   if (VF > 1)
3675     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3676         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without being vectorized, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled part just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second-to-last element when VF > 1.
3681   else if (UF > 1)
3682     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3683 
3684   // Fix the initial value of the original recurrence in the scalar loop.
3685   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3686   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3687   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3688     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3689     Start->addIncoming(Incoming, BB);
3690   }
3691 
3692   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3693   Phi->setName("scalar.recur");
3694 
3695   // Finally, fix users of the recurrence outside the loop. The users will need
3696   // either the last value of the scalar recurrence or the last value of the
3697   // vector recurrence we extracted in the middle block. Since the loop is in
3698   // LCSSA form, we just need to find all the phi nodes for the original scalar
3699   // recurrence in the exit block, and then add an edge for the middle block.
3700   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3701     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3702       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3703     }
3704   }
3705 }
3706 
3707 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3708   Constant *Zero = Builder.getInt32(0);
3709 
  // Get its reduction variable descriptor.
3711   assert(Legal->isReductionVariable(Phi) &&
3712          "Unable to find the reduction variable");
3713   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3714 
3715   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3716   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3717   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3718   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3719     RdxDesc.getMinMaxRecurrenceKind();
3720   setDebugLocFromInst(Builder, ReductionStartValue);
3721 
  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we generate the 'identity' vector and overwrite one of its
  // elements with the incoming scalar reduction start value. This must be
  // done in the vector-loop preheader.
3726   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3727 
3728   // This is the vector-clone of the value that leaves the loop.
3729   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3730 
  // Find the reduction identity value: zero for addition, or and xor; one for
  // multiplication; -1 (all ones) for and.
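  // For example, an integer add reduction with VF = 4 and start value %init
  // uses Identity = <0, 0, 0, 0> and VectorStart = <%init, 0, 0, 0>
  // (shorthand).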
3733   Value *Identity;
3734   Value *VectorStart;
3735   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3736       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // Min/max reductions have the start value as their identity.
3738     if (VF == 1) {
3739       VectorStart = Identity = ReductionStartValue;
3740     } else {
3741       VectorStart = Identity =
3742         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3743     }
3744   } else {
3745     // Handle other reduction kinds:
3746     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3747         RK, VecTy->getScalarType());
3748     if (VF == 1) {
3749       Identity = Iden;
3750       // This vector is the Identity vector where the first element is the
3751       // incoming scalar reduction.
3752       VectorStart = ReductionStartValue;
3753     } else {
3754       Identity = ConstantVector::getSplat({VF, false}, Iden);
3755 
3756       // This vector is the Identity vector where the first element is the
3757       // incoming scalar reduction.
3758       VectorStart =
3759         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3760     }
3761   }
3762 
3763   // Wrap flags are in general invalid after vectorization, clear them.
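  // E.g., an 'add nsw' feeding the reduction may wrap once lanes from several
  // iterations are combined, even if the original scalar sum never wrapped.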
3764   clearReductionWrapFlags(RdxDesc);
3765 
3766   // Fix the vector-loop phi.
3767 
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
3770   BasicBlock *Latch = OrigLoop->getLoopLatch();
3771   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3772 
3773   for (unsigned Part = 0; Part < UF; ++Part) {
3774     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3775     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3776     // Make sure to add the reduction start value only to the
3777     // first unroll part.
3778     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3779     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3780     cast<PHINode>(VecRdxPhi)
3781       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3782   }
3783 
  // Move the insertion point to right after the PHIs at the start of the
  // middle block so that we can emit both new PHINodes and the
  // extractelement instructions there.
3788   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3789 
3790   setDebugLocFromInst(Builder, LoopExitInst);
3791 
3792   // If tail is folded by masking, the vector value to leave the loop should be
3793   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3794   // instead of the former.
3795   if (Cost->foldTailByMasking()) {
3796     for (unsigned Part = 0; Part < UF; ++Part) {
3797       Value *VecLoopExitInst =
3798           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3799       Value *Sel = nullptr;
3800       for (User *U : VecLoopExitInst->users()) {
3801         if (isa<SelectInst>(U)) {
3802           assert(!Sel && "Reduction exit feeding two selects");
3803           Sel = U;
3804         } else
3805           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3806       }
3807       assert(Sel && "Reduction exit feeds no select");
3808       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3809     }
3810   }
3811 
3812   // If the vector reduction can be performed in a smaller type, we truncate
3813   // then extend the loop exit value to enable InstCombine to evaluate the
3814   // entire expression in the smaller type.
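  // For example, if an i32 add reduction is known to fit in i8, each unrolled
  // part is truncated to <VF x i8> and immediately re-extended to <VF x i32>;
  // InstCombine can then shrink the whole reduction chain to i8.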
3815   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3816     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3817     Builder.SetInsertPoint(
3818         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3819     VectorParts RdxParts(UF);
3820     for (unsigned Part = 0; Part < UF; ++Part) {
3821       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3822       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3823       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3824                                         : Builder.CreateZExt(Trunc, VecTy);
3825       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3826            UI != RdxParts[Part]->user_end();)
3827         if (*UI != Trunc) {
3828           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3829           RdxParts[Part] = Extnd;
3830         } else {
3831           ++UI;
3832         }
3833     }
3834     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3835     for (unsigned Part = 0; Part < UF; ++Part) {
3836       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3837       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3838     }
3839   }
3840 
3841   // Reduce all of the unrolled parts into a single vector.
3842   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3843   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3844 
3845   // The middle block terminator has already been assigned a DebugLoc here (the
3846   // OrigLoop's single latch terminator). We want the whole middle block to
3847   // appear to execute on this line because: (a) it is all compiler generated,
3848   // (b) these instructions are always executed after evaluating the latch
3849   // conditional branch, and (c) other passes may add new predecessors which
3850   // terminate on this line. This is the easiest way to ensure we don't
3851   // accidentally cause an extra step back into the loop while debugging.
3852   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
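  // For example, with VF = 4 and UF = 2 for an integer add reduction, the loop
  // below emits a single combining operation (shorthand):
  //   %bin.rdx = add <4 x i32> %rdx.part.1, %rdx.part.0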
3853   for (unsigned Part = 1; Part < UF; ++Part) {
3854     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3855     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3856       // Floating point operations had to be 'fast' to enable the reduction.
3857       ReducedPartRdx = addFastMathFlag(
3858           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3859                               ReducedPartRdx, "bin.rdx"),
3860           RdxDesc.getFastMathFlags());
3861     else
3862       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3863                                       RdxPart);
3864   }
3865 
3866   if (VF > 1) {
3867     bool NoNaN = Legal->hasFunNoNaNAttr();
3868     ReducedPartRdx =
3869         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
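    // E.g., for VF = 4 this reduces the four lanes to a single scalar, either
    // with a target reduction intrinsic or a log2(VF) tree of shuffles and
    // binary operations, depending on the target.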
3870     // If the reduction can be performed in a smaller type, we need to extend
3871     // the reduction to the wider type before we branch to the original loop.
3872     if (Phi->getType() != RdxDesc.getRecurrenceType())
3873       ReducedPartRdx =
3874         RdxDesc.isSigned()
3875         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3876         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3877   }
3878 
3879   // Create a phi node that merges control-flow from the backedge-taken check
3880   // block and the middle block.
3881   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3882                                         LoopScalarPreHeader->getTerminator());
3883   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3884     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3885   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3886 
3887   // Now, we need to fix the users of the reduction variable
3888   // inside and outside of the scalar remainder loop.
3889   // We know that the loop is in LCSSA form. We need to update the
3890   // PHI nodes in the exit blocks.
3891   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3892     // All PHINodes need to have a single entry edge, or two if
3893     // we already fixed them.
3894     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3895 
3896     // We found a reduction value exit-PHI. Update it with the
3897     // incoming bypass edge.
3898     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3899       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3900   } // end of the LCSSA phi scan.
3901 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3904   int IncomingEdgeBlockIdx =
3905     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3906   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3907   // Pick the other block.
3908   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3909   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3910   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3911 }
3912 
3913 void InnerLoopVectorizer::clearReductionWrapFlags(
3914     RecurrenceDescriptor &RdxDesc) {
3915   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3916   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3917       RK != RecurrenceDescriptor::RK_IntegerMult)
3918     return;
3919 
3920   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3921   assert(LoopExitInstr && "null loop exit instruction");
3922   SmallVector<Instruction *, 8> Worklist;
3923   SmallPtrSet<Instruction *, 8> Visited;
3924   Worklist.push_back(LoopExitInstr);
3925   Visited.insert(LoopExitInstr);
3926 
3927   while (!Worklist.empty()) {
3928     Instruction *Cur = Worklist.pop_back_val();
3929     if (isa<OverflowingBinaryOperator>(Cur))
3930       for (unsigned Part = 0; Part < UF; ++Part) {
3931         Value *V = getOrCreateVectorValue(Cur, Part);
3932         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3933       }
3934 
3935     for (User *U : Cur->users()) {
3936       Instruction *UI = cast<Instruction>(U);
3937       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3938           Visited.insert(UI).second)
3939         Worklist.push_back(UI);
3940     }
3941   }
3942 }
3943 
3944 void InnerLoopVectorizer::fixLCSSAPHIs() {
3945   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3946     if (LCSSAPhi.getNumIncomingValues() == 1) {
3947       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values have a single copy, so only lane
      // zero is needed.
      unsigned LastLane = 0;
      if (isa<Instruction>(IncomingValue))
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF - 1;
3955       // Can be a loop invariant incoming value or the last scalar value to be
3956       // extracted from the vectorized loop.
3957       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3958       Value *lastIncomingValue =
3959           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3960       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3961     }
3962   }
3963 }
3964 
3965 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3966   // The basic block and loop containing the predicated instruction.
3967   auto *PredBB = PredInst->getParent();
3968   auto *VectorLoop = LI->getLoopFor(PredBB);
3969 
3970   // Initialize a worklist with the operands of the predicated instruction.
3971   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3972 
3973   // Holds instructions that we need to analyze again. An instruction may be
3974   // reanalyzed if we don't yet know if we can sink it or not.
3975   SmallVector<Instruction *, 8> InstsToReanalyze;
3976 
3977   // Returns true if a given use occurs in the predicated block. Phi nodes use
3978   // their operands in their corresponding predecessor blocks.
3979   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3980     auto *I = cast<Instruction>(U.getUser());
3981     BasicBlock *BB = I->getParent();
3982     if (auto *Phi = dyn_cast<PHINode>(I))
3983       BB = Phi->getIncomingBlock(
3984           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3985     return BB == PredBB;
3986   };
3987 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are added to the worklist. The algorithm ends when a full pass
  // over the worklist sinks no instructions.
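  // For example, the scalarized address computation of a predicated store (a
  // getelementptr and the cast feeding its index) can be sunk once all of its
  // uses are in the predicated block; sinking the getelementptr may then allow
  // the cast to be sunk in a later pass over the worklist.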
3992   bool Changed;
3993   do {
3994     // Add the instructions that need to be reanalyzed to the worklist, and
3995     // reset the changed indicator.
3996     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3997     InstsToReanalyze.clear();
3998     Changed = false;
3999 
4000     while (!Worklist.empty()) {
4001       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4002 
4003       // We can't sink an instruction if it is a phi node, is already in the
4004       // predicated block, is not in the loop, or may have side effects.
4005       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4006           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4007         continue;
4008 
4009       // It's legal to sink the instruction if all its uses occur in the
4010       // predicated block. Otherwise, there's nothing to do yet, and we may
4011       // need to reanalyze the instruction.
4012       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4013         InstsToReanalyze.push_back(I);
4014         continue;
4015       }
4016 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4019       I->moveBefore(&*PredBB->getFirstInsertionPt());
4020       Worklist.insert(I->op_begin(), I->op_end());
4021 
4022       // The sinking may have enabled other instructions to be sunk, so we will
4023       // need to iterate.
4024       Changed = true;
4025     }
4026   } while (Changed);
4027 }
4028 
4029 void InnerLoopVectorizer::fixNonInductionPHIs() {
4030   for (PHINode *OrigPhi : OrigPHIsToFix) {
4031     PHINode *NewPhi =
4032         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4033     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4034 
4035     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4036         predecessors(OrigPhi->getParent()));
4037     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4038         predecessors(NewPhi->getParent()));
4039     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4040            "Scalar and Vector BB should have the same number of predecessors");
4041 
4042     // The insertion point in Builder may be invalidated by the time we get
4043     // here. Force the Builder insertion point to something valid so that we do
4044     // not run into issues during insertion point restore in
4045     // getOrCreateVectorValue calls below.
4046     Builder.SetInsertPoint(NewPhi);
4047 
4048     // The predecessor order is preserved and we can rely on mapping between
4049     // scalar and vector block predecessors.
4050     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4051       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4052 
4053       // When looking up the new scalar/vector values to fix up, use incoming
4054       // values from original phi.
4055       Value *ScIncV =
4056           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4057 
4058       // Scalar incoming value may need a broadcast
4059       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4060       NewPhi->addIncoming(NewIncV, NewPredBB);
4061     }
4062   }
4063 }
4064 
4065 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4066                                    unsigned VF, bool IsPtrLoopInvariant,
4067                                    SmallBitVector &IsIndexLoopInvariant) {
4068   // Construct a vector GEP by widening the operands of the scalar GEP as
4069   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4070   // results in a vector of pointers when at least one operand of the GEP
4071   // is vector-typed. Thus, to keep the representation compact, we only use
4072   // vector-typed operands for loop-varying values.
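  //
  // For example (shorthand, VF = 4): widening 'gep i32, i32* %base, i64 %i'
  // with a loop-varying index %i yields 'gep i32, i32* %base, <4 x i64> %vi',
  // which produces a <4 x i32*> vector of pointers.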
4073 
4074   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4075     // If we are vectorizing, but the GEP has only loop-invariant operands,
4076     // the GEP we build (by only using vector-typed operands for
4077     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4078     // produce a vector of pointers, we need to either arbitrarily pick an
4079     // operand to broadcast, or broadcast a clone of the original GEP.
4080     // Here, we broadcast a clone of the original.
4081     //
4082     // TODO: If at some point we decide to scalarize instructions having
4083     //       loop-invariant operands, this special case will no longer be
4084     //       required. We would add the scalarization decision to
4085     //       collectLoopScalars() and teach getVectorValue() to broadcast
4086     //       the lane-zero scalar value.
4087     auto *Clone = Builder.Insert(GEP->clone());
4088     for (unsigned Part = 0; Part < UF; ++Part) {
4089       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4090       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4091       addMetadata(EntryPart, GEP);
4092     }
4093   } else {
4094     // If the GEP has at least one loop-varying operand, we are sure to
4095     // produce a vector of pointers. But if we are only unrolling, we want
4096     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4097     // produce with the code below will be scalar (if VF == 1) or vector
4098     // (otherwise). Note that for the unroll-only case, we still maintain
4099     // values in the vector mapping with initVector, as we do for other
4100     // instructions.
4101     for (unsigned Part = 0; Part < UF; ++Part) {
4102       // The pointer operand of the new GEP. If it's loop-invariant, we
4103       // won't broadcast it.
4104       auto *Ptr = IsPtrLoopInvariant
4105                       ? GEP->getPointerOperand()
4106                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4107 
4108       // Collect all the indices for the new GEP. If any index is
4109       // loop-invariant, we won't broadcast it.
4110       SmallVector<Value *, 4> Indices;
4111       for (auto Index : enumerate(GEP->indices())) {
4112         Value *User = Index.value().get();
4113         if (IsIndexLoopInvariant[Index.index()])
4114           Indices.push_back(User);
4115         else
4116           Indices.push_back(getOrCreateVectorValue(User, Part));
4117       }
4118 
4119       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4120       // but it should be a vector, otherwise.
4121       auto *NewGEP =
4122           GEP->isInBounds()
4123               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4124                                           Indices)
4125               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4126       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4127              "NewGEP is not a pointer vector");
4128       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4129       addMetadata(NewGEP, GEP);
4130     }
4131   }
4132 }
4133 
4134 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4135                                               unsigned VF) {
4136   PHINode *P = cast<PHINode>(PN);
4137   if (EnableVPlanNativePath) {
4138     // Currently we enter here in the VPlan-native path for non-induction
4139     // PHIs where all control flow is uniform. We simply widen these PHIs.
4140     // Create a vector phi with no operands - the vector phi operands will be
4141     // set at the end of vector code generation.
4142     Type *VecTy =
4143         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4144     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4145     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4146     OrigPHIsToFix.push_back(P);
4147 
4148     return;
4149   }
4150 
4151   assert(PN->getParent() == OrigLoop->getHeader() &&
4152          "Non-header phis should have been handled elsewhere");
4153 
4154   // In order to support recurrences we need to be able to vectorize Phi nodes.
4155   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4156   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4157   // this value when we vectorize all of the instructions that use the PHI.
4158   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4159     for (unsigned Part = 0; Part < UF; ++Part) {
4160       // This is phase one of vectorizing PHIs.
4161       Type *VecTy =
4162           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4163       Value *EntryPart = PHINode::Create(
4164           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4165       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4166     }
4167     return;
4168   }
4169 
4170   setDebugLocFromInst(Builder, P);
4171 
4172   // This PHINode must be an induction variable.
4173   // Make sure that we know about it.
4174   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4175 
4176   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4177   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4178 
4179   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4180   // which can be found from the original scalar operations.
4181   switch (II.getKind()) {
4182   case InductionDescriptor::IK_NoInduction:
4183     llvm_unreachable("Unknown induction");
4184   case InductionDescriptor::IK_IntInduction:
4185   case InductionDescriptor::IK_FpInduction:
4186     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4187   case InductionDescriptor::IK_PtrInduction: {
4188     // Handle the pointer induction variable case.
4189     assert(P->getType()->isPointerTy() && "Unexpected type.");
4190     // This is the normalized GEP that starts counting at zero.
4191     Value *PtrInd = Induction;
4192     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4193     // Determine the number of scalars we need to generate for each unroll
4194     // iteration. If the instruction is uniform, we only need to generate the
4195     // first lane. Otherwise, we generate all VF values.
4196     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4197     // These are the scalar results. Notice that we don't generate vector GEPs
4198     // because scalar GEPs result in better code.
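    // For example, with VF = 4 and UF = 1, a non-uniform pointer induction
    // yields four scalar 'next.gep' values, one per lane, for offsets
    // i, i + 1, i + 2 and i + 3.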
4199     for (unsigned Part = 0; Part < UF; ++Part) {
4200       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4201         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4202         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4203         Value *SclrGep =
4204             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4205         SclrGep->setName("next.gep");
4206         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4207       }
4208     }
4209     return;
4210   }
4211   }
4212 }
4213 
4214 /// A helper function for checking whether an integer division-related
4215 /// instruction may divide by zero (in which case it must be predicated if
4216 /// executed conditionally in the scalar code).
4217 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4219 /// converted into multiplication, so we will still end up scalarizing
4220 /// the division, but can do so w/o predication.
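/// For example, 'udiv i32 %x, %y' with a non-constant %y may divide by zero,
/// while 'udiv i32 %x, 7' cannot.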
4221 static bool mayDivideByZero(Instruction &I) {
4222   assert((I.getOpcode() == Instruction::UDiv ||
4223           I.getOpcode() == Instruction::SDiv ||
4224           I.getOpcode() == Instruction::URem ||
4225           I.getOpcode() == Instruction::SRem) &&
4226          "Unexpected instruction");
4227   Value *Divisor = I.getOperand(1);
4228   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4229   return !CInt || CInt->isZero();
4230 }
4231 
4232 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4233   switch (I.getOpcode()) {
4234   case Instruction::Call:
4235   case Instruction::Br:
4236   case Instruction::PHI:
4237   case Instruction::GetElementPtr:
4238   case Instruction::Select:
4239     llvm_unreachable("This instruction is handled by a different recipe.");
4240   case Instruction::UDiv:
4241   case Instruction::SDiv:
4242   case Instruction::SRem:
4243   case Instruction::URem:
4244   case Instruction::Add:
4245   case Instruction::FAdd:
4246   case Instruction::Sub:
4247   case Instruction::FSub:
4248   case Instruction::FNeg:
4249   case Instruction::Mul:
4250   case Instruction::FMul:
4251   case Instruction::FDiv:
4252   case Instruction::FRem:
4253   case Instruction::Shl:
4254   case Instruction::LShr:
4255   case Instruction::AShr:
4256   case Instruction::And:
4257   case Instruction::Or:
4258   case Instruction::Xor: {
4259     // Just widen unops and binops.
4260     setDebugLocFromInst(Builder, &I);
4261 
4262     for (unsigned Part = 0; Part < UF; ++Part) {
4263       SmallVector<Value *, 2> Ops;
4264       for (Value *Op : I.operands())
4265         Ops.push_back(getOrCreateVectorValue(Op, Part));
4266 
4267       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4268 
4269       if (auto *VecOp = dyn_cast<Instruction>(V))
4270         VecOp->copyIRFlags(&I);
4271 
4272       // Use this vector value for all users of the original instruction.
4273       VectorLoopValueMap.setVectorValue(&I, Part, V);
4274       addMetadata(V, &I);
4275     }
4276 
4277     break;
4278   }
4279   case Instruction::ICmp:
4280   case Instruction::FCmp: {
4281     // Widen compares. Generate vector compares.
4282     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4283     auto *Cmp = cast<CmpInst>(&I);
4284     setDebugLocFromInst(Builder, Cmp);
4285     for (unsigned Part = 0; Part < UF; ++Part) {
4286       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4287       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4288       Value *C = nullptr;
4289       if (FCmp) {
4290         // Propagate fast math flags.
4291         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4292         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4293         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4294       } else {
4295         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4296       }
4297       VectorLoopValueMap.setVectorValue(&I, Part, C);
4298       addMetadata(C, &I);
4299     }
4300 
4301     break;
4302   }
4303 
4304   case Instruction::ZExt:
4305   case Instruction::SExt:
4306   case Instruction::FPToUI:
4307   case Instruction::FPToSI:
4308   case Instruction::FPExt:
4309   case Instruction::PtrToInt:
4310   case Instruction::IntToPtr:
4311   case Instruction::SIToFP:
4312   case Instruction::UIToFP:
4313   case Instruction::Trunc:
4314   case Instruction::FPTrunc:
4315   case Instruction::BitCast: {
4316     auto *CI = cast<CastInst>(&I);
4317     setDebugLocFromInst(Builder, CI);
4318 
4319     /// Vectorize casts.
4320     Type *DestTy =
4321         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4322 
4323     for (unsigned Part = 0; Part < UF; ++Part) {
4324       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4325       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4326       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4327       addMetadata(Cast, &I);
4328     }
4329     break;
4330   }
4331   default:
4332     // This instruction is not vectorized by simple widening.
4333     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4334     llvm_unreachable("Unhandled instruction!");
4335   } // end of switch.
4336 }
4337 
4338 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4339                                                VPTransformState &State) {
4340   assert(!isa<DbgInfoIntrinsic>(I) &&
4341          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4342   setDebugLocFromInst(Builder, &I);
4343 
4344   Module *M = I.getParent()->getParent()->getParent();
4345   auto *CI = cast<CallInst>(&I);
4346 
4347   SmallVector<Type *, 4> Tys;
4348   for (Value *ArgOperand : CI->arg_operands())
4349     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4350 
4351   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4352 
  // Decide whether to use a vector intrinsic or a vector library call for the
  // widened instruction, depending on which of the two is cheaper.
4356   bool NeedToScalarize = false;
4357   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4358   bool UseVectorIntrinsic =
4359       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4360   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4361          "Instruction should be scalarized elsewhere.");
4362 
4363   for (unsigned Part = 0; Part < UF; ++Part) {
4364     SmallVector<Value *, 4> Args;
4365     for (auto &I : enumerate(ArgOperands.operands())) {
4366       // Some intrinsics have a scalar argument - don't replace it with a
4367       // vector.
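      // (E.g., the i32 exponent operand of @llvm.powi must remain scalar.)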
4368       Value *Arg;
4369       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4370         Arg = State.get(I.value(), Part);
4371       else
4372         Arg = State.get(I.value(), {0, 0});
4373       Args.push_back(Arg);
4374     }
4375 
4376     Function *VectorF;
4377     if (UseVectorIntrinsic) {
4378       // Use vector version of the intrinsic.
4379       Type *TysForDecl[] = {CI->getType()};
4380       if (VF > 1)
4381         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4382       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4383     } else {
4384       // Use vector version of the function call.
4385       const VFShape Shape =
4386           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4387 #ifndef NDEBUG
      const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
      assert(std::find_if(Infos.begin(), Infos.end(),
                          [&Shape](const VFInfo &Info) {
                            return Info.Shape == Shape;
                          }) != Infos.end() &&
             "Vector function shape is missing from the database.");
#endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    assert(VectorF && "Can't create vector function.");

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
4408   }
4409 }
4410 
4411 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4412                                                  bool InvariantCond) {
4413   setDebugLocFromInst(Builder, &I);
4414 
  // The condition can be loop invariant but still defined inside the
4416   // loop. This means that we can't just use the original 'cond' value.
4417   // We have to take the 'vectorized' value and pick the first lane.
4418   // Instcombine will make this a no-op.
4419 
4420   auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4421 
4422   for (unsigned Part = 0; Part < UF; ++Part) {
4423     Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4424     Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4425     Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4426     Value *Sel =
4427         Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4428     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4429     addMetadata(Sel, &I);
4430   }
4431 }
4432 
4433 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4434   // We should not collect Scalars more than once per VF. Right now, this
4435   // function is called from collectUniformsAndScalars(), which already does
4436   // this check. Collecting Scalars for VF=1 does not make any sense.
4437   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4438          "This function should not be visited twice for the same VF");
4439 
4440   SmallSetVector<Instruction *, 8> Worklist;
4441 
4442   // These sets are used to seed the analysis with pointers used by memory
4443   // accesses that will remain scalar.
4444   SmallSetVector<Instruction *, 8> ScalarPtrs;
4445   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4446 
4447   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4448   // The pointer operands of loads and stores will be scalar as long as the
4449   // memory access is not a gather or scatter operation. The value operand of a
4450   // store will remain scalar if the store is scalarized.
4451   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4452     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4453     assert(WideningDecision != CM_Unknown &&
4454            "Widening decision should be ready at this moment");
4455     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4456       if (Ptr == Store->getValueOperand())
4457         return WideningDecision == CM_Scalarize;
4458     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4460     return WideningDecision != CM_GatherScatter;
4461   };
4462 
  // A helper that returns true if the given value is a loop-varying bitcast
  // or getelementptr instruction, i.e., one contained in the loop and not
  // loop invariant.
4465   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4466     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4467             isa<GetElementPtrInst>(V)) &&
4468            !TheLoop->isLoopInvariant(V);
4469   };
4470 
4471   // A helper that evaluates a memory access's use of a pointer. If the use
4472   // will be a scalar use, and the pointer is only used by memory accesses, we
4473   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4474   // PossibleNonScalarPtrs.
4475   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4476     // We only care about bitcast and getelementptr instructions contained in
4477     // the loop.
4478     if (!isLoopVaryingBitCastOrGEP(Ptr))
4479       return;
4480 
4481     // If the pointer has already been identified as scalar (e.g., if it was
4482     // also identified as uniform), there's nothing to do.
4483     auto *I = cast<Instruction>(Ptr);
4484     if (Worklist.count(I))
4485       return;
4486 
4487     // If the use of the pointer will be a scalar use, and all users of the
4488     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4489     // place the pointer in PossibleNonScalarPtrs.
4490     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4491           return isa<LoadInst>(U) || isa<StoreInst>(U);
4492         }))
4493       ScalarPtrs.insert(I);
4494     else
4495       PossibleNonScalarPtrs.insert(I);
4496   };
4497 
4498   // We seed the scalars analysis with three classes of instructions: (1)
4499   // instructions marked uniform-after-vectorization, (2) bitcast and
4500   // getelementptr instructions used by memory accesses requiring a scalar use,
4501   // and (3) pointer induction variables and their update instructions (we
4502   // currently only scalarize these).
4503   //
4504   // (1) Add to the worklist all instructions that have been identified as
4505   // uniform-after-vectorization.
4506   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4507 
4508   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4509   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4511   // scatter operation. The value operand of a store will remain scalar if the
4512   // store is scalarized.
4513   for (auto *BB : TheLoop->blocks())
4514     for (auto &I : *BB) {
4515       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4516         evaluatePtrUse(Load, Load->getPointerOperand());
4517       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4518         evaluatePtrUse(Store, Store->getPointerOperand());
4519         evaluatePtrUse(Store, Store->getValueOperand());
4520       }
4521     }
4522   for (auto *I : ScalarPtrs)
4523     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4524       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4525       Worklist.insert(I);
4526     }
4527 
4528   // (3) Add to the worklist all pointer induction variables and their update
4529   // instructions.
4530   //
4531   // TODO: Once we are able to vectorize pointer induction variables we should
4532   //       no longer insert them into the worklist here.
4533   auto *Latch = TheLoop->getLoopLatch();
4534   for (auto &Induction : Legal->getInductionVars()) {
4535     auto *Ind = Induction.first;
4536     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4537     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4538       continue;
4539     Worklist.insert(Ind);
4540     Worklist.insert(IndUpdate);
4541     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4542     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4543                       << "\n");
4544   }
4545 
4546   // Insert the forced scalars.
4547   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4548   // induction variable when the PHI user is scalarized.
4549   auto ForcedScalar = ForcedScalars.find(VF);
4550   if (ForcedScalar != ForcedScalars.end())
4551     for (auto *I : ForcedScalar->second)
4552       Worklist.insert(I);
4553 
4554   // Expand the worklist by looking through any bitcasts and getelementptr
4555   // instructions we've already identified as scalar. This is similar to the
4556   // expansion step in collectLoopUniforms(); however, here we're only
4557   // expanding to include additional bitcasts and getelementptr instructions.
4558   unsigned Idx = 0;
4559   while (Idx != Worklist.size()) {
4560     Instruction *Dst = Worklist[Idx++];
4561     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4562       continue;
4563     auto *Src = cast<Instruction>(Dst->getOperand(0));
4564     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4565           auto *J = cast<Instruction>(U);
4566           return !TheLoop->contains(J) || Worklist.count(J) ||
4567                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4568                   isScalarUse(J, Src));
4569         })) {
4570       Worklist.insert(Src);
4571       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4572     }
4573   }
4574 
4575   // An induction variable will remain scalar if all users of the induction
4576   // variable and induction variable update remain scalar.
4577   for (auto &Induction : Legal->getInductionVars()) {
4578     auto *Ind = Induction.first;
4579     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4580 
4581     // We already considered pointer induction variables, so there's no reason
4582     // to look at their users again.
4583     //
4584     // TODO: Once we are able to vectorize pointer induction variables we
4585     //       should no longer skip over them here.
4586     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4587       continue;
4588 
4589     // Determine if all users of the induction variable are scalar after
4590     // vectorization.
4591     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4592       auto *I = cast<Instruction>(U);
4593       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4594     });
4595     if (!ScalarInd)
4596       continue;
4597 
4598     // Determine if all users of the induction variable update instruction are
4599     // scalar after vectorization.
4600     auto ScalarIndUpdate =
4601         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4602           auto *I = cast<Instruction>(U);
4603           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4604         });
4605     if (!ScalarIndUpdate)
4606       continue;
4607 
4608     // The induction variable and its update instruction will remain scalar.
4609     Worklist.insert(Ind);
4610     Worklist.insert(IndUpdate);
4611     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4612     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4613                       << "\n");
4614   }
4615 
4616   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4617 }
4618 
4619 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4620   if (!blockNeedsPredication(I->getParent()))
4621     return false;
4622   switch(I->getOpcode()) {
4623   default:
4624     break;
4625   case Instruction::Load:
4626   case Instruction::Store: {
4627     if (!Legal->isMaskRequired(I))
4628       return false;
4629     auto *Ptr = getLoadStorePointerOperand(I);
4630     auto *Ty = getMemInstValueType(I);
4631     // We have already decided how to vectorize this instruction, get that
4632     // result.
4633     if (VF > 1) {
4634       InstWidening WideningDecision = getWideningDecision(I, VF);
4635       assert(WideningDecision != CM_Unknown &&
4636              "Widening decision should be ready at this moment");
4637       return WideningDecision == CM_Scalarize;
4638     }
4639     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4640     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4641                                 isLegalMaskedGather(Ty, Alignment))
4642                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4643                                 isLegalMaskedScatter(Ty, Alignment));
4644   }
4645   case Instruction::UDiv:
4646   case Instruction::SDiv:
4647   case Instruction::SRem:
4648   case Instruction::URem:
4649     return mayDivideByZero(*I);
4650   }
4651   return false;
4652 }
4653 
4654 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4655                                                                unsigned VF) {
4656   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4657   assert(getWideningDecision(I, VF) == CM_Unknown &&
4658          "Decision should not be set yet.");
4659   auto *Group = getInterleavedAccessGroup(I);
4660   assert(Group && "Must have a group.");
4661 
  // If the instruction's allocated size doesn't equal its type size, it
4663   // requires padding and will be scalarized.
4664   auto &DL = I->getModule()->getDataLayout();
4665   auto *ScalarTy = getMemInstValueType(I);
4666   if (hasIrregularType(ScalarTy, DL, VF))
4667     return false;
4668 
4669   // Check if masking is required.
4670   // A Group may need masking for one of two reasons: it resides in a block that
4671   // needs predication, or it was decided to use masking to deal with gaps.
4672   bool PredicatedAccessRequiresMasking =
4673       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4674   bool AccessWithGapsRequiresMasking =
4675       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4676   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4677     return true;
4678 
4679   // If masked interleaving is required, we expect that the user/target had
4680   // enabled it, because otherwise it either wouldn't have been created or
4681   // it should have been invalidated by the CostModel.
4682   assert(useMaskedInterleavedAccesses(TTI) &&
4683          "Masked interleave-groups for predicated accesses are not enabled.");
4684 
4685   auto *Ty = getMemInstValueType(I);
4686   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4687   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4688                           : TTI.isLegalMaskedStore(Ty, Alignment);
4689 }
4690 
4691 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4692                                                                unsigned VF) {
4693   // Get and ensure we have a valid memory instruction.
4694   LoadInst *LI = dyn_cast<LoadInst>(I);
4695   StoreInst *SI = dyn_cast<StoreInst>(I);
4696   assert((LI || SI) && "Invalid memory instruction");
4697 
4698   auto *Ptr = getLoadStorePointerOperand(I);
4699 
4700   // In order to be widened, the pointer should be consecutive, first of all.
4701   if (!Legal->isConsecutivePtr(Ptr))
4702     return false;
4703 
4704   // If the instruction is a store located in a predicated block, it will be
4705   // scalarized.
4706   if (isScalarWithPredication(I))
4707     return false;
4708 
  // If the instruction's allocated size doesn't equal its type size, it
4710   // requires padding and will be scalarized.
4711   auto &DL = I->getModule()->getDataLayout();
4712   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4713   if (hasIrregularType(ScalarTy, DL, VF))
4714     return false;
4715 
4716   return true;
4717 }
4718 
4719 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4720   // We should not collect Uniforms more than once per VF. Right now,
4721   // this function is called from collectUniformsAndScalars(), which
4722   // already does this check. Collecting Uniforms for VF=1 does not make any
4723   // sense.
4724 
4725   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4726          "This function should not be visited twice for the same VF");
4727 
  // Initialize Uniforms[VF] so that even if no uniform value is found, we
  // won't analyze this VF again; Uniforms.count(VF) will return 1.
4730   Uniforms[VF].clear();
4731 
4732   // We now know that the loop is vectorizable!
4733   // Collect instructions inside the loop that will remain uniform after
4734   // vectorization.
4735 
  // Global values, parameters, and instructions outside of the current loop
  // are out of scope.
4738   auto isOutOfScope = [&](Value *V) -> bool {
4739     Instruction *I = dyn_cast<Instruction>(V);
4740     return (!I || !TheLoop->contains(I));
4741   };
4742 
4743   SetVector<Instruction *> Worklist;
4744   BasicBlock *Latch = TheLoop->getLoopLatch();
4745 
4746   // Instructions that are scalar with predication must not be considered
4747   // uniform after vectorization, because that would create an erroneous
4748   // replicating region where only a single instance out of VF should be formed.
4749   // TODO: optimize such seldom cases if found important, see PR40816.
4750   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4751     if (isScalarWithPredication(I, VF)) {
4752       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4753                         << *I << "\n");
4754       return;
4755     }
4756     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4757     Worklist.insert(I);
4758   };
4759 
4760   // Start with the conditional branch. If the branch condition is an
4761   // instruction contained in the loop that is only used by the branch, it is
4762   // uniform.
4763   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4764   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4765     addToWorklistIfAllowed(Cmp);
4766 
4767   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4768   // are pointers that are treated like consecutive pointers during
4769   // vectorization. The pointer operands of interleaved accesses are an
4770   // example.
4771   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4772 
4773   // Holds pointer operands of instructions that are possibly non-uniform.
4774   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4775 
4776   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4777     InstWidening WideningDecision = getWideningDecision(I, VF);
4778     assert(WideningDecision != CM_Unknown &&
4779            "Widening decision should be ready at this moment");
4780 
4781     return (WideningDecision == CM_Widen ||
4782             WideningDecision == CM_Widen_Reverse ||
4783             WideningDecision == CM_Interleave);
4784   };
4785   // Iterate over the instructions in the loop, and collect all
4786   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4787   // that a consecutive-like pointer operand will be scalarized, we collect it
4788   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4789   // getelementptr instruction can be used by both vectorized and scalarized
4790   // memory instructions. For example, if a loop loads and stores from the same
4791   // location, but the store is conditional, the store will be scalarized, and
4792   // the getelementptr won't remain uniform.
4793   for (auto *BB : TheLoop->blocks())
4794     for (auto &I : *BB) {
4795       // If there's no pointer operand, there's nothing to do.
4796       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4797       if (!Ptr)
4798         continue;
4799 
4800       // True if all users of Ptr are memory accesses that have Ptr as their
4801       // pointer operand.
4802       auto UsersAreMemAccesses =
4803           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4804             return getLoadStorePointerOperand(U) == Ptr;
4805           });
4806 
4807       // Ensure the memory instruction will not be scalarized or used by
4808       // gather/scatter, making its pointer operand non-uniform. If the pointer
4809       // operand is used by any instruction other than a memory access, we
4810       // conservatively assume the pointer operand may be non-uniform.
4811       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4812         PossibleNonUniformPtrs.insert(Ptr);
4813 
4814       // If the memory instruction will be vectorized and its pointer operand
4815       // is consecutive-like, or interleaving - the pointer operand should
4816       // remain uniform.
4817       else
4818         ConsecutiveLikePtrs.insert(Ptr);
4819     }
4820 
4821   // Add to the Worklist all consecutive and consecutive-like pointers that
4822   // aren't also identified as possibly non-uniform.
4823   for (auto *V : ConsecutiveLikePtrs)
4824     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4825       addToWorklistIfAllowed(V);
4826 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
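  //
  // For instance (a hypothetical snippet; names are illustrative only), if a
  // uniform address computation
  //
  //   %gep = getelementptr i32, i32* %a, i64 %offset
  //
  // is the only in-loop user of %offset, then %offset is added to the
  // Worklist as well when %gep is visited below.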
4830   unsigned idx = 0;
4831   while (idx != Worklist.size()) {
4832     Instruction *I = Worklist[idx++];
4833 
4834     for (auto OV : I->operand_values()) {
4835       // isOutOfScope operands cannot be uniform instructions.
4836       if (isOutOfScope(OV))
4837         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4840       auto *OP = dyn_cast<PHINode>(OV);
4841       if (OP && Legal->isFirstOrderRecurrence(OP))
4842         continue;
4843       // If all the users of the operand are uniform, then add the
4844       // operand into the uniform worklist.
4845       auto *OI = cast<Instruction>(OV);
4846       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4847             auto *J = cast<Instruction>(U);
4848             return Worklist.count(J) ||
4849                    (OI == getLoadStorePointerOperand(J) &&
4850                     isUniformDecision(J, VF));
4851           }))
4852         addToWorklistIfAllowed(OI);
4853     }
4854   }
4855 
4856   // Returns true if Ptr is the pointer operand of a memory access instruction
4857   // I, and I is known to not require scalarization.
4858   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4859     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4860   };
4861 
4862   // For an instruction to be added into Worklist above, all its users inside
4863   // the loop should also be in Worklist. However, this condition cannot be
4864   // true for phi nodes that form a cyclic dependence. We must process phi
4865   // nodes separately. An induction variable will remain uniform if all users
4866   // of the induction variable and induction variable update remain uniform.
4867   // The code below handles both pointer and non-pointer induction variables.
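  //
  // For example (hypothetical IR, assuming %p.next is only used by the phi
  // and by loads that were assigned a widening decision):
  //
  //   %p = phi i32* [ %start, %preheader ], [ %p.next, %latch ]
  //   %v = load i32, i32* %p               ; consecutive, widened
  //   %p.next = getelementptr i32, i32* %p, i64 1
  //
  // Both %p and %p.next satisfy the checks below and are added to the
  // Worklist.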
4868   for (auto &Induction : Legal->getInductionVars()) {
4869     auto *Ind = Induction.first;
4870     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4871 
4872     // Determine if all users of the induction variable are uniform after
4873     // vectorization.
4874     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4875       auto *I = cast<Instruction>(U);
4876       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4877              isVectorizedMemAccessUse(I, Ind);
4878     });
4879     if (!UniformInd)
4880       continue;
4881 
4882     // Determine if all users of the induction variable update instruction are
4883     // uniform after vectorization.
4884     auto UniformIndUpdate =
4885         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4886           auto *I = cast<Instruction>(U);
4887           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4888                  isVectorizedMemAccessUse(I, IndUpdate);
4889         });
4890     if (!UniformIndUpdate)
4891       continue;
4892 
4893     // The induction variable and its update instruction will remain uniform.
4894     addToWorklistIfAllowed(Ind);
4895     addToWorklistIfAllowed(IndUpdate);
4896   }
4897 
4898   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4899 }
4900 
4901 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4902   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4903 
4904   if (Legal->getRuntimePointerChecking()->Need) {
4905     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4906         "runtime pointer checks needed. Enable vectorization of this "
4907         "loop with '#pragma clang loop vectorize(enable)' when "
4908         "compiling with -Os/-Oz",
4909         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4910     return true;
4911   }
4912 
4913   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4914     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4915         "runtime SCEV checks needed. Enable vectorization of this "
4916         "loop with '#pragma clang loop vectorize(enable)' when "
4917         "compiling with -Os/-Oz",
4918         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4919     return true;
4920   }
4921 
4922   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4923   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4924     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4925         "runtime stride == 1 checks needed. Enable vectorization of "
4926         "this loop with '#pragma clang loop vectorize(enable)' when "
4927         "compiling with -Os/-Oz",
4928         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4929     return true;
4930   }
4931 
4932   return false;
4933 }
4934 
4935 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4936   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this since it's still likely to be
    // dynamically uniform if the target can skip.
4939     reportVectorizationFailure(
4940         "Not inserting runtime ptr check for divergent target",
4941         "runtime pointer checks needed. Not enabled for divergent target",
4942         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4943     return None;
4944   }
4945 
4946   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4947   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4948   if (TC == 1) {
4949     reportVectorizationFailure("Single iteration (non) loop",
4950         "loop trip count is one, irrelevant for vectorization",
4951         "SingleIterationLoop", ORE, TheLoop);
4952     return None;
4953   }
4954 
4955   switch (ScalarEpilogueStatus) {
4956   case CM_ScalarEpilogueAllowed:
4957     return computeFeasibleMaxVF(TC);
4958   case CM_ScalarEpilogueNotNeededUsePredicate:
4959     LLVM_DEBUG(
4960         dbgs() << "LV: vector predicate hint/switch found.\n"
4961                << "LV: Not allowing scalar epilogue, creating predicated "
4962                << "vector loop.\n");
4963     break;
4964   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4965     // fallthrough as a special case of OptForSize
4966   case CM_ScalarEpilogueNotAllowedOptSize:
4967     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4968       LLVM_DEBUG(
4969           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4970     else
4971       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4972                         << "count.\n");
4973 
4974     // Bail if runtime checks are required, which are not good when optimising
4975     // for size.
4976     if (runtimeChecksRequired())
4977       return None;
4978     break;
4979   }
4980 
  // Now try to fold the tail by masking.
4982 
4983   // Invalidate interleave groups that require an epilogue if we can't mask
4984   // the interleave-group.
4985   if (!useMaskedInterleavedAccesses(TTI))
4986     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4987 
4988   unsigned MaxVF = computeFeasibleMaxVF(TC);
4989   if (TC > 0 && TC % MaxVF == 0) {
4990     // Accept MaxVF if we do not have a tail.
4991     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4992     return MaxVF;
4993   }
4994 
4995   // If we don't know the precise trip count, or if the trip count that we
4996   // found modulo the vectorization factor is not zero, try to fold the tail
4997   // by masking.
4998   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4999   if (Legal->prepareToFoldTailByMasking()) {
5000     FoldTailByMasking = true;
5001     return MaxVF;
5002   }
5003 
5004   if (TC == 0) {
5005     reportVectorizationFailure(
5006         "Unable to calculate the loop count due to complex control flow",
5007         "unable to calculate the loop count due to complex control flow",
5008         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5009     return None;
5010   }
5011 
5012   reportVectorizationFailure(
5013       "Cannot optimize for size and vectorize at the same time.",
5014       "cannot optimize for size and vectorize at the same time. "
5015       "Enable vectorization of this loop with '#pragma clang loop "
5016       "vectorize(enable)' when compiling with -Os/-Oz",
5017       "NoTailLoopWithOptForSize", ORE, TheLoop);
5018   return None;
5019 }
5020 
5021 unsigned
5022 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5023   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5024   unsigned SmallestType, WidestType;
5025   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5026   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5027 
  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
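  //
  // For example (illustrative numbers only): with a widest loaded/stored type
  // of 32 bits, a 256-bit widest target register, and a maximum safe register
  // width of 128 bits reported by LAA, WidestRegister is clamped to 128 and
  // MaxVectorSize below becomes 128 / 32 == 4.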
5032   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5033 
5034   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5035 
5036   unsigned MaxVectorSize = WidestRegister / WidestType;
5037 
5038   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5039                     << " / " << WidestType << " bits.\n");
5040   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5041                     << WidestRegister << " bits.\n");
5042 
5043   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5044                                  " into one vector!");
5045   if (MaxVectorSize == 0) {
5046     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5047     MaxVectorSize = 1;
5048     return MaxVectorSize;
5049   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5050              isPowerOf2_32(ConstTripCount)) {
5051     // We need to clamp the VF to be the ConstTripCount. There is no point in
5052     // choosing a higher viable VF as done in the loop below.
5053     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5054                       << ConstTripCount << "\n");
5055     MaxVectorSize = ConstTripCount;
5056     return MaxVectorSize;
5057   }
5058 
5059   unsigned MaxVF = MaxVectorSize;
5060   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5061       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5062     // Collect all viable vectorization factors larger than the default MaxVF
5063     // (i.e. MaxVectorSize).
5064     SmallVector<unsigned, 8> VFs;
5065     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5066     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5067       VFs.push_back(VS);
5068 
5069     // For each VF calculate its register usage.
5070     auto RUs = calculateRegisterUsage(VFs);
5071 
5072     // Select the largest VF which doesn't require more registers than existing
5073     // ones.
5074     for (int i = RUs.size() - 1; i >= 0; --i) {
5075       bool Selected = true;
5076       for (auto& pair : RUs[i].MaxLocalUsers) {
5077         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5078         if (pair.second > TargetNumRegisters)
5079           Selected = false;
5080       }
5081       if (Selected) {
5082         MaxVF = VFs[i];
5083         break;
5084       }
5085     }
5086     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5087       if (MaxVF < MinVF) {
5088         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5089                           << ") with target's minimum: " << MinVF << '\n');
5090         MaxVF = MinVF;
5091       }
5092     }
5093   }
5094   return MaxVF;
5095 }
5096 
5097 VectorizationFactor
5098 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5099   float Cost = expectedCost(1).first;
5100   const float ScalarCost = Cost;
5101   unsigned Width = 1;
5102   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5103 
5104   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5105   if (ForceVectorization && MaxVF > 1) {
5106     // Ignore scalar width, because the user explicitly wants vectorization.
5107     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5108     // evaluation.
5109     Cost = std::numeric_limits<float>::max();
5110   }
5111 
5112   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
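    // For example (illustrative numbers only): with a scalar loop cost of 8,
    // a VF of 4 whose expected cost is 20 has a per-lane cost of 20 / 4 == 5,
    // which is cheaper than the scalar loop and would be selected below.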
5116     VectorizationCostTy C = expectedCost(i);
5117     float VectorCost = C.first / (float)i;
5118     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5119                       << " costs: " << (int)VectorCost << ".\n");
5120     if (!C.second && !ForceVectorization) {
5121       LLVM_DEBUG(
5122           dbgs() << "LV: Not considering vector loop of width " << i
5123                  << " because it will not generate any vector instructions.\n");
5124       continue;
5125     }
5126     if (VectorCost < Cost) {
5127       Cost = VectorCost;
5128       Width = i;
5129     }
5130   }
5131 
5132   if (!EnableCondStoresVectorization && NumPredStores) {
5133     reportVectorizationFailure("There are conditional stores.",
5134         "store that is conditionally executed prevents vectorization",
5135         "ConditionalStore", ORE, TheLoop);
5136     Width = 1;
5137     Cost = ScalarCost;
5138   }
5139 
5140   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5141              << "LV: Vectorization seems to be not beneficial, "
5142              << "but was forced by a user.\n");
5143   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5144   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5145   return Factor;
5146 }
5147 
5148 std::pair<unsigned, unsigned>
5149 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5150   unsigned MinWidth = -1U;
5151   unsigned MaxWidth = 8;
5152   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5153 
5154   // For each block.
5155   for (BasicBlock *BB : TheLoop->blocks()) {
5156     // For each instruction in the loop.
5157     for (Instruction &I : BB->instructionsWithoutDebug()) {
5158       Type *T = I.getType();
5159 
5160       // Skip ignored values.
5161       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5162         continue;
5163 
5164       // Only examine Loads, Stores and PHINodes.
5165       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5166         continue;
5167 
5168       // Examine PHI nodes that are reduction variables. Update the type to
5169       // account for the recurrence type.
5170       if (auto *PN = dyn_cast<PHINode>(&I)) {
5171         if (!Legal->isReductionVariable(PN))
5172           continue;
5173         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5174         T = RdxDesc.getRecurrenceType();
5175       }
5176 
5177       // Examine the stored values.
5178       if (auto *ST = dyn_cast<StoreInst>(&I))
5179         T = ST->getValueOperand()->getType();
5180 
5181       // Ignore loaded pointer types and stored pointer types that are not
5182       // vectorizable.
5183       //
5184       // FIXME: The check here attempts to predict whether a load or store will
5185       //        be vectorized. We only know this for certain after a VF has
5186       //        been selected. Here, we assume that if an access can be
5187       //        vectorized, it will be. We should also look at extending this
5188       //        optimization to non-pointer types.
5189       //
5190       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5191           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5192         continue;
5193 
5194       MinWidth = std::min(MinWidth,
5195                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5196       MaxWidth = std::max(MaxWidth,
5197                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5198     }
5199   }
5200 
5201   return {MinWidth, MaxWidth};
5202 }
5203 
5204 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5205                                                            unsigned LoopCost) {
5206   // -- The interleave heuristics --
5207   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5208   // There are many micro-architectural considerations that we can't predict
5209   // at this level. For example, frontend pressure (on decode or fetch) due to
5210   // code size, or the number and capabilities of the execution ports.
5211   //
5212   // We use the following heuristics to select the interleave count:
5213   // 1. If the code has reductions, then we interleave to break the cross
5214   // iteration dependency.
5215   // 2. If the loop is really small, then we interleave to reduce the loop
5216   // overhead.
5217   // 3. We don't interleave if we think that we will spill registers to memory
5218   // due to the increased register pressure.
5219 
5220   if (!isScalarEpilogueAllowed())
5221     return 1;
5222 
  // The maximum safe dependence distance already constrains the vectorization
  // factor, so do not interleave in that case.
5224   if (Legal->getMaxSafeDepDistBytes() != -1U)
5225     return 1;
5226 
5227   // Do not interleave loops with a relatively small known or estimated trip
5228   // count.
5229   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5230   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5231     return 1;
5232 
5233   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so assume that each register class has
  // at least one instruction that uses at least one register.
5236   for (auto& pair : R.MaxLocalUsers) {
5237     pair.second = std::max(pair.second, 1U);
5238   }
5239 
5240   // We calculate the interleave count using the following formula.
5241   // Subtract the number of loop invariants from the number of available
5242   // registers. These registers are used by all of the interleaved instances.
5243   // Next, divide the remaining registers by the number of registers that is
5244   // required by the loop, in order to estimate how many parallel instances
5245   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want a power-of-two interleave count to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case an
  // interleave count of 1 is returned above.
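  //
  // For illustration (the register counts here are made up rather than taken
  // from any target): with 16 available registers, 2 of them used by
  // loop-invariant values, and a maximum local usage of 3 registers, the
  // estimate below is PowerOf2Floor((16 - 2) / 3) == PowerOf2Floor(4) == 4.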
5251   unsigned IC = UINT_MAX;
5252 
5253   for (auto& pair : R.MaxLocalUsers) {
5254     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5255     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5256                       << " registers of "
5257                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5258     if (VF == 1) {
5259       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5260         TargetNumRegisters = ForceTargetNumScalarRegs;
5261     } else {
5262       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5263         TargetNumRegisters = ForceTargetNumVectorRegs;
5264     }
5265     unsigned MaxLocalUsers = pair.second;
5266     unsigned LoopInvariantRegs = 0;
5267     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5268       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5269 
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
5271     // Don't count the induction variable as interleaved.
5272     if (EnableIndVarRegisterHeur) {
5273       TmpIC =
5274           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5275                         std::max(1U, (MaxLocalUsers - 1)));
5276     }
5277 
5278     IC = std::min(IC, TmpIC);
5279   }
5280 
5281   // Clamp the interleave ranges to reasonable counts.
5282   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5283 
5284   // Check if the user has overridden the max.
5285   if (VF == 1) {
5286     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5287       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5288   } else {
5289     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5290       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5291   }
5292 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF.
5295   if (BestKnownTC) {
5296     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5297   }
5298 
5299   // If we did not calculate the cost for VF (because the user selected the VF)
5300   // then we calculate the cost of VF here.
5301   if (LoopCost == 0)
5302     LoopCost = expectedCost(VF).first;
5303 
5304   assert(LoopCost && "Non-zero loop cost expected");
5305 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5308   if (IC > MaxInterleaveCount)
5309     IC = MaxInterleaveCount;
5310   else if (IC < 1)
5311     IC = 1;
5312 
5313   // Interleave if we vectorized this loop and there is a reduction that could
5314   // benefit from interleaving.
5315   if (VF > 1 && !Legal->getReductionVars().empty()) {
5316     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5317     return IC;
5318   }
5319 
5320   // Note that if we've already vectorized the loop we will have done the
5321   // runtime check and so interleaving won't require further checks.
5322   bool InterleavingRequiresRuntimePointerCheck =
5323       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5324 
5325   // We want to interleave small loops in order to reduce the loop overhead and
5326   // potentially expose ILP opportunities.
5327   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5328   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead cost is 1. We use the cost model to
    // estimate the cost of the loop and interleave until the cost of the loop
    // overhead is about 5% of the cost of the loop.
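    // For example, assuming SmallLoopCost is 20 and the computed LoopCost is
    // 5 (illustrative numbers only), PowerOf2Floor(20 / 5) == 4, so we would
    // interleave at most 4 times here, further limited by IC.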
5332     unsigned SmallIC =
5333         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5334 
5335     // Interleave until store/load ports (estimated by max interleave count) are
5336     // saturated.
5337     unsigned NumStores = Legal->getNumStores();
5338     unsigned NumLoads = Legal->getNumLoads();
5339     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5340     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5341 
5342     // If we have a scalar reduction (vector reductions are already dealt with
5343     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2 so
    // that the critical path only gets increased by one reduction operation.
5346     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5347       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5348       SmallIC = std::min(SmallIC, F);
5349       StoresIC = std::min(StoresIC, F);
5350       LoadsIC = std::min(LoadsIC, F);
5351     }
5352 
5353     if (EnableLoadStoreRuntimeInterleave &&
5354         std::max(StoresIC, LoadsIC) > SmallIC) {
5355       LLVM_DEBUG(
5356           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5357       return std::max(StoresIC, LoadsIC);
5358     }
5359 
5360     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5361     return SmallIC;
5362   }
5363 
5364   // Interleave if this is a large loop (small loops are already dealt with by
5365   // this point) that could benefit from interleaving.
5366   bool HasReductions = !Legal->getReductionVars().empty();
5367   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5368     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5369     return IC;
5370   }
5371 
5372   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5373   return 1;
5374 }
5375 
5376 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5377 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5378   // This function calculates the register usage by measuring the highest number
5379   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5381   // assign a number to each instruction. We use RPO to ensure that defs are
5382   // met before their users. We assume that each instruction that has in-loop
5383   // users starts an interval. We record every time that an in-loop value is
5384   // used, so we have a list of the first and last occurrences of each
5385   // instruction. Next, we transpose this data structure into a multi map that
5386   // holds the list of intervals that *end* at a specific location. This multi
5387   // map allows us to perform a linear search. We scan the instructions linearly
5388   // and record each time that a new interval starts, by placing it in a set.
5389   // If we find this value in the multi-map then we remove it from the set.
5390   // The max register usage is the maximum size of the set.
5391   // We also search for instructions that are defined outside the loop, but are
5392   // used inside the loop. We need this number separately from the max-interval
5393   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
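  //
  // A tiny illustration of the scan (indices and values are hypothetical):
  //
  //   0: %a = ...
  //   1: %b = ...
  //   2: %c = add %a, %b   ; last use of %a and %b
  //   3: store %c          ; last use of %c
  //
  // When the scan reaches index 2, both %a and %b are still open, so the
  // maximum number of simultaneously live values observed so far is 2.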
5395   LoopBlocksDFS DFS(TheLoop);
5396   DFS.perform(LI);
5397 
5398   RegisterUsage RU;
5399 
5400   // Each 'key' in the map opens a new interval. The values
5401   // of the map are the index of the 'last seen' usage of the
5402   // instruction that is the key.
5403   using IntervalMap = DenseMap<Instruction *, unsigned>;
5404 
5405   // Maps instruction to its index.
5406   SmallVector<Instruction *, 64> IdxToInstr;
5407   // Marks the end of each interval.
5408   IntervalMap EndPoint;
  // Saves the set of instructions that are used inside the loop.
5410   SmallPtrSet<Instruction *, 8> Ends;
5411   // Saves the list of values that are used in the loop but are
5412   // defined outside the loop, such as arguments and constants.
5413   SmallPtrSet<Value *, 8> LoopInvariants;
5414 
5415   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5416     for (Instruction &I : BB->instructionsWithoutDebug()) {
5417       IdxToInstr.push_back(&I);
5418 
5419       // Save the end location of each USE.
5420       for (Value *U : I.operands()) {
5421         auto *Instr = dyn_cast<Instruction>(U);
5422 
5423         // Ignore non-instruction values such as arguments, constants, etc.
5424         if (!Instr)
5425           continue;
5426 
5427         // If this instruction is outside the loop then record it and continue.
5428         if (!TheLoop->contains(Instr)) {
5429           LoopInvariants.insert(Instr);
5430           continue;
5431         }
5432 
5433         // Overwrite previous end points.
5434         EndPoint[Instr] = IdxToInstr.size();
5435         Ends.insert(Instr);
5436       }
5437     }
5438   }
5439 
5440   // Saves the list of intervals that end with the index in 'key'.
5441   using InstrList = SmallVector<Instruction *, 2>;
5442   DenseMap<unsigned, InstrList> TransposeEnds;
5443 
5444   // Transpose the EndPoints to a list of values that end at each index.
5445   for (auto &Interval : EndPoint)
5446     TransposeEnds[Interval.second].push_back(Interval.first);
5447 
5448   SmallPtrSet<Instruction *, 8> OpenIntervals;
5449 
5450   // Get the size of the widest register.
5451   unsigned MaxSafeDepDist = -1U;
5452   if (Legal->getMaxSafeDepDistBytes() != -1U)
5453     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5454   unsigned WidestRegister =
5455       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5456   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5457 
5458   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5459   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5460 
5461   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5462 
5463   // A lambda that gets the register usage for the given type and VF.
5464   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5465     if (Ty->isTokenTy())
5466       return 0U;
5467     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5468     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5469   };
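  // For example (illustrative values only): with a 128-bit WidestRegister, an
  // i32 value at VF == 8 occupies max(1, 8 * 32 / 128) == 2 registers, while
  // the same value at VF == 2 still counts as one register.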
5470 
5471   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5472     Instruction *I = IdxToInstr[i];
5473 
5474     // Remove all of the instructions that end at this location.
5475     InstrList &List = TransposeEnds[i];
5476     for (Instruction *ToRemove : List)
5477       OpenIntervals.erase(ToRemove);
5478 
5479     // Ignore instructions that are never used within the loop.
5480     if (Ends.find(I) == Ends.end())
5481       continue;
5482 
5483     // Skip ignored values.
5484     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5485       continue;
5486 
5487     // For each VF find the maximum usage of registers.
5488     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5489       // Count the number of live intervals.
5490       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5491 
5492       if (VFs[j] == 1) {
5493         for (auto Inst : OpenIntervals) {
5494           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5495           if (RegUsage.find(ClassID) == RegUsage.end())
5496             RegUsage[ClassID] = 1;
5497           else
5498             RegUsage[ClassID] += 1;
5499         }
5500       } else {
5501         collectUniformsAndScalars(VFs[j]);
5502         for (auto Inst : OpenIntervals) {
5503           // Skip ignored values for VF > 1.
5504           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5505             continue;
5506           if (isScalarAfterVectorization(Inst, VFs[j])) {
5507             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5508             if (RegUsage.find(ClassID) == RegUsage.end())
5509               RegUsage[ClassID] = 1;
5510             else
5511               RegUsage[ClassID] += 1;
5512           } else {
5513             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5514             if (RegUsage.find(ClassID) == RegUsage.end())
5515               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5516             else
5517               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5518           }
5519         }
5520       }
5521 
5522       for (auto& pair : RegUsage) {
5523         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
5525         else
5526           MaxUsages[j][pair.first] = pair.second;
5527       }
5528     }
5529 
5530     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5531                       << OpenIntervals.size() << '\n');
5532 
5533     // Add the current instruction to the list of open intervals.
5534     OpenIntervals.insert(I);
5535   }
5536 
5537   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5538     SmallMapVector<unsigned, unsigned, 4> Invariant;
5539 
5540     for (auto Inst : LoopInvariants) {
5541       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5542       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5543       if (Invariant.find(ClassID) == Invariant.end())
5544         Invariant[ClassID] = Usage;
5545       else
5546         Invariant[ClassID] += Usage;
5547     }
5548 
5549     LLVM_DEBUG({
5550       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5551       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5552              << " item\n";
5553       for (const auto &pair : MaxUsages[i]) {
5554         dbgs() << "LV(REG): RegisterClass: "
5555                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5556                << " registers\n";
5557       }
5558       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5559              << " item\n";
5560       for (const auto &pair : Invariant) {
5561         dbgs() << "LV(REG): RegisterClass: "
5562                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5563                << " registers\n";
5564       }
5565     });
5566 
5567     RU.LoopInvariantRegs = Invariant;
5568     RU.MaxLocalUsers = MaxUsages[i];
5569     RUs[i] = RU;
5570   }
5571 
5572   return RUs;
5573 }
5574 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5576   // TODO: Cost model for emulated masked load/store is completely
5577   // broken. This hack guides the cost model to use an artificially
5578   // high enough value to practically disable vectorization with such
5579   // operations, except where previously deployed legality hack allowed
5580   // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked load/gather emulation was previously never allowed.
  // Only a limited amount of masked store/scatter emulation was allowed.
5584   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5585   return isa<LoadInst>(I) ||
5586          (isa<StoreInst>(I) &&
5587           NumPredStores > NumberOfStoresToPredicate);
5588 }
5589 
5590 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5591   // If we aren't vectorizing the loop, or if we've already collected the
5592   // instructions to scalarize, there's nothing to do. Collection may already
5593   // have occurred if we have a user-selected VF and are now computing the
5594   // expected cost for interleaving.
5595   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5596     return;
5597 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5599   // not profitable to scalarize any instructions, the presence of VF in the
5600   // map will indicate that we've analyzed it already.
5601   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5602 
5603   // Find all the instructions that are scalar with predication in the loop and
5604   // determine if it would be better to not if-convert the blocks they are in.
5605   // If so, we also record the instructions to scalarize.
5606   for (BasicBlock *BB : TheLoop->blocks()) {
5607     if (!blockNeedsPredication(BB))
5608       continue;
5609     for (Instruction &I : *BB)
5610       if (isScalarWithPredication(&I)) {
5611         ScalarCostsTy ScalarCosts;
5612         // Do not apply discount logic if hacked cost is needed
5613         // for emulated masked memrefs.
5614         if (!useEmulatedMaskMemRefHack(&I) &&
5615             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5616           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5617         // Remember that BB will remain after vectorization.
5618         PredicatedBBsAfterVectorization.insert(BB);
5619       }
5620   }
5621 }
5622 
5623 int LoopVectorizationCostModel::computePredInstDiscount(
5624     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5625     unsigned VF) {
5626   assert(!isUniformAfterVectorization(PredInst, VF) &&
5627          "Instruction marked uniform-after-vectorization will be predicated");
5628 
5629   // Initialize the discount to zero, meaning that the scalar version and the
5630   // vector version cost the same.
5631   int Discount = 0;
5632 
5633   // Holds instructions to analyze. The instructions we visit are mapped in
5634   // ScalarCosts. Those instructions are the ones that would be scalarized if
5635   // we find that the scalar version costs less.
5636   SmallVector<Instruction *, 8> Worklist;
5637 
5638   // Returns true if the given instruction can be scalarized.
5639   auto canBeScalarized = [&](Instruction *I) -> bool {
5640     // We only attempt to scalarize instructions forming a single-use chain
5641     // from the original predicated block that would otherwise be vectorized.
5642     // Although not strictly necessary, we give up on instructions we know will
5643     // already be scalar to avoid traversing chains that are unlikely to be
5644     // beneficial.
5645     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5646         isScalarAfterVectorization(I, VF))
5647       return false;
5648 
5649     // If the instruction is scalar with predication, it will be analyzed
5650     // separately. We ignore it within the context of PredInst.
5651     if (isScalarWithPredication(I))
5652       return false;
5653 
5654     // If any of the instruction's operands are uniform after vectorization,
5655     // the instruction cannot be scalarized. This prevents, for example, a
5656     // masked load from being scalarized.
5657     //
5658     // We assume we will only emit a value for lane zero of an instruction
5659     // marked uniform after vectorization, rather than VF identical values.
5660     // Thus, if we scalarize an instruction that uses a uniform, we would
5661     // create uses of values corresponding to the lanes we aren't emitting code
5662     // for. This behavior can be changed by allowing getScalarValue to clone
5663     // the lane zero values for uniforms rather than asserting.
5664     for (Use &U : I->operands())
5665       if (auto *J = dyn_cast<Instruction>(U.get()))
5666         if (isUniformAfterVectorization(J, VF))
5667           return false;
5668 
5669     // Otherwise, we can scalarize the instruction.
5670     return true;
5671   };
5672 
5673   // Compute the expected cost discount from scalarizing the entire expression
5674   // feeding the predicated instruction. We currently only consider expressions
5675   // that are single-use instruction chains.
5676   Worklist.push_back(PredInst);
5677   while (!Worklist.empty()) {
5678     Instruction *I = Worklist.pop_back_val();
5679 
5680     // If we've already analyzed the instruction, there's nothing to do.
5681     if (ScalarCosts.find(I) != ScalarCosts.end())
5682       continue;
5683 
5684     // Compute the cost of the vector instruction. Note that this cost already
5685     // includes the scalarization overhead of the predicated instruction.
5686     unsigned VectorCost = getInstructionCost(I, VF).first;
5687 
5688     // Compute the cost of the scalarized instruction. This cost is the cost of
5689     // the instruction as if it wasn't if-converted and instead remained in the
5690     // predicated block. We will scale this cost by block probability after
5691     // computing the scalarization overhead.
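    // For illustration (costs and the block probability are assumed, not
    // queried from any target): with VF == 4, a per-instance scalar cost of 2
    // and a reciprocal block probability of 2, the scaled scalar cost
    // (ignoring the insert/extract overhead added below) is (4 * 2) / 2 == 4;
    // if the vector cost is 6, this instruction adds 6 - 4 == 2 to the
    // discount.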
5692     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5693 
5694     // Compute the scalarization overhead of needed insertelement instructions
5695     // and phi nodes.
5696     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5697       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5698                                                  true, false);
5699       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5700     }
5701 
5702     // Compute the scalarization overhead of needed extractelement
5703     // instructions. For each of the instruction's operands, if the operand can
5704     // be scalarized, add it to the worklist; otherwise, account for the
5705     // overhead.
5706     for (Use &U : I->operands())
5707       if (auto *J = dyn_cast<Instruction>(U.get())) {
5708         assert(VectorType::isValidElementType(J->getType()) &&
5709                "Instruction has non-scalar type");
5710         if (canBeScalarized(J))
5711           Worklist.push_back(J);
5712         else if (needsExtract(J, VF))
5713           ScalarCost += TTI.getScalarizationOverhead(
5714                               ToVectorTy(J->getType(),VF), false, true);
5715       }
5716 
5717     // Scale the total scalar cost by block probability.
5718     ScalarCost /= getReciprocalPredBlockProb();
5719 
5720     // Compute the discount. A non-negative discount means the vector version
5721     // of the instruction costs more, and scalarizing would be beneficial.
5722     Discount += VectorCost - ScalarCost;
5723     ScalarCosts[I] = ScalarCost;
5724   }
5725 
5726   return Discount;
5727 }
5728 
5729 LoopVectorizationCostModel::VectorizationCostTy
5730 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5731   VectorizationCostTy Cost;
5732 
5733   // For each block.
5734   for (BasicBlock *BB : TheLoop->blocks()) {
5735     VectorizationCostTy BlockCost;
5736 
5737     // For each instruction in the old loop.
5738     for (Instruction &I : BB->instructionsWithoutDebug()) {
5739       // Skip ignored values.
5740       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5741           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5742         continue;
5743 
5744       VectorizationCostTy C = getInstructionCost(&I, VF);
5745 
5746       // Check if we should override the cost.
5747       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5748         C.first = ForceTargetInstructionCost;
5749 
5750       BlockCost.first += C.first;
5751       BlockCost.second |= C.second;
5752       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5753                         << " for VF " << VF << " For instruction: " << I
5754                         << '\n');
5755     }
5756 
5757     // If we are vectorizing a predicated block, it will have been
5758     // if-converted. This means that the block's instructions (aside from
5759     // stores and instructions that may divide by zero) will now be
5760     // unconditionally executed. For the scalar case, we may not always execute
5761     // the predicated block. Thus, scale the block's cost by the probability of
5762     // executing it.
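    // For example, a predicated block with a scalar cost of 10 contributes
    // 10 / getReciprocalPredBlockProb() to the scalar loop cost, i.e. 5 if
    // the reciprocal block probability is 2 (illustrative numbers only).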
5763     if (VF == 1 && blockNeedsPredication(BB))
5764       BlockCost.first /= getReciprocalPredBlockProb();
5765 
5766     Cost.first += BlockCost.first;
5767     Cost.second |= BlockCost.second;
5768   }
5769 
5770   return Cost;
5771 }
5772 
5773 /// Gets Address Access SCEV after verifying that the access pattern
5774 /// is loop invariant except the induction variable dependence.
5775 ///
5776 /// This SCEV can be sent to the Target in order to estimate the address
5777 /// calculation cost.
5778 static const SCEV *getAddressAccessSCEV(
5779               Value *Ptr,
5780               LoopVectorizationLegality *Legal,
5781               PredicatedScalarEvolution &PSE,
5782               const Loop *TheLoop) {
5783 
5784   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5785   if (!Gep)
5786     return nullptr;
5787 
5788   // We are looking for a gep with all loop invariant indices except for one
5789   // which should be an induction variable.
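  // A hypothetical GEP matching this pattern:
  //   %gep = getelementptr inbounds [256 x i32], [256 x i32]* %A, i64 0, i64 %i
  // where %A is loop-invariant and %i is an induction variable.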
5790   auto SE = PSE.getSE();
5791   unsigned NumOperands = Gep->getNumOperands();
5792   for (unsigned i = 1; i < NumOperands; ++i) {
5793     Value *Opd = Gep->getOperand(i);
5794     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5795         !Legal->isInductionVariable(Opd))
5796       return nullptr;
5797   }
5798 
5799   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5800   return PSE.getSCEV(Ptr);
5801 }
5802 
5803 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5804   return Legal->hasStride(I->getOperand(0)) ||
5805          Legal->hasStride(I->getOperand(1));
5806 }
5807 
5808 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5809                                                                  unsigned VF) {
5810   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5811   Type *ValTy = getMemInstValueType(I);
5812   auto SE = PSE.getSE();
5813 
5814   unsigned AS = getLoadStoreAddressSpace(I);
5815   Value *Ptr = getLoadStorePointerOperand(I);
5816   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5817 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5820   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5821 
5822   // Get the cost of the scalar memory instruction and address computation.
5823   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5824 
5825   // Don't pass *I here, since it is scalar but will actually be part of a
5826   // vectorized loop where the user of it is a vectorized instruction.
5827   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5828   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5829                                    Alignment, AS);
5830 
5831   // Get the overhead of the extractelement and insertelement instructions
5832   // we might create due to scalarization.
5833   Cost += getScalarizationOverhead(I, VF);
5834 
5835   // If we have a predicated store, it may not be executed for each vector
5836   // lane. Scale the cost by the probability of executing the predicated
5837   // block.
5838   if (isPredicatedInst(I)) {
5839     Cost /= getReciprocalPredBlockProb();
5840 
5841     if (useEmulatedMaskMemRefHack(I))
5842       // Artificially setting to a high enough value to practically disable
5843       // vectorization with such operations.
5844       Cost = 3000000;
5845   }
5846 
5847   return Cost;
5848 }
5849 
5850 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5851                                                              unsigned VF) {
5852   Type *ValTy = getMemInstValueType(I);
5853   Type *VectorTy = ToVectorTy(ValTy, VF);
5854   Value *Ptr = getLoadStorePointerOperand(I);
5855   unsigned AS = getLoadStoreAddressSpace(I);
5856   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5857 
5858   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5859          "Stride should be 1 or -1 for consecutive memory access");
5860   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5861   unsigned Cost = 0;
5862   if (Legal->isMaskRequired(I))
5863     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5864                                       Alignment ? Alignment->value() : 0, AS);
5865   else
5866     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5867 
5868   bool Reverse = ConsecutiveStride < 0;
5869   if (Reverse)
5870     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5871   return Cost;
5872 }
5873 
5874 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5875                                                          unsigned VF) {
5876   Type *ValTy = getMemInstValueType(I);
5877   Type *VectorTy = ToVectorTy(ValTy, VF);
5878   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5879   unsigned AS = getLoadStoreAddressSpace(I);
5880   if (isa<LoadInst>(I)) {
5881     return TTI.getAddressComputationCost(ValTy) +
5882            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5883            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5884   }
5885   StoreInst *SI = cast<StoreInst>(I);
5886 
5887   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5888   return TTI.getAddressComputationCost(ValTy) +
5889          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5890          (isLoopInvariantStoreValue
5891               ? 0
5892               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5893                                        VF - 1));
5894 }
5895 
5896 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5897                                                           unsigned VF) {
5898   Type *ValTy = getMemInstValueType(I);
5899   Type *VectorTy = ToVectorTy(ValTy, VF);
5900   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5901   Value *Ptr = getLoadStorePointerOperand(I);
5902 
5903   return TTI.getAddressComputationCost(VectorTy) +
5904          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5905                                     Legal->isMaskRequired(I),
5906                                     Alignment ? Alignment->value() : 0, I);
5907 }
5908 
5909 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5910                                                             unsigned VF) {
5911   Type *ValTy = getMemInstValueType(I);
5912   Type *VectorTy = ToVectorTy(ValTy, VF);
5913   unsigned AS = getLoadStoreAddressSpace(I);
5914 
5915   auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");
5917 
5918   unsigned InterleaveFactor = Group->getFactor();
5919   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5920 
5921   // Holds the indices of existing members in an interleaved load group.
5922   // An interleaved store group doesn't need this as it doesn't allow gaps.
5923   SmallVector<unsigned, 4> Indices;
5924   if (isa<LoadInst>(I)) {
5925     for (unsigned i = 0; i < InterleaveFactor; i++)
5926       if (Group->getMember(i))
5927         Indices.push_back(i);
5928   }
5929 
5930   // Calculate the cost of the whole interleaved group.
5931   bool UseMaskForGaps =
5932       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5933   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5934       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5935       Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5936 
5937   if (Group->isReverse()) {
5938     // TODO: Add support for reversed masked interleaved access.
5939     assert(!Legal->isMaskRequired(I) &&
5940            "Reverse masked interleaved access not supported.");
5941     Cost += Group->getNumMembers() *
5942             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5943   }
5944   return Cost;
5945 }
5946 
5947 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5948                                                               unsigned VF) {
5949   // Calculate scalar cost only. Vectorization cost should be ready at this
5950   // moment.
5951   if (VF == 1) {
5952     Type *ValTy = getMemInstValueType(I);
5953     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5954     unsigned AS = getLoadStoreAddressSpace(I);
5955 
5956     return TTI.getAddressComputationCost(ValTy) +
5957            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5958   }
5959   return getWideningCost(I, VF);
5960 }
5961 
5962 LoopVectorizationCostModel::VectorizationCostTy
5963 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5964   // If we know that this instruction will remain uniform, check the cost of
5965   // the scalar version.
5966   if (isUniformAfterVectorization(I, VF))
5967     VF = 1;
5968 
5969   if (VF > 1 && isProfitableToScalarize(I, VF))
5970     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5971 
5972   // Forced scalars do not have any scalarization overhead.
5973   auto ForcedScalar = ForcedScalars.find(VF);
5974   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5975     auto InstSet = ForcedScalar->second;
5976     if (InstSet.find(I) != InstSet.end())
5977       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5978   }
5979 
5980   Type *VectorTy;
5981   unsigned C = getInstructionCost(I, VF, VectorTy);
5982 
5983   bool TypeNotScalarized =
5984       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5985   return VectorizationCostTy(C, TypeNotScalarized);
5986 }
5987 
5988 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5989                                                               unsigned VF) {
5990 
5991   if (VF == 1)
5992     return 0;
5993 
5994   unsigned Cost = 0;
5995   Type *RetTy = ToVectorTy(I->getType(), VF);
5996   if (!RetTy->isVoidTy() &&
5997       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5998     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5999 
6000   // Some targets keep addresses scalar.
6001   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6002     return Cost;
6003 
6004   // Some targets support efficient element stores.
6005   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6006     return Cost;
6007 
6008   // Collect operands to consider.
6009   CallInst *CI = dyn_cast<CallInst>(I);
6010   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6011 
6012   // Skip operands that do not require extraction/scalarization and do not incur
6013   // any overhead.
6014   return Cost + TTI.getOperandsScalarizationOverhead(
6015                     filterExtractingOperands(Ops, VF), VF);
6016 }
6017 
6018 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6019   if (VF == 1)
6020     return;
6021   NumPredStores = 0;
6022   for (BasicBlock *BB : TheLoop->blocks()) {
6023     // For each instruction in the old loop.
6024     for (Instruction &I : *BB) {
6025       Value *Ptr =  getLoadStorePointerOperand(&I);
6026       if (!Ptr)
6027         continue;
6028 
6029       // TODO: We should generate better code and update the cost model for
6030       // predicated uniform stores. Today they are treated as any other
6031       // predicated store (see added test cases in
6032       // invariant-store-vectorization.ll).
6033       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6034         NumPredStores++;
6035 
6036       if (Legal->isUniform(Ptr) &&
6037           // Conditional loads and stores should be scalarized and predicated.
6038           // isScalarWithPredication cannot be used here since masked
6039           // gather/scatters are not considered scalar with predication.
6040           !Legal->blockNeedsPredication(I.getParent())) {
6041         // TODO: Avoid replicating loads and stores instead of
6042         // relying on instcombine to remove them.
6043         // Load: Scalar load + broadcast
6044         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6045         unsigned Cost = getUniformMemOpCost(&I, VF);
6046         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6047         continue;
6048       }
6049 
6050       // We assume that widening is the best solution when possible.
6051       if (memoryInstructionCanBeWidened(&I, VF)) {
6052         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6053         int ConsecutiveStride =
6054                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6055         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6056                "Expected consecutive stride.");
6057         InstWidening Decision =
6058             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6059         setWideningDecision(&I, VF, Decision, Cost);
6060         continue;
6061       }
6062 
6063       // Choose between Interleaving, Gather/Scatter or Scalarization.
6064       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6065       unsigned NumAccesses = 1;
6066       if (isAccessInterleaved(&I)) {
6067         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
6069 
6070         // Make one decision for the whole group.
6071         if (getWideningDecision(&I, VF) != CM_Unknown)
6072           continue;
6073 
6074         NumAccesses = Group->getNumMembers();
6075         if (interleavedAccessCanBeWidened(&I, VF))
6076           InterleaveCost = getInterleaveGroupCost(&I, VF);
6077       }
6078 
6079       unsigned GatherScatterCost =
6080           isLegalGatherOrScatter(&I)
6081               ? getGatherScatterCost(&I, VF) * NumAccesses
6082               : std::numeric_limits<unsigned>::max();
6083 
6084       unsigned ScalarizationCost =
6085           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6086 
6087       // Choose better solution for the current VF,
6088       // write down this decision and use it during vectorization.
6089       unsigned Cost;
6090       InstWidening Decision;
6091       if (InterleaveCost <= GatherScatterCost &&
6092           InterleaveCost < ScalarizationCost) {
6093         Decision = CM_Interleave;
6094         Cost = InterleaveCost;
6095       } else if (GatherScatterCost < ScalarizationCost) {
6096         Decision = CM_GatherScatter;
6097         Cost = GatherScatterCost;
6098       } else {
6099         Decision = CM_Scalarize;
6100         Cost = ScalarizationCost;
6101       }
6102       // If the instruction belongs to an interleave group, the whole group
6103       // receives the same decision. The whole group receives the cost, but
6104       // the cost will actually be assigned to one instruction.
6105       if (auto Group = getInterleavedAccessGroup(&I))
6106         setWideningDecision(Group, VF, Decision, Cost);
6107       else
6108         setWideningDecision(&I, VF, Decision, Cost);
6109     }
6110   }
6111 
6112   // Make sure that any load of an address and any other address computation
6113   // remains scalar unless there is gather/scatter support. This avoids
6114   // inevitable extracts into address registers, and also has the benefit of
6115   // activating LSR more, since that pass can't optimize vectorized
6116   // addresses.
6117   if (TTI.prefersVectorizedAddressing())
6118     return;
6119 
6120   // Start with all scalar pointer uses.
6121   SmallPtrSet<Instruction *, 8> AddrDefs;
6122   for (BasicBlock *BB : TheLoop->blocks())
6123     for (Instruction &I : *BB) {
6124       Instruction *PtrDef =
6125         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6126       if (PtrDef && TheLoop->contains(PtrDef) &&
6127           getWideningDecision(&I, VF) != CM_GatherScatter)
6128         AddrDefs.insert(PtrDef);
6129     }
6130 
6131   // Add all instructions used to generate the addresses.
6132   SmallVector<Instruction *, 4> Worklist;
6133   for (auto *I : AddrDefs)
6134     Worklist.push_back(I);
6135   while (!Worklist.empty()) {
6136     Instruction *I = Worklist.pop_back_val();
6137     for (auto &Op : I->operands())
6138       if (auto *InstOp = dyn_cast<Instruction>(Op))
6139         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6140             AddrDefs.insert(InstOp).second)
6141           Worklist.push_back(InstOp);
6142   }
6143 
6144   for (auto *I : AddrDefs) {
6145     if (isa<LoadInst>(I)) {
6146       // Setting the desired widening decision should ideally be handled by
6147       // cost functions, but since this requires knowing whether the loaded
6148       // register is involved in an address computation, it is instead
6149       // changed here when we know this is the case.
6150       InstWidening Decision = getWideningDecision(I, VF);
6151       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6152         // Scalarize a widened load of address.
6153         setWideningDecision(I, VF, CM_Scalarize,
6154                             (VF * getMemoryInstructionCost(I, 1)));
6155       else if (auto Group = getInterleavedAccessGroup(I)) {
6156         // Scalarize an interleave group of address loads.
6157         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6158           if (Instruction *Member = Group->getMember(I))
6159             setWideningDecision(Member, VF, CM_Scalarize,
6160                                 (VF * getMemoryInstructionCost(Member, 1)));
6161         }
6162       }
6163     } else
6164       // Make sure I gets scalarized and a cost estimate without
6165       // scalarization overhead.
6166       ForcedScalars[VF].insert(I);
6167   }
6168 }
6169 
6170 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6171                                                         unsigned VF,
6172                                                         Type *&VectorTy) {
6173   Type *RetTy = I->getType();
6174   if (canTruncateToMinimalBitwidth(I, VF))
6175     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6176   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6177   auto SE = PSE.getSE();
6178 
6179   // TODO: We need to estimate the cost of intrinsic calls.
6180   switch (I->getOpcode()) {
6181   case Instruction::GetElementPtr:
6182     // We mark this instruction as zero-cost because the cost of GEPs in
6183     // vectorized code depends on whether the corresponding memory instruction
6184     // is scalarized or not. Therefore, we handle GEPs with the memory
6185     // instruction cost.
6186     return 0;
6187   case Instruction::Br: {
6188     // In cases of scalarized and predicated instructions, there will be VF
6189     // predicated blocks in the vectorized loop. Each branch around these
6190     // blocks also requires an extract of its vector compare i1 element.
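    // For example (illustrative), with VF == 4 each predicated lane is
    // guarded by code of the form
    //   %c = extractelement <4 x i1> %cmp, i32 Lane
    //   br i1 %c, label %pred.if, label %pred.continue
    // so the cost below charges the i1 extracts plus one branch per lane.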
6191     bool ScalarPredicatedBB = false;
6192     BranchInst *BI = cast<BranchInst>(I);
6193     if (VF > 1 && BI->isConditional() &&
6194         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6195              PredicatedBBsAfterVectorization.end() ||
6196          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6197              PredicatedBBsAfterVectorization.end()))
6198       ScalarPredicatedBB = true;
6199 
6200     if (ScalarPredicatedBB) {
6201       // Return cost for branches around scalarized and predicated blocks.
6202       Type *Vec_i1Ty =
6203           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6204       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6205               (TTI.getCFInstrCost(Instruction::Br) * VF));
6206     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6207       // The back-edge branch will remain, as will all scalar branches.
6208       return TTI.getCFInstrCost(Instruction::Br);
6209     else
6210       // This branch will be eliminated by if-conversion.
6211       return 0;
6212     // Note: We currently assume zero cost for an unconditional branch inside
6213     // a predicated block since it will become a fall-through, although we
6214     // may decide in the future to call TTI for all branches.
6215   }
6216   case Instruction::PHI: {
6217     auto *Phi = cast<PHINode>(I);
6218 
6219     // First-order recurrences are replaced by vector shuffles inside the loop.
6220     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6221     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6222       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6223                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6224 
6225     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6226     // converted into select instructions. We require N - 1 selects per phi
6227     // node, where N is the number of incoming values.
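    // For example (illustrative), a phi with three incoming values becomes two
    // nested selects and is therefore charged as two vector selects below.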
6228     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6229       return (Phi->getNumIncomingValues() - 1) *
6230              TTI.getCmpSelInstrCost(
6231                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6232                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6233 
6234     return TTI.getCFInstrCost(Instruction::PHI);
6235   }
6236   case Instruction::UDiv:
6237   case Instruction::SDiv:
6238   case Instruction::URem:
6239   case Instruction::SRem:
6240     // If we have a predicated instruction, it may not be executed for each
6241     // vector lane. Get the scalarization cost and scale this amount by the
6242     // probability of executing the predicated block. If the instruction is not
6243     // predicated, we fall through to the next case.
6244     if (VF > 1 && isScalarWithPredication(I)) {
6245       unsigned Cost = 0;
6246 
6247       // These instructions have a non-void type, so account for the phi nodes
6248       // that we will create. This cost is likely to be zero. The phi node
6249       // cost, if any, should be scaled by the block probability because it
6250       // models a copy at the end of each predicated block.
6251       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6252 
6253       // The cost of the non-predicated instruction.
6254       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6255 
6256       // The cost of insertelement and extractelement instructions needed for
6257       // scalarization.
6258       Cost += getScalarizationOverhead(I, VF);
6259 
6260       // Scale the cost by the probability of executing the predicated blocks.
6261       // This assumes the predicated block for each vector lane is equally
6262       // likely.
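      // E.g., with a reciprocal probability of 2 (each predicated block
      // assumed to execute half the time), the per-lane cost computed above
      // is halved.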
6263       return Cost / getReciprocalPredBlockProb();
6264     }
6265     LLVM_FALLTHROUGH;
6266   case Instruction::Add:
6267   case Instruction::FAdd:
6268   case Instruction::Sub:
6269   case Instruction::FSub:
6270   case Instruction::Mul:
6271   case Instruction::FMul:
6272   case Instruction::FDiv:
6273   case Instruction::FRem:
6274   case Instruction::Shl:
6275   case Instruction::LShr:
6276   case Instruction::AShr:
6277   case Instruction::And:
6278   case Instruction::Or:
6279   case Instruction::Xor: {
6280     // Since we will replace the stride by 1, the multiplication should go away.
6281     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6282       return 0;
6283     // Certain instructions can be cheaper to vectorize if they have a constant
6284     // second vector operand. One example of this is shifts on x86.
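    // For example (illustrative), on such targets
    //   shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
    // can be cheaper than a shift by a non-constant, non-uniform amount.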
6285     Value *Op2 = I->getOperand(1);
6286     TargetTransformInfo::OperandValueProperties Op2VP;
6287     TargetTransformInfo::OperandValueKind Op2VK =
6288         TTI.getOperandInfo(Op2, Op2VP);
6289     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6290       Op2VK = TargetTransformInfo::OK_UniformValue;
6291 
6292     SmallVector<const Value *, 4> Operands(I->operand_values());
6293     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6294     return N * TTI.getArithmeticInstrCost(
6295                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6296                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6297   }
6298   case Instruction::FNeg: {
6299     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6300     return N * TTI.getArithmeticInstrCost(
6301                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6302                    TargetTransformInfo::OK_AnyValue,
6303                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6304                    I->getOperand(0), I);
6305   }
6306   case Instruction::Select: {
6307     SelectInst *SI = cast<SelectInst>(I);
6308     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6309     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6310     Type *CondTy = SI->getCondition()->getType();
6311     if (!ScalarCond)
6312       CondTy = VectorType::get(CondTy, VF);
6313 
6314     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6315   }
6316   case Instruction::ICmp:
6317   case Instruction::FCmp: {
6318     Type *ValTy = I->getOperand(0)->getType();
6319     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6320     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6321       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6322     VectorTy = ToVectorTy(ValTy, VF);
6323     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6324   }
6325   case Instruction::Store:
6326   case Instruction::Load: {
6327     unsigned Width = VF;
6328     if (Width > 1) {
6329       InstWidening Decision = getWideningDecision(I, Width);
6330       assert(Decision != CM_Unknown &&
6331              "CM decision should be taken at this point");
6332       if (Decision == CM_Scalarize)
6333         Width = 1;
6334     }
6335     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6336     return getMemoryInstructionCost(I, VF);
6337   }
6338   case Instruction::ZExt:
6339   case Instruction::SExt:
6340   case Instruction::FPToUI:
6341   case Instruction::FPToSI:
6342   case Instruction::FPExt:
6343   case Instruction::PtrToInt:
6344   case Instruction::IntToPtr:
6345   case Instruction::SIToFP:
6346   case Instruction::UIToFP:
6347   case Instruction::Trunc:
6348   case Instruction::FPTrunc:
6349   case Instruction::BitCast: {
6350     // We optimize the truncation of induction variables having constant
6351     // integer steps. The cost of these truncations is the same as the scalar
6352     // operation.
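    // For example (illustrative), for an induction %iv with a constant step,
    //   %t = trunc i64 %iv to i32
    // can be generated as a narrower induction, so it is charged below as a
    // single scalar trunc.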
6353     if (isOptimizableIVTruncate(I, VF)) {
6354       auto *Trunc = cast<TruncInst>(I);
6355       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6356                                   Trunc->getSrcTy(), Trunc);
6357     }
6358 
6359     Type *SrcScalarTy = I->getOperand(0)->getType();
6360     Type *SrcVecTy =
6361         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6362     if (canTruncateToMinimalBitwidth(I, VF)) {
6363       // This cast is going to be shrunk. This may remove the cast or it might
6364       // turn it into a slightly different cast. For example, if MinBW == 16,
6365       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6366       //
6367       // Calculate the modified src and dest types.
6368       Type *MinVecTy = VectorTy;
6369       if (I->getOpcode() == Instruction::Trunc) {
6370         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6371         VectorTy =
6372             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6373       } else if (I->getOpcode() == Instruction::ZExt ||
6374                  I->getOpcode() == Instruction::SExt) {
6375         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6376         VectorTy =
6377             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6378       }
6379     }
6380 
6381     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6382     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6383   }
6384   case Instruction::Call: {
6385     bool NeedToScalarize;
6386     CallInst *CI = cast<CallInst>(I);
6387     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6388     if (getVectorIntrinsicIDForCall(CI, TLI))
6389       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6390     return CallCost;
6391   }
6392   default:
6393     // The cost of executing VF copies of the scalar instruction. This opcode
6394     // is unknown. Assume that it is the same as 'mul'.
6395     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6396            getScalarizationOverhead(I, VF);
6397   } // end of switch.
6398 }
6399 
6400 char LoopVectorize::ID = 0;
6401 
6402 static const char lv_name[] = "Loop Vectorization";
6403 
6404 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6405 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6406 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6407 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6408 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6409 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6410 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6411 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6412 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6413 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6414 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6415 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6416 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6417 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6418 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6419 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6420 
6421 namespace llvm {
6422 
6423 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6424 
6425 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6426                               bool VectorizeOnlyWhenForced) {
6427   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6428 }
6429 
6430 } // end namespace llvm
6431 
6432 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6433   // Check if the pointer operand of a load or store instruction is
6434   // consecutive.
6435   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6436     return Legal->isConsecutivePtr(Ptr);
6437   return false;
6438 }
6439 
6440 void LoopVectorizationCostModel::collectValuesToIgnore() {
6441   // Ignore ephemeral values.
6442   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6443 
6444   // Ignore type-promoting instructions we identified during reduction
6445   // detection.
6446   for (auto &Reduction : Legal->getReductionVars()) {
6447     RecurrenceDescriptor &RedDes = Reduction.second;
6448     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6449     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6450   }
6451   // Ignore type-casting instructions we identified during induction
6452   // detection.
6453   for (auto &Induction : Legal->getInductionVars()) {
6454     InductionDescriptor &IndDes = Induction.second;
6455     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6456     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6457   }
6458 }
6459 
6460 // TODO: we could return a pair of values that specify the max VF and
6461 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6462 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6463 // doesn't have a cost model that can choose which plan to execute if
6464 // more than one is generated.
6465 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6466                                  LoopVectorizationCostModel &CM) {
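  // For example (illustrative), with 256-bit vector registers and a widest
  // scalar type of i32, this returns a VF of 8.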
6467   unsigned WidestType;
6468   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6469   return WidestVectorRegBits / WidestType;
6470 }
6471 
6472 VectorizationFactor
6473 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6474   unsigned VF = UserVF;
6475   // Outer loop handling: outer loops may require CFG and instruction level
6476   // transformations before even evaluating whether vectorization is profitable.
6477   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6478   // the vectorization pipeline.
6479   if (!OrigLoop->empty()) {
6480     // If the user doesn't provide a vectorization factor, determine a
6481     // reasonable one.
6482     if (!UserVF) {
6483       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6484       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6485 
6486       // Make sure we have a VF > 1 for stress testing.
6487       if (VPlanBuildStressTest && VF < 2) {
6488         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6489                           << "overriding computed VF.\n");
6490         VF = 4;
6491       }
6492     }
6493     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6494     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6495     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6496                       << " to build VPlans.\n");
6497     buildVPlans(VF, VF);
6498 
6499     // For VPlan build stress testing, we bail out after VPlan construction.
6500     if (VPlanBuildStressTest)
6501       return VectorizationFactor::Disabled();
6502 
6503     return {VF, 0};
6504   }
6505 
6506   LLVM_DEBUG(
6507       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6508                 "VPlan-native path.\n");
6509   return VectorizationFactor::Disabled();
6510 }
6511 
6512 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6513   assert(OrigLoop->empty() && "Inner loop expected.");
6514   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6515   if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6516     return None;
6517 
6518   // Invalidate interleave groups if all blocks of the loop will be predicated.
6519   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6520       !useMaskedInterleavedAccesses(*TTI)) {
6521     LLVM_DEBUG(
6522         dbgs()
6523         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6524            "which requires masked-interleaved support.\n");
6525     CM.InterleaveInfo.reset();
6526   }
6527 
6528   if (UserVF) {
6529     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6530     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6531     // Collect the instructions (and their associated costs) that will be more
6532     // profitable to scalarize.
6533     CM.selectUserVectorizationFactor(UserVF);
6534     buildVPlansWithVPRecipes(UserVF, UserVF);
6535     LLVM_DEBUG(printPlans(dbgs()));
6536     return {{UserVF, 0}};
6537   }
6538 
6539   unsigned MaxVF = MaybeMaxVF.getValue();
6540   assert(MaxVF != 0 && "MaxVF is zero.");
6541 
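  // Consider candidate VFs in powers of two up to MaxVF; e.g. (illustrative)
  // MaxVF == 8 yields VF in {1, 2, 4, 8}.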
6542   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6543     // Collect Uniform and Scalar instructions after vectorization with VF.
6544     CM.collectUniformsAndScalars(VF);
6545 
6546     // Collect the instructions (and their associated costs) that will be more
6547     // profitable to scalarize.
6548     if (VF > 1)
6549       CM.collectInstsToScalarize(VF);
6550   }
6551 
6552   buildVPlansWithVPRecipes(1, MaxVF);
6553   LLVM_DEBUG(printPlans(dbgs()));
6554   if (MaxVF == 1)
6555     return VectorizationFactor::Disabled();
6556 
6557   // Select the optimal vectorization factor.
6558   return CM.selectVectorizationFactor(MaxVF);
6559 }
6560 
6561 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6562   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6563                     << '\n');
6564   BestVF = VF;
6565   BestUF = UF;
6566 
6567   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6568     return !Plan->hasVF(VF);
6569   });
6570   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6571 }
6572 
6573 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6574                                            DominatorTree *DT) {
6575   // Perform the actual loop transformation.
6576 
6577   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6578   VPCallbackILV CallbackILV(ILV);
6579 
6580   VPTransformState State{BestVF, BestUF,      LI,
6581                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6582                          &ILV,   CallbackILV};
6583   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6584   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6585   State.CanonicalIV = ILV.Induction;
6586 
6587   //===------------------------------------------------===//
6588   //
6589   // Notice: any optimization or new instruction that goes
6590   // into the code below should also be implemented in
6591   // the cost-model.
6592   //
6593   //===------------------------------------------------===//
6594 
6595   // 2. Copy and widen instructions from the old loop into the new loop.
6596   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6597   VPlans.front()->execute(&State);
6598 
6599   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6600   //    predication, updating analyses.
6601   ILV.fixVectorizedLoop();
6602 }
6603 
6604 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6605     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6606   BasicBlock *Latch = OrigLoop->getLoopLatch();
6607 
6608   // We create new control-flow for the vectorized loop, so the original
6609   // condition will be dead after vectorization if it's only used by the
6610   // branch.
6611   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6612   if (Cmp && Cmp->hasOneUse())
6613     DeadInstructions.insert(Cmp);
6614 
6615   // We create new "steps" for induction variable updates to which the original
6616   // induction variables map. An original update instruction will be dead if
6617   // all its users except the induction variable are dead.
6618   for (auto &Induction : Legal->getInductionVars()) {
6619     PHINode *Ind = Induction.first;
6620     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6621     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6622           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6623                                  DeadInstructions.end();
6624         }))
6625       DeadInstructions.insert(IndUpdate);
6626 
6627     // We also record as "Dead" the type-casting instructions we had identified
6628     // during induction analysis. We don't need any handling for them in the
6629     // vectorized loop because we have proven that, under a proper runtime
6630     // test guarding the vectorized loop, the value of the phi, and the casted
6631     // value of the phi, are the same. The last instruction in this casting chain
6632     // will get its scalar/vector/widened def from the scalar/vector/widened def
6633     // of the respective phi node. Any other casts in the induction def-use chain
6634     // have no other uses outside the phi update chain, and will be ignored.
6635     InductionDescriptor &IndDes = Induction.second;
6636     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6637     DeadInstructions.insert(Casts.begin(), Casts.end());
6638   }
6639 }
6640 
6641 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6642 
6643 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6644 
6645 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6646                                         Instruction::BinaryOps BinOp) {
6647   // When unrolling and the VF is 1, we only need to add a simple scalar.
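  // For example (illustrative), for an integer type with StartIdx == 2 and
  // step %s, this produces Val + 2 * %s; the FP path below builds the
  // analogous fast-math flagged operations.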
6648   Type *Ty = Val->getType();
6649   assert(!Ty->isVectorTy() && "Val must be a scalar");
6650 
6651   if (Ty->isFloatingPointTy()) {
6652     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6653 
6654     // Floating point operations had to be 'fast' to enable the unrolling.
6655     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6656     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6657   }
6658   Constant *C = ConstantInt::get(Ty, StartIdx);
6659   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6660 }
6661 
6662 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6663   SmallVector<Metadata *, 4> MDs;
6664   // Reserve first location for self reference to the LoopID metadata node.
6665   MDs.push_back(nullptr);
6666   bool IsUnrollMetadata = false;
6667   MDNode *LoopID = L->getLoopID();
6668   if (LoopID) {
6669     // First find existing loop unrolling disable metadata.
6670     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6671       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6672       if (MD) {
6673         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6674         IsUnrollMetadata =
6675             S && S->getString().startswith("llvm.loop.unroll.disable");
6676       }
6677       MDs.push_back(LoopID->getOperand(i));
6678     }
6679   }
6680 
6681   if (!IsUnrollMetadata) {
6682     // Add runtime unroll disable metadata.
6683     LLVMContext &Context = L->getHeader()->getContext();
6684     SmallVector<Metadata *, 1> DisableOperands;
6685     DisableOperands.push_back(
6686         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6687     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6688     MDs.push_back(DisableNode);
6689     MDNode *NewLoopID = MDNode::get(Context, MDs);
6690     // Set operand 0 to refer to the loop id itself.
6691     NewLoopID->replaceOperandWith(0, NewLoopID);
6692     L->setLoopID(NewLoopID);
6693   }
6694 }
6695 
6696 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6697     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6698   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6699   bool PredicateAtRangeStart = Predicate(Range.Start);
6700 
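  // Clamp Range.End at the first VF whose decision differs from the decision
  // at Range.Start. For example (illustrative), if the predicate holds for
  // VF = 2 and VF = 4 but not for VF = 8, a range [2, 16) is clamped to
  // [2, 8) and the function returns true.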
6701   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6702     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6703       Range.End = TmpVF;
6704       break;
6705     }
6706 
6707   return PredicateAtRangeStart;
6708 }
6709 
6710 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6711 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6712 /// of VF's starting at a given VF and extending it as much as possible. Each
6713 /// vectorization decision can potentially shorten this sub-range during
6714 /// buildVPlan().
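///
/// For example (illustrative), with MinVF = 1 and MaxVF = 8 the first call to
/// buildVPlan may cover {1}, and subsequent calls {2, 4} and {8}, depending on
/// how each call clamps its sub-range.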
6715 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6716   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6717     VFRange SubRange = {VF, MaxVF + 1};
6718     VPlans.push_back(buildVPlan(SubRange));
6719     VF = SubRange.End;
6720   }
6721 }
6722 
6723 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6724                                          VPlanPtr &Plan) {
6725   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6726 
6727   // Look for cached value.
6728   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6729   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6730   if (ECEntryIt != EdgeMaskCache.end())
6731     return ECEntryIt->second;
6732 
6733   VPValue *SrcMask = createBlockInMask(Src, Plan);
6734 
6735   // The terminator has to be a branch inst!
6736   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6737   assert(BI && "Unexpected terminator found");
6738 
6739   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6740     return EdgeMaskCache[Edge] = SrcMask;
6741 
6742   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6743   assert(EdgeMask && "No Edge Mask found for condition");
6744 
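  // If Dst is the false successor of the branch, the edge is taken when the
  // condition is false, so negate the condition to form the edge mask.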
6745   if (BI->getSuccessor(0) != Dst)
6746     EdgeMask = Builder.createNot(EdgeMask);
6747 
6748   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6749     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6750 
6751   return EdgeMaskCache[Edge] = EdgeMask;
6752 }
6753 
6754 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6755   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6756 
6757   // Look for cached value.
6758   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6759   if (BCEntryIt != BlockMaskCache.end())
6760     return BCEntryIt->second;
6761 
6762   // All-one mask is modelled as no-mask following the convention for masked
6763   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6764   VPValue *BlockMask = nullptr;
6765 
6766   if (OrigLoop->getHeader() == BB) {
6767     if (!CM.blockNeedsPredication(BB))
6768       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6769 
6770     // Introduce the early-exit compare IV <= BTC to form header block mask.
6771     // This is used instead of IV < TC because TC may wrap, unlike BTC.
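    // For example (illustrative), with a trip count of 10 and VF == 4, the
    // last vector iteration compares lanes {8, 9, 10, 11} against BTC == 9,
    // masking out the two lanes that are past the end of the loop.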
6772     // Start by constructing the desired canonical IV.
6773     VPValue *IV = nullptr;
6774     if (Legal->getPrimaryInduction())
6775       IV = Plan->getVPValue(Legal->getPrimaryInduction());
6776     else {
6777       auto IVRecipe = new VPWidenCanonicalIVRecipe();
6778       Builder.getInsertBlock()->appendRecipe(IVRecipe);
6779       IV = IVRecipe->getVPValue();
6780     }
6781     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6782     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6783     return BlockMaskCache[BB] = BlockMask;
6784   }
6785 
6786   // This is the block mask. We OR all incoming edges.
6787   for (auto *Predecessor : predecessors(BB)) {
6788     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6789     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6790       return BlockMaskCache[BB] = EdgeMask;
6791 
6792     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6793       BlockMask = EdgeMask;
6794       continue;
6795     }
6796 
6797     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6798   }
6799 
6800   return BlockMaskCache[BB] = BlockMask;
6801 }
6802 
6803 VPWidenMemoryInstructionRecipe *
6804 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6805                                   VPlanPtr &Plan) {
6806   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6807     return nullptr;
6808 
6809   auto willWiden = [&](unsigned VF) -> bool {
6810     if (VF == 1)
6811       return false;
6812     LoopVectorizationCostModel::InstWidening Decision =
6813         CM.getWideningDecision(I, VF);
6814     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6815            "CM decision should be taken at this point.");
6816     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6817       return true;
6818     if (CM.isScalarAfterVectorization(I, VF) ||
6819         CM.isProfitableToScalarize(I, VF))
6820       return false;
6821     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6822   };
6823 
6824   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6825     return nullptr;
6826 
6827   VPValue *Mask = nullptr;
6828   if (Legal->isMaskRequired(I))
6829     Mask = createBlockInMask(I->getParent(), Plan);
6830 
6831   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6832   if (LoadInst *Load = dyn_cast<LoadInst>(I))
6833     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
6834 
6835   StoreInst *Store = cast<StoreInst>(I);
6836   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
6837   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
6838 }
6839 
6840 VPWidenIntOrFpInductionRecipe *
6841 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6842   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6843     // Check if this is an integer or fp induction. If so, build the recipe that
6844     // produces its scalar and vector values.
6845     InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6846     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6847         II.getKind() == InductionDescriptor::IK_FpInduction)
6848       return new VPWidenIntOrFpInductionRecipe(Phi);
6849 
6850     return nullptr;
6851   }
6852 
6853   // Optimize the special case where the source is a constant integer
6854   // induction variable. Notice that we can only optimize the 'trunc' case
6855   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6856   // (c) other casts depend on pointer size.
6857 
6858   // Determine whether \p K is a truncation based on an induction variable that
6859   // can be optimized.
6860   auto isOptimizableIVTruncate =
6861       [&](Instruction *K) -> std::function<bool(unsigned)> {
6862     return
6863         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6864   };
6865 
6866   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6867                                isOptimizableIVTruncate(I), Range))
6868     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6869                                              cast<TruncInst>(I));
6870   return nullptr;
6871 }
6872 
6873 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6874   PHINode *Phi = dyn_cast<PHINode>(I);
6875   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6876     return nullptr;
6877 
6878   // We know that all PHIs in non-header blocks are converted into selects, so
6879   // we don't have to worry about the insertion order and we can just use the
6880   // builder. At this point we generate the predication tree. There may be
6881   // duplications since this is a simple recursive scan, but future
6882   // optimizations will clean it up.
6883 
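  // The blend recipe's operands interleave incoming values with their edge
  // masks: (value0, mask0, value1, mask1, ...). The mask is omitted when there
  // is a single incoming value, whose edge mask is known to be all-one.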
6884   SmallVector<VPValue *, 2> Operands;
6885   unsigned NumIncoming = Phi->getNumIncomingValues();
6886   for (unsigned In = 0; In < NumIncoming; In++) {
6887     VPValue *EdgeMask =
6888       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6889     assert((EdgeMask || NumIncoming == 1) &&
6890            "Multiple predecessors with one having a full mask");
6891     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
6892     if (EdgeMask)
6893       Operands.push_back(EdgeMask);
6894   }
6895   return new VPBlendRecipe(Phi, Operands);
6896 }
6897 
6898 VPWidenCallRecipe *
6899 VPRecipeBuilder::tryToWidenCall(Instruction *I, VFRange &Range, VPlan &Plan) {
6900 
6901   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6902       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6903 
6904   CallInst *CI = dyn_cast<CallInst>(I);
6905   if (IsPredicated || !CI)
6906     return nullptr;
6907 
6908   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6909   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6910              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6911     return nullptr;
6912 
6913   auto willWiden = [&](unsigned VF) -> bool {
6914     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6915     // The following case may be scalarized depending on the VF.
6916     // The flag shows whether we use an intrinsic or a usual call for the
6917     // vectorized version of the instruction.
6918     // Is it beneficial to use the intrinsic call rather than the lib call?
6919     bool NeedToScalarize = false;
6920     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6921     bool UseVectorIntrinsic =
6922         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6923     return UseVectorIntrinsic || !NeedToScalarize;
6924   };
6925 
6926   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6927     return nullptr;
6928 
6929   // Success: widen this call.
6930   auto VPValues = map_range(CI->arg_operands(), [&Plan](Value *Op) {
6931     return Plan.getOrAddVPValue(Op);
6932   });
6933 
6934   return new VPWidenCallRecipe(*CI, VPValues);
6935 }
6936 
6937 VPWidenSelectRecipe *VPRecipeBuilder::tryToWidenSelect(Instruction *I,
6938                                                        VFRange &Range) {
6939   auto *SI = dyn_cast<SelectInst>(I);
6940   if (!SI)
6941     return nullptr;
6942 
6943   // SI should be widened, unless it is scalar after vectorization,
6944   // scalarization is profitable or it is predicated.
6945   auto willWiden = [this, SI](unsigned VF) -> bool {
6946     return !CM.isScalarAfterVectorization(SI, VF) &&
6947            !CM.isProfitableToScalarize(SI, VF) &&
6948            !CM.isScalarWithPredication(SI, VF);
6949   };
6950   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6951     return nullptr;
6952 
6953   auto *SE = PSE.getSE();
6954   bool InvariantCond =
6955       SE->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
6956   // Success: widen this instruction.
6957   return new VPWidenSelectRecipe(*SI, InvariantCond);
6958 }
6959 
6960 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
6961   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6962       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6963 
6964   if (IsPredicated)
6965     return nullptr;
6966 
6967   auto IsVectorizableOpcode = [](unsigned Opcode) {
6968     switch (Opcode) {
6969     case Instruction::Add:
6970     case Instruction::And:
6971     case Instruction::AShr:
6972     case Instruction::BitCast:
6973     case Instruction::Br:
6974     case Instruction::FAdd:
6975     case Instruction::FCmp:
6976     case Instruction::FDiv:
6977     case Instruction::FMul:
6978     case Instruction::FNeg:
6979     case Instruction::FPExt:
6980     case Instruction::FPToSI:
6981     case Instruction::FPToUI:
6982     case Instruction::FPTrunc:
6983     case Instruction::FRem:
6984     case Instruction::FSub:
6985     case Instruction::ICmp:
6986     case Instruction::IntToPtr:
6987     case Instruction::Load:
6988     case Instruction::LShr:
6989     case Instruction::Mul:
6990     case Instruction::Or:
6991     case Instruction::PHI:
6992     case Instruction::PtrToInt:
6993     case Instruction::SDiv:
6994     case Instruction::Select:
6995     case Instruction::SExt:
6996     case Instruction::Shl:
6997     case Instruction::SIToFP:
6998     case Instruction::SRem:
6999     case Instruction::Store:
7000     case Instruction::Sub:
7001     case Instruction::Trunc:
7002     case Instruction::UDiv:
7003     case Instruction::UIToFP:
7004     case Instruction::URem:
7005     case Instruction::Xor:
7006     case Instruction::ZExt:
7007       return true;
7008     }
7009     return false;
7010   };
7011 
7012   if (!IsVectorizableOpcode(I->getOpcode()))
7013     return nullptr;
7014 
7015   auto willWiden = [&](unsigned VF) -> bool {
7016     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
7017                              CM.isProfitableToScalarize(I, VF)))
7018       return false;
7019     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
7020       assert(CM.getWideningDecision(I, VF) ==
7021                  LoopVectorizationCostModel::CM_Scalarize &&
7022              "Memory widening decisions should have been taken care by now");
7023       return false;
7024     }
7025     return true;
7026   };
7027 
7028   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7029     return nullptr;
7030 
7031   // Success: widen this instruction.
7032   return new VPWidenRecipe(*I);
7033 }
7034 
7035 VPBasicBlock *VPRecipeBuilder::handleReplication(
7036     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7037     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7038     VPlanPtr &Plan) {
7039   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7040       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7041       Range);
7042 
7043   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7044       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7045 
7046   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
7047   setRecipe(I, Recipe);
7048 
7049   // Find if I uses a predicated instruction. If so, it will use its scalar
7050   // value. Avoid hoisting the insert-element which packs the scalar value into
7051   // a vector value, as that happens iff all users use the vector value.
7052   for (auto &Op : I->operands())
7053     if (auto *PredInst = dyn_cast<Instruction>(Op))
7054       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7055         PredInst2Recipe[PredInst]->setAlsoPack(false);
7056 
7057   // Finalize the recipe for Instr, first if it is not predicated.
7058   if (!IsPredicated) {
7059     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7060     VPBB->appendRecipe(Recipe);
7061     return VPBB;
7062   }
7063   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7064   assert(VPBB->getSuccessors().empty() &&
7065          "VPBB has successors when handling predicated replication.");
7066   // Record predicated instructions for above packing optimizations.
7067   PredInst2Recipe[I] = Recipe;
7068   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7069   VPBlockUtils::insertBlockAfter(Region, VPBB);
7070   auto *RegSucc = new VPBasicBlock();
7071   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7072   return RegSucc;
7073 }
7074 
7075 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7076                                                       VPRecipeBase *PredRecipe,
7077                                                       VPlanPtr &Plan) {
7078   // Instructions marked for predication are replicated and placed under an
7079   // if-then construct to prevent side-effects.
7080 
7081   // Generate recipes to compute the block mask for this region.
7082   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7083 
7084   // Build the triangular if-then region.
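  // The region (illustrative shape) looks like:
  //   pred.<opcode>.entry -> pred.<opcode>.if -> pred.<opcode>.continue
  // with an additional edge from the entry block directly to the continue
  // block for lanes whose block-in mask is false.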
7085   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7086   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7087   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7088   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7089   auto *PHIRecipe =
7090       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7091   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7092   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7093   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7094 
7095   // Note: first set Entry as region entry and then connect successors starting
7096   // from it in order, to propagate the "parent" of each VPBasicBlock.
7097   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7098   VPBlockUtils::connectBlocks(Pred, Exit);
7099 
7100   return Region;
7101 }
7102 
7103 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7104                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7105   VPRecipeBase *Recipe = nullptr;
7106 
7107   // First, check for specific widening recipes that deal with calls, memory
7108   // operations, inductions and Phi nodes.
7109   if ((Recipe = tryToWidenCall(Instr, Range, *Plan)) ||
7110       (Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7111       (Recipe = tryToWidenSelect(Instr, Range)) ||
7112       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7113       (Recipe = tryToBlend(Instr, Plan)) ||
7114       (isa<PHINode>(Instr) &&
7115        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7116     setRecipe(Instr, Recipe);
7117     VPBB->appendRecipe(Recipe);
7118     return true;
7119   }
7120 
7121   // Handle GEP widening.
7122   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7123     auto Scalarize = [&](unsigned VF) {
7124       return CM.isScalarWithPredication(Instr, VF) ||
7125              CM.isScalarAfterVectorization(Instr, VF) ||
7126              CM.isProfitableToScalarize(Instr, VF);
7127     };
7128     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7129       return false;
7130     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7131     setRecipe(Instr, Recipe);
7132     VPBB->appendRecipe(Recipe);
7133     return true;
7134   }
7135 
7136   // Check if Instr is to be widened by a general VPWidenRecipe, after
7137   // having first checked for specific widening recipes.
7138   if ((Recipe = tryToWiden(Instr, Range))) {
7139     setRecipe(Instr, Recipe);
7140     VPBB->appendRecipe(Recipe);
7141     return true;
7142   }
7143 
7144   return false;
7145 }
7146 
7147 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7148                                                         unsigned MaxVF) {
7149   assert(OrigLoop->empty() && "Inner loop expected.");
7150 
7151   // Collect conditions feeding internal conditional branches; they need to be
7152   // represented in VPlan for it to model masking.
7153   SmallPtrSet<Value *, 1> NeedDef;
7154 
7155   auto *Latch = OrigLoop->getLoopLatch();
7156   for (BasicBlock *BB : OrigLoop->blocks()) {
7157     if (BB == Latch)
7158       continue;
7159     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7160     if (Branch && Branch->isConditional())
7161       NeedDef.insert(Branch->getCondition());
7162   }
7163 
7164   // If the tail is to be folded by masking, the primary induction variable, if
7165   // it exists, needs to be represented in VPlan to model early-exit masking.
7166   // Also, both the Phi and the live-out instruction of each reduction are
7167   // required in order to introduce a select between them in VPlan.
7168   if (CM.foldTailByMasking()) {
7169     if (Legal->getPrimaryInduction())
7170       NeedDef.insert(Legal->getPrimaryInduction());
7171     for (auto &Reduction : Legal->getReductionVars()) {
7172       NeedDef.insert(Reduction.first);
7173       NeedDef.insert(Reduction.second.getLoopExitInstr());
7174     }
7175   }
7176 
7177   // Collect instructions from the original loop that will become trivially dead
7178   // in the vectorized loop. We don't need to vectorize these instructions. For
7179   // example, original induction update instructions can become dead because we
7180   // separately emit induction "steps" when generating code for the new loop.
7181   // Similarly, we create a new latch condition when setting up the structure
7182   // of the new loop, so the old one can become dead.
7183   SmallPtrSet<Instruction *, 4> DeadInstructions;
7184   collectTriviallyDeadInstructions(DeadInstructions);
7185 
7186   // Add assume instructions we need to drop to DeadInstructions, to prevent
7187   // them from being added to the VPlan.
7188   // TODO: We only need to drop assumes in blocks that get flattened. If the
7189   // control flow is preserved, we should keep them.
7190   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7191   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7192 
7193   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7194   // Dead instructions do not need sinking. Remove them from SinkAfter.
7195   for (Instruction *I : DeadInstructions)
7196     SinkAfter.erase(I);
7197 
7198   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7199     VFRange SubRange = {VF, MaxVF + 1};
7200     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7201                                              DeadInstructions, SinkAfter));
7202     VF = SubRange.End;
7203   }
7204 }
7205 
7206 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7207     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7208     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7209     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7210 
7211   // Hold a mapping from predicated instructions to their recipes, in order to
7212   // fix their AlsoPack behavior if a user is determined to replicate and use a
7213   // scalar instead of a vector value.
7214   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7215 
7216   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7217 
7218   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7219 
7220   // ---------------------------------------------------------------------------
7221   // Pre-construction: record ingredients whose recipes we'll need to further
7222   // process after constructing the initial VPlan.
7223   // ---------------------------------------------------------------------------
7224 
7225   // Mark instructions we'll need to sink later and their targets as
7226   // ingredients whose recipe we'll need to record.
7227   for (auto &Entry : SinkAfter) {
7228     RecipeBuilder.recordRecipeOf(Entry.first);
7229     RecipeBuilder.recordRecipeOf(Entry.second);
7230   }
7231 
7232   // For each interleave group which is relevant for this (possibly trimmed)
7233   // Range, add it to the set of groups to be later applied to the VPlan and add
7234   // placeholders for its members' Recipes which we'll be replacing with a
7235   // single VPInterleaveRecipe.
7236   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7237     auto applyIG = [IG, this](unsigned VF) -> bool {
7238       return (VF >= 2 && // Query is illegal for VF == 1
7239               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7240                   LoopVectorizationCostModel::CM_Interleave);
7241     };
7242     if (!getDecisionAndClampRange(applyIG, Range))
7243       continue;
7244     InterleaveGroups.insert(IG);
7245     for (unsigned i = 0; i < IG->getFactor(); i++)
7246       if (Instruction *Member = IG->getMember(i))
7247         RecipeBuilder.recordRecipeOf(Member);
7248   }
7249 
7250   // ---------------------------------------------------------------------------
7251   // Build initial VPlan: Scan the body of the loop in a topological order to
7252   // visit each basic block after having visited its predecessor basic blocks.
7253   // ---------------------------------------------------------------------------
7254 
7255   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7256   auto Plan = std::make_unique<VPlan>();
7257   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7258   Plan->setEntry(VPBB);
7259 
7260   // Represent values that will have defs inside VPlan.
7261   for (Value *V : NeedDef)
7262     Plan->addVPValue(V);
7263 
7264   // Scan the body of the loop in a topological order to visit each basic block
7265   // after having visited its predecessor basic blocks.
7266   LoopBlocksDFS DFS(OrigLoop);
7267   DFS.perform(LI);
7268 
7269   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7270     // Relevant instructions from basic block BB will be grouped into VPRecipe
7271     // ingredients and fill a new VPBasicBlock.
7272     unsigned VPBBsForBB = 0;
7273     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7274     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7275     VPBB = FirstVPBBForBB;
7276     Builder.setInsertPoint(VPBB);
7277 
7278     // Introduce each ingredient into VPlan.
7279     // TODO: Model and preserve debug intrinsics in VPlan.
7280     for (Instruction &I : BB->instructionsWithoutDebug()) {
7281       Instruction *Instr = &I;
7282 
7283       // First filter out irrelevant instructions, to ensure no recipes are
7284       // built for them.
7285       if (isa<BranchInst>(Instr) ||
7286           DeadInstructions.find(Instr) != DeadInstructions.end())
7287         continue;
7288 
7289       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7290         continue;
7291 
7292       // Otherwise, if all widening options failed, the instruction is to be
7293       // replicated. This may create a successor for VPBB.
7294       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7295           Instr, Range, VPBB, PredInst2Recipe, Plan);
7296       if (NextVPBB != VPBB) {
7297         VPBB = NextVPBB;
7298         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7299                                     : "");
7300       }
7301     }
7302   }
7303 
7304   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7305   // may also be empty, such as the last one (VPBB), reflecting original
7306   // basic-blocks with no recipes.
7307   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7308   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7309   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7310   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7311   delete PreEntry;
7312 
7313   // ---------------------------------------------------------------------------
7314   // Transform initial VPlan: Apply previously taken decisions, in order, to
7315   // bring the VPlan to its final state.
7316   // ---------------------------------------------------------------------------
7317 
7318   // Apply Sink-After legal constraints.
7319   for (auto &Entry : SinkAfter) {
7320     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7321     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7322     Sink->moveAfter(Target);
7323   }
7324 
7325   // Interleave memory: for each Interleave Group we marked earlier as relevant
7326   // for this VPlan, replace the Recipes widening its memory instructions with a
7327   // single VPInterleaveRecipe at its insertion point.
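       // For example, for an interleave group of factor 2 whose members are two
       // loads, both member recipes are erased below and replaced by a single
       // VPInterleaveRecipe, which will emit one wide load followed by
       // shufflevectors extracting each member.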
7328   for (auto IG : InterleaveGroups) {
7329     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7330         RecipeBuilder.getRecipe(IG->getInsertPos()));
7331     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7332         ->insertBefore(Recipe);
7333 
7334     for (unsigned i = 0; i < IG->getFactor(); ++i)
7335       if (Instruction *Member = IG->getMember(i)) {
7336         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7337       }
7338   }
7339 
7340   // Finally, if tail is folded by masking, introduce selects between the phi
7341   // and the live-out instruction of each reduction, at the end of the latch.
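       // For example, for a summation reduction this emits, at the latch
       // (types and names illustrative):
       //   %s = select <VF x i1> %mask, <VF x i32> %red.next, <VF x i32> %red.phi
       // so that lanes disabled by the tail-folding mask keep the phi's value.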
7342   if (CM.foldTailByMasking()) {
7343     Builder.setInsertPoint(VPBB);
7344     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7345     for (auto &Reduction : Legal->getReductionVars()) {
7346       VPValue *Phi = Plan->getVPValue(Reduction.first);
7347       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7348       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7349     }
7350   }
7351 
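       // Name the plan after the range of VFs it covers, e.g.
       // "Initial VPlan for VF={4,8},UF>=1".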
7352   std::string PlanName;
7353   raw_string_ostream RSO(PlanName);
7354   unsigned VF = Range.Start;
7355   Plan->addVF(VF);
7356   RSO << "Initial VPlan for VF={" << VF;
7357   for (VF *= 2; VF < Range.End; VF *= 2) {
7358     Plan->addVF(VF);
7359     RSO << "," << VF;
7360   }
7361   RSO << "},UF>=1";
7362   RSO.flush();
7363   Plan->setName(PlanName);
7364 
7365   return Plan;
7366 }
7367 
7368 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7369   // Outer loop handling: outer loops may require CFG and instruction level
7370   // transformations before even evaluating whether vectorization is profitable.
7371   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7372   // the vectorization pipeline.
7373   assert(!OrigLoop->empty() && "Expecting an outer loop.");
7374   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7375 
7376   // Create new empty VPlan
7377   auto Plan = std::make_unique<VPlan>();
7378 
7379   // Build hierarchical CFG
7380   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7381   HCFGBuilder.buildHierarchicalCFG();
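       // The resulting H-CFG wraps the loop nest in a top-level VPRegionBlock
       // whose VPBasicBlocks model the original IR instructions with
       // VPInstructions.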
7382 
7383   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7384     Plan->addVF(VF);
7385 
7386   if (EnableVPlanPredication) {
7387     VPlanPredicator VPP(*Plan);
7388     VPP.predicate();
7389 
7390     // Avoid running transformation to recipes until masked code generation in
7391     // VPlan-native path is in place.
7392     return Plan;
7393   }
7394 
7395   SmallPtrSet<Instruction *, 1> DeadInstructions;
7396   VPlanTransforms::VPInstructionsToVPRecipes(
7397       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7398   return Plan;
7399 }
7400 
7401 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
7402     Value *V, unsigned Part) {
7403   return ILV.getOrCreateVectorValue(V, Part);
7404 }
7405 
7406 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7407     Value *V, const VPIteration &Instance) {
7408   return ILV.getOrCreateScalarValue(V, Instance);
7409 }
7410 
7411 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7412                                VPSlotTracker &SlotTracker) const {
7413   O << " +\n"
7414     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7415   IG->getInsertPos()->printAsOperand(O, false);
7416   O << ", ";
7417   getAddr()->printAsOperand(O, SlotTracker);
7418   VPValue *Mask = getMask();
7419   if (Mask) {
7420     O << ", ";
7421     Mask->printAsOperand(O, SlotTracker);
7422   }
7423   O << "\\l\"";
7424   for (unsigned i = 0; i < IG->getFactor(); ++i)
7425     if (Instruction *I = IG->getMember(i))
7426       O << " +\n"
7427         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7428 }
7429 
7430 void VPWidenCallRecipe::execute(VPTransformState &State) {
7431   State.ILV->widenCallInstruction(Ingredient, User, State);
7432 }
7433 
7434 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7435   State.ILV->widenSelectInstruction(Ingredient, InvariantCond);
7436 }
7437 
7438 void VPWidenRecipe::execute(VPTransformState &State) {
7439   State.ILV->widenInstruction(Ingredient);
7440 }
7441 
7442 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7443   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7444                       IsIndexLoopInvariant);
7445 }
7446 
7447 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7448   assert(!State.Instance && "Int or FP induction being replicated.");
7449   State.ILV->widenIntOrFpInduction(IV, Trunc);
7450 }
7451 
7452 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7453   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7454 }
7455 
7456 void VPBlendRecipe::execute(VPTransformState &State) {
7457   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7458   // We know that all PHIs in non-header blocks are converted into
7459   // selects, so we don't have to worry about the insertion order and we
7460   // can just use the builder.
7461   // At this point we generate the predication tree. There may be
7462   // duplications since this is a simple recursive scan, but future
7463   // optimizations will clean it up.
7464 
7465   unsigned NumIncoming = getNumIncomingValues();
7466 
7467   // Generate a sequence of selects of the form:
7468   // SELECT(Mask3, In3,
7469   //        SELECT(Mask2, In2,
7470   //               SELECT(Mask1, In1, In0)))
7471   InnerLoopVectorizer::VectorParts Entry(State.UF);
7472   for (unsigned In = 0; In < NumIncoming; ++In) {
7473     for (unsigned Part = 0; Part < State.UF; ++Part) {
7474       // We might have single edge PHIs (blocks) - use an identity
7475       // 'select' for the first PHI operand.
7476       Value *In0 = State.get(getIncomingValue(In), Part);
7477       if (In == 0)
7478         Entry[Part] = In0; // Initialize with the first incoming value.
7479       else {
7480         // Select between the current value and the previous incoming edge
7481         // based on the incoming mask.
7482         Value *Cond = State.get(getMask(In), Part);
7483         Entry[Part] =
7484             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7485       }
7486     }
7487   }
7488   for (unsigned Part = 0; Part < State.UF; ++Part)
7489     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7490 }
7491 
7492 void VPInterleaveRecipe::execute(VPTransformState &State) {
7493   assert(!State.Instance && "Interleave group being replicated.");
7494   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7495                                       getMask());
7496 }
7497 
7498 void VPReplicateRecipe::execute(VPTransformState &State) {
7499   if (State.Instance) { // Generate a single instance.
7500     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7501     // Insert scalar instance packing it into a vector.
7502     if (AlsoPack && State.VF > 1) {
7503       // If we're constructing lane 0, initialize to start from undef.
7504       if (State.Instance->Lane == 0) {
7505         Value *Undef =
7506             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7507         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7508       }
7509       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7510     }
7511     return;
7512   }
7513 
7514   // Generate scalar instances for all VF lanes of all UF parts, unless the
7515   // instruction is uniform, in which case generate only the first lane for
7516   // each of the UF parts.
7517   unsigned EndLane = IsUniform ? 1 : State.VF;
7518   for (unsigned Part = 0; Part < State.UF; ++Part)
7519     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7520       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7521 }
7522 
7523 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7524   assert(State.Instance && "Branch on Mask works only on single instance.");
7525 
7526   unsigned Part = State.Instance->Part;
7527   unsigned Lane = State.Instance->Lane;
7528 
7529   Value *ConditionBit = nullptr;
7530   if (!User) // Block in mask is all-one.
7531     ConditionBit = State.Builder.getTrue();
7532   else {
7533     VPValue *BlockInMask = User->getOperand(0);
7534     ConditionBit = State.get(BlockInMask, Part);
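         // The block-in mask is per-lane; extract this lane's bit to obtain a
         // scalar condition for the branch.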
7535     if (ConditionBit->getType()->isVectorTy())
7536       ConditionBit = State.Builder.CreateExtractElement(
7537           ConditionBit, State.Builder.getInt32(Lane));
7538   }
7539 
7540   // Replace the temporary unreachable terminator with a new conditional branch,
7541   // whose two destinations will be set later when they are created.
7542   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7543   assert(isa<UnreachableInst>(CurrentTerminator) &&
7544          "Expected to replace unreachable terminator with conditional branch.");
7545   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7546   CondBr->setSuccessor(0, nullptr);
7547   ReplaceInstWithInst(CurrentTerminator, CondBr);
7548 }
7549 
7550 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7551   assert(State.Instance && "Predicated instruction PHI works per instance.");
7552   Instruction *ScalarPredInst = cast<Instruction>(
7553       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7554   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7555   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7556   assert(PredicatingBB && "Predicated block has no single predecessor.");
7557 
7558   // By current pack/unpack logic we need to generate only a single phi node: if
7559   // a vector value for the predicated instruction exists at this point it means
7560   // the instruction has vector users only, and a phi for the vector value is
7561   // needed. In this case the recipe of the predicated instruction is marked to
7562   // also do that packing, thereby "hoisting" the insert-element sequence.
7563   // Otherwise, a phi node for the scalar value is needed.
7564   unsigned Part = State.Instance->Part;
7565   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7566     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7567     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7568     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7569     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7570     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7571     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7572   } else {
7573     Type *PredInstType = PredInst->getType();
7574     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7575     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7576     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7577     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7578   }
7579 }
7580 
7581 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7582   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7583   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7584                                         getMask());
7585 }
7586 
7587 // Determine how to lower the scalar epilogue, which depends on 1) optimizing
7588 // for minimum code-size, 2) predication compiler options, 3) loop hints forcing
7589 // predication, and 4) a TTI hook that analyzes whether the loop is suitable
7590 // for predication.
7591 static ScalarEpilogueLowering getScalarEpilogueLowering(
7592     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7593     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7594     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7595     LoopVectorizationLegality &LVL) {
7596   bool OptSize =
7597       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7598                                                      PGSOQueryType::IRPass);
7599   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7600   // don't look at hints or options, and don't request a scalar epilogue.
7601   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7602     return CM_ScalarEpilogueNotAllowedOptSize;
7603 
7604   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7605                               !PreferPredicateOverEpilog;
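       // (i.e. the PreferPredicateOverEpilog option was specified explicitly on
       // the command line and evaluates to false.)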
7606 
7607   // 2) Next, if disabling predication is requested on the command line, honour
7608   // this and request a scalar epilogue.
7609   if (PredicateOptDisabled)
7610     return CM_ScalarEpilogueAllowed;
7611 
7612   // 3) and 4) Check whether predication is requested on the command line or
7613   // via a loop hint, or whether the TTI hook indicates it is profitable; if
7614   // so, request predication.
7615   if (PreferPredicateOverEpilog ||
7616       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7617       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7618                                         LVL.getLAI()) &&
7619        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7620     return CM_ScalarEpilogueNotNeededUsePredicate;
7621 
7622   return CM_ScalarEpilogueAllowed;
7623 }
7624 
7625 // Process the loop in the VPlan-native vectorization path. This path builds
7626 // VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
7627 // transformations to be applied from the very beginning without modifying the
7628 // input LLVM IR.
7629 static bool processLoopInVPlanNativePath(
7630     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7631     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7632     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7633     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7634     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7635 
7636   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7637   Function *F = L->getHeader()->getParent();
7638   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7639 
7640   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7641       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7642 
7643   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7644                                 &Hints, IAI);
7645   // Use the planner for outer loop vectorization.
7646   // TODO: CM is not used at this point inside the planner. Turn CM into an
7647   // optional argument if we don't need it in the future.
7648   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7649 
7650   // Get user vectorization factor.
7651   const unsigned UserVF = Hints.getWidth();
7652 
7653   // Plan how to best vectorize, return the best VF and its cost.
7654   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7655 
7656   // If we are stress testing VPlan builds, do not attempt to generate vector
7657   // code. Masked vector code generation support will follow soon.
7658   // Also, do not attempt to vectorize if no vector code will be produced.
7659   if (VPlanBuildStressTest || EnableVPlanPredication ||
7660       VectorizationFactor::Disabled() == VF)
7661     return false;
7662 
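       // The VPlan-native path does not interleave yet, so use an unroll factor
       // (UF) of 1.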
7663   LVP.setBestPlan(VF.Width, 1);
7664 
7665   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7666                          &CM);
7667   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7668                     << L->getHeader()->getParent()->getName() << "\"\n");
7669   LVP.executePlan(LB, DT);
7670 
7671   // Mark the loop as already vectorized to avoid vectorizing again.
7672   Hints.setAlreadyVectorized();
7673 
7674   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7675   return true;
7676 }
7677 
7678 bool LoopVectorizePass::processLoop(Loop *L) {
7679   assert((EnableVPlanNativePath || L->empty()) &&
7680          "VPlan-native path is not enabled. Only process inner loops.");
7681 
7682 #ifndef NDEBUG
7683   const std::string DebugLocStr = getDebugLocString(L);
7684 #endif /* NDEBUG */
7685 
7686   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7687                     << L->getHeader()->getParent()->getName() << "\" from "
7688                     << DebugLocStr << "\n");
7689 
7690   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7691 
7692   LLVM_DEBUG(
7693       dbgs() << "LV: Loop hints:"
7694              << " force="
7695              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7696                      ? "disabled"
7697                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7698                             ? "enabled"
7699                             : "?"))
7700              << " width=" << Hints.getWidth()
7701              << " unroll=" << Hints.getInterleave() << "\n");
7702 
7703   // Function containing loop
7704   Function *F = L->getHeader()->getParent();
7705 
7706   // Looking at the diagnostic output is the only way to determine if a loop
7707   // was vectorized (other than looking at the IR or machine code), so it
7708   // is important to generate an optimization remark for each loop. Most of
7709   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7710   // generated as OptimizationRemark and OptimizationRemarkMissed are less
7711   // verbose; they report vectorized loops and unvectorized loops that may
7712   // benefit from vectorization, respectively.
7713 
7714   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7715     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7716     return false;
7717   }
7718 
7719   PredicatedScalarEvolution PSE(*SE, *L);
7720 
7721   // Check if it is legal to vectorize the loop.
7722   LoopVectorizationRequirements Requirements(*ORE);
7723   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7724                                 &Requirements, &Hints, DB, AC);
7725   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7726     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7727     Hints.emitRemarkWithHints();
7728     return false;
7729   }
7730 
7731   // Check the function attributes and profiles to find out if this function
7732   // should be optimized for size.
7733   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7734       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7735 
7736   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7737   // here. They may require CFG and instruction level transformations before
7738   // even evaluating whether vectorization is profitable. Since we cannot modify
7739   // the incoming IR, we need to build VPlan upfront in the vectorization
7740   // pipeline.
7741   if (!L->empty())
7742     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7743                                         ORE, BFI, PSI, Hints);
7744 
7745   assert(L->empty() && "Inner loop expected.");
7746 
7747   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7748   // count by optimizing for size, to minimize overheads.
7749   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7750   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7751     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7752                       << "This loop is worth vectorizing only if no scalar "
7753                       << "iteration overheads are incurred.");
7754     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7755       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7756     else {
7757       LLVM_DEBUG(dbgs() << "\n");
7758       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7759     }
7760   }
7761 
7762   // Check the function attributes to see if implicit floats are allowed.
7763   // FIXME: This check doesn't seem like it can possibly be correct -- what if
7764   // the loop is an integer loop and the vector instructions selected are
7765   // purely integer vector instructions?
7766   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7767     reportVectorizationFailure(
7768         "Can't vectorize when the NoImplicitFloat attribute is used",
7769         "loop not vectorized due to NoImplicitFloat attribute",
7770         "NoImplicitFloat", ORE, L);
7771     Hints.emitRemarkWithHints();
7772     return false;
7773   }
7774 
7775   // Check if the target supports potentially unsafe FP vectorization.
7776   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7777   // for the target we're vectorizing for, to make sure none of the
7778   // additional fp-math flags can help.
7779   if (Hints.isPotentiallyUnsafe() &&
7780       TTI->isFPVectorizationPotentiallyUnsafe()) {
7781     reportVectorizationFailure(
7782         "Potentially unsafe FP op prevents vectorization",
7783         "loop not vectorized due to unsafe FP support.",
7784         "UnsafeFP", ORE, L);
7785     Hints.emitRemarkWithHints();
7786     return false;
7787   }
7788 
7789   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7790   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7791 
7792   // If an override option has been passed in for interleaved accesses, use it.
7793   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7794     UseInterleaved = EnableInterleavedMemAccesses;
7795 
7796   // Analyze interleaved memory accesses.
7797   if (UseInterleaved) {
7798     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7799   }
7800 
7801   // Use the cost model.
7802   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7803                                 F, &Hints, IAI);
7804   CM.collectValuesToIgnore();
7805 
7806   // Use the planner for vectorization.
7807   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
7808 
7809   // Get user vectorization factor.
7810   unsigned UserVF = Hints.getWidth();
7811 
7812   // Plan how to best vectorize, return the best VF and its cost.
7813   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7814 
7815   VectorizationFactor VF = VectorizationFactor::Disabled();
7816   unsigned IC = 1;
7817   unsigned UserIC = Hints.getInterleave();
7818 
7819   if (MaybeVF) {
7820     VF = *MaybeVF;
7821     // Select the interleave count.
7822     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7823   }
7824 
7825   // Identify the diagnostic messages that should be produced.
7826   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7827   bool VectorizeLoop = true, InterleaveLoop = true;
7828   if (Requirements.doesNotMeet(F, L, Hints)) {
7829     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7830                          "requirements.\n");
7831     Hints.emitRemarkWithHints();
7832     return false;
7833   }
7834 
7835   if (VF.Width == 1) {
7836     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7837     VecDiagMsg = std::make_pair(
7838         "VectorizationNotBeneficial",
7839         "the cost-model indicates that vectorization is not beneficial");
7840     VectorizeLoop = false;
7841   }
7842 
7843   if (!MaybeVF && UserIC > 1) {
7844     // Tell the user interleaving was avoided up-front, despite being explicitly
7845     // requested.
7846     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7847                          "interleaving should be avoided up front\n");
7848     IntDiagMsg = std::make_pair(
7849         "InterleavingAvoided",
7850         "Ignoring UserIC, because interleaving was avoided up front");
7851     InterleaveLoop = false;
7852   } else if (IC == 1 && UserIC <= 1) {
7853     // Tell the user interleaving is not beneficial.
7854     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7855     IntDiagMsg = std::make_pair(
7856         "InterleavingNotBeneficial",
7857         "the cost-model indicates that interleaving is not beneficial");
7858     InterleaveLoop = false;
7859     if (UserIC == 1) {
7860       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7861       IntDiagMsg.second +=
7862           " and is explicitly disabled or interleave count is set to 1";
7863     }
7864   } else if (IC > 1 && UserIC == 1) {
7865     // Tell the user interleaving is beneficial, but it is explicitly disabled.
7866     LLVM_DEBUG(
7867         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7868     IntDiagMsg = std::make_pair(
7869         "InterleavingBeneficialButDisabled",
7870         "the cost-model indicates that interleaving is beneficial "
7871         "but is explicitly disabled or interleave count is set to 1");
7872     InterleaveLoop = false;
7873   }
7874 
7875   // Override IC if user provided an interleave count.
7876   IC = UserIC > 0 ? UserIC : IC;
7877 
7878   // Emit diagnostic messages, if any.
7879   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7880   if (!VectorizeLoop && !InterleaveLoop) {
7881     // Do not vectorize or interleave the loop.
7882     ORE->emit([&]() {
7883       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7884                                       L->getStartLoc(), L->getHeader())
7885              << VecDiagMsg.second;
7886     });
7887     ORE->emit([&]() {
7888       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7889                                       L->getStartLoc(), L->getHeader())
7890              << IntDiagMsg.second;
7891     });
7892     return false;
7893   } else if (!VectorizeLoop && InterleaveLoop) {
7894     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7895     ORE->emit([&]() {
7896       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7897                                         L->getStartLoc(), L->getHeader())
7898              << VecDiagMsg.second;
7899     });
7900   } else if (VectorizeLoop && !InterleaveLoop) {
7901     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7902                       << ") in " << DebugLocStr << '\n');
7903     ORE->emit([&]() {
7904       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7905                                         L->getStartLoc(), L->getHeader())
7906              << IntDiagMsg.second;
7907     });
7908   } else if (VectorizeLoop && InterleaveLoop) {
7909     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7910                       << ") in " << DebugLocStr << '\n');
7911     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7912   }
7913 
7914   LVP.setBestPlan(VF.Width, IC);
7915 
7916   using namespace ore;
7917   bool DisableRuntimeUnroll = false;
7918   MDNode *OrigLoopID = L->getLoopID();
7919 
7920   if (!VectorizeLoop) {
7921     assert(IC > 1 && "interleave count should not be 1 or 0");
7922     // If we decided that it is not beneficial to vectorize the loop, then
7923     // interleave it.
7924     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7925                                &CM);
7926     LVP.executePlan(Unroller, DT);
7927 
7928     ORE->emit([&]() {
7929       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7930                                 L->getHeader())
7931              << "interleaved loop (interleaved count: "
7932              << NV("InterleaveCount", IC) << ")";
7933     });
7934   } else {
7935     // If we decided that vectorizing the loop is legal and profitable, do it.
7936     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7937                            &LVL, &CM);
7938     LVP.executePlan(LB, DT);
7939     ++LoopsVectorized;
7940 
7941     // Add metadata to disable runtime unrolling a scalar loop when there are
7942     // no runtime checks about strides and memory. A scalar loop that is
7943     // rarely used is not worth unrolling.
7944     if (!LB.areSafetyChecksAdded())
7945       DisableRuntimeUnroll = true;
7946 
7947     // Report the vectorization decision.
7948     ORE->emit([&]() {
7949       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7950                                 L->getHeader())
7951              << "vectorized loop (vectorization width: "
7952              << NV("VectorizationFactor", VF.Width)
7953              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7954     });
7955   }
7956 
7957   Optional<MDNode *> RemainderLoopID =
7958       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7959                                       LLVMLoopVectorizeFollowupEpilogue});
7960   if (RemainderLoopID.hasValue()) {
7961     L->setLoopID(RemainderLoopID.getValue());
7962   } else {
7963     if (DisableRuntimeUnroll)
7964       AddRuntimeUnrollDisableMetaData(L);
7965 
7966     // Mark the loop as already vectorized to avoid vectorizing again.
7967     Hints.setAlreadyVectorized();
7968   }
7969 
7970   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7971   return true;
7972 }
7973 
7974 bool LoopVectorizePass::runImpl(
7975     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7976     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7977     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7978     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7979     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7980   SE = &SE_;
7981   LI = &LI_;
7982   TTI = &TTI_;
7983   DT = &DT_;
7984   BFI = &BFI_;
7985   TLI = TLI_;
7986   AA = &AA_;
7987   AC = &AC_;
7988   GetLAA = &GetLAA_;
7989   DB = &DB_;
7990   ORE = &ORE_;
7991   PSI = PSI_;
7992 
7993   // Don't attempt if
7994   // 1. the target claims to have no vector registers, and
7995   // 2. interleaving won't help ILP.
7996   //
7997   // The second condition is necessary because, even if the target has no
7998   // vector registers, loop vectorization may still enable scalar
7999   // interleaving.
8000   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8001       TTI->getMaxInterleaveFactor(1) < 2)
8002     return false;
8003 
8004   bool Changed = false;
8005 
8006   // The vectorizer requires loops to be in simplified form.
8007   // Since simplification may add new inner loops, it has to run before the
8008   // legality and profitability checks. This means running the loop vectorizer
8009   // will simplify all loops, regardless of whether anything ends up being
8010   // vectorized.
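       // LCSSA is (re)formed for each candidate loop just before it is processed,
       // so it need not be preserved here.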
8011   for (auto &L : *LI)
8012     Changed |=
8013         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8014 
8015   // Build up a worklist of inner-loops to vectorize. This is necessary as
8016   // the act of vectorizing or partially unrolling a loop creates new loops
8017   // and can invalidate iterators across the loops.
8018   SmallVector<Loop *, 8> Worklist;
8019 
8020   for (Loop *L : *LI)
8021     collectSupportedLoops(*L, LI, ORE, Worklist);
8022 
8023   LoopsAnalyzed += Worklist.size();
8024 
8025   // Now walk the identified inner loops.
8026   while (!Worklist.empty()) {
8027     Loop *L = Worklist.pop_back_val();
8028 
8029     // For the inner loops we actually process, form LCSSA to simplify the
8030     // transform.
8031     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8032 
8033     Changed |= processLoop(L);
8034   }
8035 
8036   // Process each loop nest in the function.
8037   return Changed;
8038 }
8039 
8040 PreservedAnalyses LoopVectorizePass::run(Function &F,
8041                                          FunctionAnalysisManager &AM) {
8042   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
8043   auto &LI = AM.getResult<LoopAnalysis>(F);
8044   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
8045   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
8046   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
8047   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
8048   auto &AA = AM.getResult<AAManager>(F);
8049   auto &AC = AM.getResult<AssumptionAnalysis>(F);
8050   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
8051   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
8052   MemorySSA *MSSA = EnableMSSALoopDependency
8053                         ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
8054                         : nullptr;
8055 
8056   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
8057   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
8058       [&](Loop &L) -> const LoopAccessInfo & {
8059     LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
8060     return LAM.getResult<LoopAccessAnalysis>(L, AR);
8061   };
8062   const ModuleAnalysisManager &MAM =
8063       AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
8064   ProfileSummaryInfo *PSI =
8065       MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
8066   bool Changed =
8067       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
8068   if (!Changed)
8069     return PreservedAnalyses::all();
8070   PreservedAnalyses PA;
8071 
8072   // We currently do not preserve loopinfo/dominator analyses with outer loop
8073   // vectorization. Until this is addressed, mark these analyses as preserved
8074   // only for non-VPlan-native path.
8075   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
8076   if (!EnableVPlanNativePath) {
8077     PA.preserve<LoopAnalysis>();
8078     PA.preserve<DominatorTreeAnalysis>();
8079   }
8080   PA.preserve<BasicAA>();
8081   PA.preserve<GlobalsAA>();
8082   return PA;
8083 }
8084