1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
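// As an illustration, here is a rough sketch (C-like pseudocode, not actual
// IR) of the transformation on a simple loop, assuming a vectorization
// factor of 4:
//
//   // Original scalar loop:
//   for (i = 0; i < n; i += 1)
//     A[i] = B[i] + C[i];
//
//   // Conceptually, after vectorization: one 'wide' iteration covers four
//   // scalar iterations, and a scalar epilogue handles the remaining
//   // n % 4 iterations.
//   for (i = 0; i + 4 <= n; i += 4)
//     A[i:i+3] = B[i:i+3] + C[i:i+3];
//   for (; i < n; i += 1)
//     A[i] = B[i] + C[i];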
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
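// A usage sketch (the exact invocation depends on the build and driver): the
// flag can be passed to 'opt' directly, or through clang via -mllvm, e.g.
//   opt -passes=loop-vectorize -vectorizer-min-trip-count=4 -S input.ll
//   clang -O2 -mllvm -vectorizer-min-trip-count=4 input.c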
180 
181 // Indicates that an epilogue is undesired and that predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
192              "will be determined by the smallest type in loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
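// For example (an illustrative sketch): loads of A[3*i] and A[3*i + 2] form
// an interleave group with factor 3 and a gap at A[3*i + 1]. The wide load
// covers the gap lane as well, so that lane must either be masked away (when
// masked interleaved accesses are enabled) or be handled by keeping a scalar
// epilogue.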
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with an estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if-predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
281 // VPlan-native vectorization path. It must be used in conjunction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
298 /// A helper function that returns the type of loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = VectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
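// For illustration (a sketch under common data layouts, not a guarantee for
// every target): x86_fp80 typically has a type size of 80 bits but an alloc
// size of 96 or 128 bits, so consecutive array elements contain padding and
// the type is irregular; i32 has matching alloc and store sizes and is
// regular.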
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors, or multiple
379 /// scalars. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
388 /// LoopVectorizationLegality class to provide information about the induction
389 /// and reduction variables that were found for a given vectorization factor.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM)
399       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
400         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
401         Builder(PSE.getSE()->getContext()),
402         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
403   virtual ~InnerLoopVectorizer() = default;
404 
405   /// Create a new empty loop. Unlink the old loop and connect the new one.
406   /// Return the pre-header block of the new loop.
407   BasicBlock *createVectorizedLoopSkeleton();
408 
409   /// Widen a single instruction within the innermost loop.
410   void widenInstruction(Instruction &I);
411 
412   /// Widen a single call instruction within the innermost loop.
413   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
414                             VPTransformState &State);
415 
416   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
417   void fixVectorizedLoop();
418 
419   // Return true if any runtime check is added.
420   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
421 
422   /// A type for vectorized values in the new loop. Each value from the
423   /// original loop, when vectorized, is represented by UF vector values in the
424   /// new unrolled loop, where UF is the unroll factor.
425   using VectorParts = SmallVector<Value *, 2>;
426 
427   /// Vectorize a single GetElementPtrInst based on information gathered and
428   /// decisions taken during planning.
429   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
430                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
431 
432   /// Vectorize a single PHINode in a block. This method handles the induction
433   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
434   /// arbitrary length vectors.
435   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
436 
437   /// A helper function to scalarize a single Instruction in the innermost loop.
438   /// Generates a sequence of scalar instances for each lane between \p MinLane
439   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
440   /// inclusive.
441   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
442                             bool IfPredicateInstr);
443 
444   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
445   /// is provided, the integer induction variable will first be truncated to
446   /// the corresponding type.
447   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
448 
449   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
450   /// vector or scalar value on-demand if one is not yet available. When
451   /// vectorizing a loop, we visit the definition of an instruction before its
452   /// uses. When visiting the definition, we either vectorize or scalarize the
453   /// instruction, creating an entry for it in the corresponding map. (In some
454   /// cases, such as induction variables, we will create both vector and scalar
455   /// entries.) Then, as we encounter uses of the definition, we derive values
456   /// for each scalar or vector use unless such a value is already available.
457   /// For example, if we scalarize a definition and one of its uses is vector,
458   /// we build the required vector on-demand with an insertelement sequence
459   /// when visiting the use. Otherwise, if the use is scalar, we can use the
460   /// existing scalar definition.
461   ///
462   /// Return a value in the new loop corresponding to \p V from the original
463   /// loop at unroll index \p Part. If the value has already been vectorized,
464   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
465   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
466   /// a new vector value on-demand by inserting the scalar values into a vector
467   /// with an insertelement sequence. If the value has been neither vectorized
468   /// nor scalarized, it must be loop invariant, so we simply broadcast the
469   /// value into a vector.
470   Value *getOrCreateVectorValue(Value *V, unsigned Part);
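  // For illustration, a sketch in pseudo-IR (names such as %x.lane0 are
  // illustrative; the real instructions are created on demand) of packing a
  // scalarized definition %x into a vector for a vector use at VF = 4:
  //   %v0 = insertelement <4 x i32> undef, i32 %x.lane0, i32 0
  //   %v1 = insertelement <4 x i32> %v0,   i32 %x.lane1, i32 1
  //   %v2 = insertelement <4 x i32> %v1,   i32 %x.lane2, i32 2
  //   %v3 = insertelement <4 x i32> %v2,   i32 %x.lane3, i32 3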
471 
472   /// Return a value in the new loop corresponding to \p V from the original
473   /// loop at unroll and vector indices \p Instance. If the value has been
474   /// vectorized but not scalarized, the necessary extractelement instruction
475   /// will be generated.
476   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
477 
478   /// Construct the vector value of a scalarized value \p V one lane at a time.
479   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
480 
481   /// Try to vectorize the interleaved access group that \p Instr belongs to
482   /// with the base address given in \p Addr, optionally masking the vector
483   /// operations if \p BlockInMask is non-null. Use \p State to translate given
484   /// VPValues to IR values in the vectorized loop.
485   void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
486                                 VPValue *Addr, VPValue *BlockInMask = nullptr);
487 
488   /// Vectorize Load and Store instructions with the base address given in \p
489   /// Addr, optionally masking the vector operations if \p BlockInMask is
490   /// non-null. Use \p State to translate given VPValues to IR values in the
491   /// vectorized loop.
492   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
493                                   VPValue *Addr, VPValue *StoredValue,
494                                   VPValue *BlockInMask);
495 
496   /// Set the debug location in the builder using the debug location in
497   /// the instruction.
498   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
499 
500   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
501   void fixNonInductionPHIs(void);
502 
503 protected:
504   friend class LoopVectorizationPlanner;
505 
506   /// A small list of PHINodes.
507   using PhiVector = SmallVector<PHINode *, 4>;
508 
509   /// A type for scalarized values in the new loop. Each value from the
510   /// original loop, when scalarized, is represented by UF x VF scalar values
511   /// in the new unrolled loop, where UF is the unroll factor and VF is the
512   /// vectorization factor.
513   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
514 
515   /// Set up the values of the IVs correctly when exiting the vector loop.
516   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
517                     Value *CountRoundDown, Value *EndValue,
518                     BasicBlock *MiddleBlock);
519 
520   /// Create a new induction variable inside L.
521   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
522                                    Value *Step, Instruction *DL);
523 
524   /// Handle all cross-iteration phis in the header.
525   void fixCrossIterationPHIs();
526 
527   /// Fix a first-order recurrence. This is the second phase of vectorizing
528   /// this phi node.
529   void fixFirstOrderRecurrence(PHINode *Phi);
530 
531   /// Fix a reduction cross-iteration phi. This is the second phase of
532   /// vectorizing this phi node.
533   void fixReduction(PHINode *Phi);
534 
535   /// Clear NSW/NUW flags from reduction instructions if necessary.
536   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
537 
538   /// The loop exit block may have single-value PHI nodes with some
539   /// incoming value. While vectorizing, we only handled real values
540   /// that were defined inside the loop, and we should have one value for
541   /// each predecessor of its parent basic block. See PR14725.
542   void fixLCSSAPHIs();
543 
544   /// Iteratively sink the scalarized operands of a predicated instruction into
545   /// the block that was created for it.
546   void sinkScalarOperands(Instruction *PredInst);
547 
548   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
549   /// represented as.
550   void truncateToMinimalBitwidths();
551 
552   /// Create a broadcast instruction. This method generates a broadcast
553   /// instruction (shuffle) for loop invariant values and for the induction
554   /// value. If this is the induction variable, then we extend it to N, N+1, ...
555   /// This is needed because each iteration in the loop corresponds to a SIMD
556   /// element.
557   virtual Value *getBroadcastInstrs(Value *V);
558 
559   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
560   /// to each vector element of Val. The sequence starts at StartIdx.
561   /// \p Opcode is relevant for FP induction variable.
562   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
563                                Instruction::BinaryOps Opcode =
564                                Instruction::BinaryOpsEnd);
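  // For illustration (a sketch; the concrete IR depends on the induction
  // descriptor): with VF = 4, StartIdx = 0 and Step = 1, a broadcast value
  // <i, i, i, i> becomes <i+0, i+1, i+2, i+3>, i.e. one lane per consecutive
  // scalar iteration.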
565 
566   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
567   /// variable on which to base the steps, \p Step is the size of the step, and
568   /// \p EntryVal is the value from the original loop that maps to the steps.
569   /// Note that \p EntryVal doesn't have to be an induction variable - it
570   /// can also be a truncate instruction.
571   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
572                         const InductionDescriptor &ID);
573 
574   /// Create a vector induction phi node based on an existing scalar one. \p
575   /// EntryVal is the value from the original loop that maps to the vector phi
576   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
577   /// truncate instruction, instead of widening the original IV, we widen a
578   /// version of the IV truncated to \p EntryVal's type.
579   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
580                                        Value *Step, Instruction *EntryVal);
581 
582   /// Returns true if an instruction \p I should be scalarized instead of
583   /// vectorized for the chosen vectorization factor.
584   bool shouldScalarizeInstruction(Instruction *I) const;
585 
586   /// Returns true if we should generate a scalar version of \p IV.
587   bool needsScalarInduction(Instruction *IV) const;
588 
589   /// If there is a cast involved in the induction variable \p ID, which should
590   /// be ignored in the vectorized loop body, this function records the
591   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
592   /// cast. We have already proven that the casted Phi is equal to the uncasted
593   /// Phi in the vectorized loop (under a runtime guard), and therefore
594   /// there is no need to vectorize the cast - the same value can be used in the
595   /// vector loop for both the Phi and the cast.
596   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
597   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
598   ///
599   /// \p EntryVal is the value from the original loop that maps to the vector
600   /// phi node and is used to distinguish what is the IV currently being
601   /// processed - original one (if \p EntryVal is a phi corresponding to the
602   /// original IV) or the "newly-created" one based on the proof mentioned above
603   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
604   /// latter case \p EntryVal is a TruncInst and we must not record anything for
605   /// that IV, but it's error-prone to expect callers of this routine to care
606   /// about that, hence this explicit parameter.
607   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
608                                              const Instruction *EntryVal,
609                                              Value *VectorLoopValue,
610                                              unsigned Part,
611                                              unsigned Lane = UINT_MAX);
612 
613   /// Generate a shuffle sequence that will reverse the vector Vec.
614   virtual Value *reverseVector(Value *Vec);
615 
616   /// Returns (and creates if needed) the original loop trip count.
617   Value *getOrCreateTripCount(Loop *NewLoop);
618 
619   /// Returns (and creates if needed) the trip count of the widened loop.
620   Value *getOrCreateVectorTripCount(Loop *NewLoop);
621 
622   /// Returns a bitcasted value to the requested vector type.
623   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
624   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
625                                 const DataLayout &DL);
626 
627   /// Emit a bypass check to see if the vector trip count is zero, including if
628   /// it overflows.
629   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
630 
631   /// Emit a bypass check to see if all of the SCEV assumptions we've
632   /// had to make are correct.
633   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
634 
635   /// Emit bypass checks to check any memory assumptions we may have made.
636   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
637 
638   /// Compute the transformed value of Index at offset StartValue using step
639   /// StepValue.
640   /// For integer induction, returns StartValue + Index * StepValue.
641   /// For pointer induction, returns StartValue[Index * StepValue].
642   /// FIXME: The newly created binary instructions should contain nsw/nuw
643   /// flags, which can be found from the original scalar operations.
644   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
645                               const DataLayout &DL,
646                               const InductionDescriptor &ID) const;
647 
648   /// Add additional metadata to \p To that was not present on \p Orig.
649   ///
650   /// Currently this is used to add the noalias annotations based on the
651   /// inserted memchecks.  Use this for instructions that are *cloned* into the
652   /// vector loop.
653   void addNewMetadata(Instruction *To, const Instruction *Orig);
654 
655   /// Add metadata from one instruction to another.
656   ///
657   /// This includes both the original MDs from \p From and additional ones (\see
658   /// addNewMetadata).  Use this for *newly created* instructions in the vector
659   /// loop.
660   void addMetadata(Instruction *To, Instruction *From);
661 
662   /// Similar to the previous function but it adds the metadata to a
663   /// vector of instructions.
664   void addMetadata(ArrayRef<Value *> To, Instruction *From);
665 
666   /// The original loop.
667   Loop *OrigLoop;
668 
669   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
670   /// dynamic knowledge to simplify SCEV expressions and converts them to a
671   /// more usable form.
672   PredicatedScalarEvolution &PSE;
673 
674   /// Loop Info.
675   LoopInfo *LI;
676 
677   /// Dominator Tree.
678   DominatorTree *DT;
679 
680   /// Alias Analysis.
681   AliasAnalysis *AA;
682 
683   /// Target Library Info.
684   const TargetLibraryInfo *TLI;
685 
686   /// Target Transform Info.
687   const TargetTransformInfo *TTI;
688 
689   /// Assumption Cache.
690   AssumptionCache *AC;
691 
692   /// Interface to emit optimization remarks.
693   OptimizationRemarkEmitter *ORE;
694 
695   /// LoopVersioning.  It's only set up (non-null) if memchecks were
696   /// used.
697   ///
698   /// This is currently only used to add no-alias metadata based on the
699   /// memchecks.  The actual versioning is performed manually.
700   std::unique_ptr<LoopVersioning> LVer;
701 
702   /// The vectorization SIMD factor to use. Each vector will have this many
703   /// vector elements.
704   unsigned VF;
705 
706   /// The vectorization unroll factor to use. Each scalar is vectorized to this
707   /// many different vector instructions.
708   unsigned UF;
709 
710   /// The builder that we use
711   IRBuilder<> Builder;
712 
713   // --- Vectorization state ---
714 
715   /// The vector-loop preheader.
716   BasicBlock *LoopVectorPreHeader;
717 
718   /// The scalar-loop preheader.
719   BasicBlock *LoopScalarPreHeader;
720 
721   /// Middle block between the vector and the scalar loop.
722   BasicBlock *LoopMiddleBlock;
723 
724   /// The ExitBlock of the scalar loop.
725   BasicBlock *LoopExitBlock;
726 
727   /// The vector loop body.
728   BasicBlock *LoopVectorBody;
729 
730   /// The scalar loop body.
731   BasicBlock *LoopScalarBody;
732 
733   /// A list of all bypass blocks. The first block is the entry of the loop.
734   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
735 
736   /// The new Induction variable which was added to the new block.
737   PHINode *Induction = nullptr;
738 
739   /// The induction variable of the old basic block.
740   PHINode *OldInduction = nullptr;
741 
742   /// Maps values from the original loop to their corresponding values in the
743   /// vectorized loop. A key value can map to either vector values, scalar
744   /// values or both kinds of values, depending on whether the key was
745   /// vectorized and scalarized.
746   VectorizerValueMap VectorLoopValueMap;
747 
748   /// Store instructions that were predicated.
749   SmallVector<Instruction *, 4> PredicatedInstructions;
750 
751   /// Trip count of the original loop.
752   Value *TripCount = nullptr;
753 
754   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
755   Value *VectorTripCount = nullptr;
756 
757   /// The legality analysis.
758   LoopVectorizationLegality *Legal;
759 
760   /// The profitability analysis.
761   LoopVectorizationCostModel *Cost;
762 
763   // Record whether runtime checks are added.
764   bool AddedSafetyChecks = false;
765 
766   // Holds the end values for each induction variable. We save the end values
767   // so we can later fix-up the external users of the induction variables.
768   DenseMap<PHINode *, Value *> IVEndValues;
769 
770   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
771   // fixed up at the end of vector code generation.
772   SmallVector<PHINode *, 8> OrigPHIsToFix;
773 };
774 
775 class InnerLoopUnroller : public InnerLoopVectorizer {
776 public:
777   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
778                     LoopInfo *LI, DominatorTree *DT,
779                     const TargetLibraryInfo *TLI,
780                     const TargetTransformInfo *TTI, AssumptionCache *AC,
781                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
782                     LoopVectorizationLegality *LVL,
783                     LoopVectorizationCostModel *CM)
784       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
785                             UnrollFactor, LVL, CM) {}
786 
787 private:
788   Value *getBroadcastInstrs(Value *V) override;
789   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
790                        Instruction::BinaryOps Opcode =
791                        Instruction::BinaryOpsEnd) override;
792   Value *reverseVector(Value *Vec) override;
793 };
794 
795 } // end namespace llvm
796 
797 /// Look for a meaningful debug location on the instruction or its
798 /// operands.
799 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
800   if (!I)
801     return I;
802 
803   DebugLoc Empty;
804   if (I->getDebugLoc() != Empty)
805     return I;
806 
807   for (Use &Op : I->operands()) {
808     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
809       if (OpInst->getDebugLoc() != Empty)
810         return OpInst;
811   }
812 
813   return I;
814 }
815 
816 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
817   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
818     const DILocation *DIL = Inst->getDebugLoc();
819     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
820         !isa<DbgInfoIntrinsic>(Inst)) {
821       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
822       if (NewDIL)
823         B.SetCurrentDebugLocation(NewDIL.getValue());
824       else
825         LLVM_DEBUG(dbgs()
826                    << "Failed to create new discriminator: "
827                    << DIL->getFilename() << " Line: " << DIL->getLine());
828     }
829     else
830       B.SetCurrentDebugLocation(DIL);
831   } else
832     B.SetCurrentDebugLocation(DebugLoc());
833 }
834 
835 /// Write a record \p DebugMsg about vectorization failure to the debug
836 /// output stream. If \p I is passed, it is an instruction that prevents
837 /// vectorization.
838 #ifndef NDEBUG
839 static void debugVectorizationFailure(const StringRef DebugMsg,
840     Instruction *I) {
841   dbgs() << "LV: Not vectorizing: " << DebugMsg;
842   if (I != nullptr)
843     dbgs() << " " << *I;
844   else
845     dbgs() << '.';
846   dbgs() << '\n';
847 }
848 #endif
849 
850 /// Create an analysis remark that explains why vectorization failed
851 ///
852 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
853 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
854 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
855 /// the location of the remark.  \return the remark object that can be
856 /// streamed to.
857 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
858     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
859   Value *CodeRegion = TheLoop->getHeader();
860   DebugLoc DL = TheLoop->getStartLoc();
861 
862   if (I) {
863     CodeRegion = I->getParent();
864     // If there is no debug location attached to the instruction, revert back to
865     // using the loop's.
866     if (I->getDebugLoc())
867       DL = I->getDebugLoc();
868   }
869 
870   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
871   R << "loop not vectorized: ";
872   return R;
873 }
874 
875 namespace llvm {
876 
877 void reportVectorizationFailure(const StringRef DebugMsg,
878     const StringRef OREMsg, const StringRef ORETag,
879     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
880   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
881   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
882   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
883                 ORETag, TheLoop, I) << OREMsg);
884 }
885 
886 } // end namespace llvm
887 
888 #ifndef NDEBUG
889 /// \return string containing a file name and a line # for the given loop.
890 static std::string getDebugLocString(const Loop *L) {
891   std::string Result;
892   if (L) {
893     raw_string_ostream OS(Result);
894     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
895       LoopDbgLoc.print(OS);
896     else
897       // Just print the module name.
898       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
899     OS.flush();
900   }
901   return Result;
902 }
903 #endif
904 
905 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
906                                          const Instruction *Orig) {
907   // If the loop was versioned with memchecks, add the corresponding no-alias
908   // metadata.
909   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
910     LVer->annotateInstWithNoAlias(To, Orig);
911 }
912 
913 void InnerLoopVectorizer::addMetadata(Instruction *To,
914                                       Instruction *From) {
915   propagateMetadata(To, From);
916   addNewMetadata(To, From);
917 }
918 
919 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
920                                       Instruction *From) {
921   for (Value *V : To) {
922     if (Instruction *I = dyn_cast<Instruction>(V))
923       addMetadata(I, From);
924   }
925 }
926 
927 namespace llvm {
928 
929 // Loop vectorization cost-model hints how the scalar epilogue loop should be
930 // lowered.
931 enum ScalarEpilogueLowering {
932 
933   // The default: allowing scalar epilogues.
934   CM_ScalarEpilogueAllowed,
935 
936   // Vectorization with OptForSize: don't allow epilogues.
937   CM_ScalarEpilogueNotAllowedOptSize,
938 
939   // A special case of vectorization with OptForSize: loops with a very small
940   // trip count are considered for vectorization under OptForSize, thereby
941   // making sure the cost of their loop body is dominant, free of runtime
942   // guards and scalar iteration overheads.
943   CM_ScalarEpilogueNotAllowedLowTripLoop,
944 
945   // Loop hint predicate indicating an epilogue is undesired.
946   CM_ScalarEpilogueNotNeededUsePredicate
947 };
948 
949 /// LoopVectorizationCostModel - estimates the expected speedups due to
950 /// vectorization.
951 /// In many cases vectorization is not profitable. This can happen because of
952 /// a number of reasons. In this class we mainly attempt to predict the
953 /// expected speedup/slowdowns due to the supported instruction set. We use the
954 /// TargetTransformInfo to query the different backends for the cost of
955 /// different operations.
956 class LoopVectorizationCostModel {
957 public:
958   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
959                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
960                              LoopVectorizationLegality *Legal,
961                              const TargetTransformInfo &TTI,
962                              const TargetLibraryInfo *TLI, DemandedBits *DB,
963                              AssumptionCache *AC,
964                              OptimizationRemarkEmitter *ORE, const Function *F,
965                              const LoopVectorizeHints *Hints,
966                              InterleavedAccessInfo &IAI)
967       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
968         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
969         Hints(Hints), InterleaveInfo(IAI) {}
970 
971   /// \return An upper bound for the vectorization factor, or None if
972   /// vectorization and interleaving should be avoided up front.
973   Optional<unsigned> computeMaxVF();
974 
975   /// \return True if runtime checks are required for vectorization, and false
976   /// otherwise.
977   bool runtimeChecksRequired();
978 
979   /// \return The most profitable vectorization factor and the cost of that VF.
980   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
981   /// then this vectorization factor will be selected if vectorization is
982   /// possible.
983   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
984 
985   /// Setup cost-based decisions for user vectorization factor.
986   void selectUserVectorizationFactor(unsigned UserVF) {
987     collectUniformsAndScalars(UserVF);
988     collectInstsToScalarize(UserVF);
989   }
990 
991   /// \return The size (in bits) of the smallest and widest types in the code
992   /// that needs to be vectorized. We ignore values that remain scalar such as
993   /// 64 bit loop indices.
994   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
995 
996   /// \return The desired interleave count.
997   /// If interleave count has been specified by metadata it will be returned.
998   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
999   /// are the selected vectorization factor and the cost of the selected VF.
1000   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1001 
1002   /// A memory access instruction may be vectorized in more than one way.
1003   /// The form of the instruction after vectorization depends on cost.
1004   /// This function takes cost-based decisions for Load/Store instructions
1005   /// and collects them in a map. This decision map is used for building
1006   /// the lists of loop-uniform and loop-scalar instructions.
1007   /// The calculated cost is saved with widening decision in order to
1008   /// avoid redundant calculations.
1009   void setCostBasedWideningDecision(unsigned VF);
1010 
1011   /// A struct that represents some properties of the register usage
1012   /// of a loop.
1013   struct RegisterUsage {
1014     /// Holds the number of loop invariant values that are used in the loop.
1015     /// The key is ClassID of target-provided register class.
1016     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1017     /// Holds the maximum number of concurrent live intervals in the loop.
1018     /// The key is ClassID of target-provided register class.
1019     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1020   };
1021 
1022   /// \return Returns information about the register usages of the loop for the
1023   /// given vectorization factors.
1024   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1025 
1026   /// Collect values we want to ignore in the cost model.
1027   void collectValuesToIgnore();
1028 
1029   /// \returns The smallest bitwidth each instruction can be represented with.
1030   /// The vector equivalents of these instructions should be truncated to this
1031   /// type.
1032   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1033     return MinBWs;
1034   }
1035 
1036   /// \returns True if it is more profitable to scalarize instruction \p I for
1037   /// vectorization factor \p VF.
1038   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1039     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1040 
1041     // Cost model is not run in the VPlan-native path - return conservative
1042     // result until this changes.
1043     if (EnableVPlanNativePath)
1044       return false;
1045 
1046     auto Scalars = InstsToScalarize.find(VF);
1047     assert(Scalars != InstsToScalarize.end() &&
1048            "VF not yet analyzed for scalarization profitability");
1049     return Scalars->second.find(I) != Scalars->second.end();
1050   }
1051 
1052   /// Returns true if \p I is known to be uniform after vectorization.
1053   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1054     if (VF == 1)
1055       return true;
1056 
1057     // Cost model is not run in the VPlan-native path - return conservative
1058     // result until this changes.
1059     if (EnableVPlanNativePath)
1060       return false;
1061 
1062     auto UniformsPerVF = Uniforms.find(VF);
1063     assert(UniformsPerVF != Uniforms.end() &&
1064            "VF not yet analyzed for uniformity");
1065     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1066   }
1067 
1068   /// Returns true if \p I is known to be scalar after vectorization.
1069   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1070     if (VF == 1)
1071       return true;
1072 
1073     // Cost model is not run in the VPlan-native path - return conservative
1074     // result until this changes.
1075     if (EnableVPlanNativePath)
1076       return false;
1077 
1078     auto ScalarsPerVF = Scalars.find(VF);
1079     assert(ScalarsPerVF != Scalars.end() &&
1080            "Scalar values are not calculated for VF");
1081     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1082   }
1083 
1084   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1085   /// for vectorization factor \p VF.
1086   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1087     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1088            !isProfitableToScalarize(I, VF) &&
1089            !isScalarAfterVectorization(I, VF);
1090   }
1091 
1092   /// Decision that was taken during cost calculation for memory instruction.
1093   enum InstWidening {
1094     CM_Unknown,
1095     CM_Widen,         // For consecutive accesses with stride +1.
1096     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1097     CM_Interleave,
1098     CM_GatherScatter,
1099     CM_Scalarize
1100   };
1101 
1102   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1103   /// instruction \p I and vector width \p VF.
1104   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1105                            unsigned Cost) {
1106     assert(VF >= 2 && "Expected VF >=2");
1107     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1108   }
1109 
1110   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1111   /// interleaving group \p Grp and vector width \p VF.
1112   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1113                            InstWidening W, unsigned Cost) {
1114     assert(VF >= 2 && "Expected VF >=2");
1115     /// Broadcast this decision to all instructions inside the group.
1116     /// But the cost will be assigned to one instruction only.
1117     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1118       if (auto *I = Grp->getMember(i)) {
1119         if (Grp->getInsertPos() == I)
1120           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1121         else
1122           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1123       }
1124     }
1125   }
1126 
1127   /// Return the cost model decision for the given instruction \p I and vector
1128   /// width \p VF. Return CM_Unknown if this instruction did not pass
1129   /// through the cost modeling.
1130   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1131     assert(VF >= 2 && "Expected VF >=2");
1132 
1133     // Cost model is not run in the VPlan-native path - return conservative
1134     // result until this changes.
1135     if (EnableVPlanNativePath)
1136       return CM_GatherScatter;
1137 
1138     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1139     auto Itr = WideningDecisions.find(InstOnVF);
1140     if (Itr == WideningDecisions.end())
1141       return CM_Unknown;
1142     return Itr->second.first;
1143   }
1144 
1145   /// Return the vectorization cost for the given instruction \p I and vector
1146   /// width \p VF.
1147   unsigned getWideningCost(Instruction *I, unsigned VF) {
1148     assert(VF >= 2 && "Expected VF >=2");
1149     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1150     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1151            "The cost is not calculated");
1152     return WideningDecisions[InstOnVF].second;
1153   }
1154 
1155   /// Return True if instruction \p I is an optimizable truncate whose operand
1156   /// is an induction variable. Such a truncate will be removed by adding a new
1157   /// induction variable with the destination type.
1158   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1159     // If the instruction is not a truncate, return false.
1160     auto *Trunc = dyn_cast<TruncInst>(I);
1161     if (!Trunc)
1162       return false;
1163 
1164     // Get the source and destination types of the truncate.
1165     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1166     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1167 
1168     // If the truncate is free for the given types, return false. Replacing a
1169     // free truncate with an induction variable would add an induction variable
1170     // update instruction to each iteration of the loop. We exclude from this
1171     // check the primary induction variable since it will need an update
1172     // instruction regardless.
1173     Value *Op = Trunc->getOperand(0);
1174     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1175       return false;
1176 
1177     // If the truncated value is not an induction variable, return false.
1178     return Legal->isInductionPhi(Op);
1179   }
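  // For illustration (a sketch; %iv and %t are hypothetical names): given a
  // primary induction variable %iv of type i64 and a use
  //   %t = trunc i64 %iv to i32
  // the truncate can be removed by introducing a new i32 induction variable
  // whose value equals %t on every iteration.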
1180 
1181   /// Collects the instructions to scalarize for each predicated instruction in
1182   /// the loop.
1183   void collectInstsToScalarize(unsigned VF);
1184 
1185   /// Collect Uniform and Scalar values for the given \p VF.
1186   /// The sets depend on CM decision for Load/Store instructions
1187   /// that may be vectorized as interleave, gather-scatter or scalarized.
1188   void collectUniformsAndScalars(unsigned VF) {
1189     // Do the analysis once.
1190     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1191       return;
1192     setCostBasedWideningDecision(VF);
1193     collectLoopUniforms(VF);
1194     collectLoopScalars(VF);
1195   }
1196 
1197   /// Returns true if the target machine supports masked store operation
1198   /// for the given \p DataType and kind of access to \p Ptr.
1199   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1200     return Legal->isConsecutivePtr(Ptr) &&
1201            TTI.isLegalMaskedStore(DataType, Alignment);
1202   }
1203 
1204   /// Returns true if the target machine supports masked load operation
1205   /// for the given \p DataType and kind of access to \p Ptr.
1206   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1207     return Legal->isConsecutivePtr(Ptr) &&
1208            TTI.isLegalMaskedLoad(DataType, Alignment);
1209   }
1210 
1211   /// Returns true if the target machine supports masked scatter operation
1212   /// for the given \p DataType.
1213   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1214     return TTI.isLegalMaskedScatter(DataType, Alignment);
1215   }
1216 
1217   /// Returns true if the target machine supports masked gather operation
1218   /// for the given \p DataType.
1219   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1220     return TTI.isLegalMaskedGather(DataType, Alignment);
1221   }
1222 
1223   /// Returns true if the target machine can represent \p V as a masked gather
1224   /// or scatter operation.
1225   bool isLegalGatherOrScatter(Value *V) {
1226     bool LI = isa<LoadInst>(V);
1227     bool SI = isa<StoreInst>(V);
1228     if (!LI && !SI)
1229       return false;
1230     auto *Ty = getMemInstValueType(V);
1231     MaybeAlign Align = getLoadStoreAlignment(V);
1232     return (LI && isLegalMaskedGather(Ty, Align)) ||
1233            (SI && isLegalMaskedScatter(Ty, Align));
1234   }
1235 
1236   /// Returns true if \p I is an instruction that will be scalarized with
1237   /// predication. Such instructions include conditional stores and
1238   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
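  /// For example (a sketch), in
  ///   if (c[i] != 0) a[i] = b[i] / c[i];
  /// the division only executes when the condition holds, so it must be
  /// scalarized and predicated rather than speculated as a vector operation.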
1241   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1242 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1246   bool isPredicatedInst(Instruction *I) {
1247     if (!blockNeedsPredication(I->getParent()))
1248       return false;
1249     // Loads and stores that need some form of masked operation are predicated
1250     // instructions.
1251     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1252       return Legal->isMaskRequired(I);
1253     return isScalarWithPredication(I);
1254   }
1255 
1256   /// Returns true if \p I is a memory instruction with consecutive memory
1257   /// access that can be widened.
1258   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1259 
1260   /// Returns true if \p I is a memory instruction in an interleaved-group
1261   /// of memory accesses that can be vectorized with wide vector loads/stores
1262   /// and shuffles.
1263   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1264 
1265   /// Check if \p Instr belongs to any interleaved access group.
1266   bool isAccessInterleaved(Instruction *Instr) {
1267     return InterleaveInfo.isInterleaved(Instr);
1268   }
1269 
1270   /// Get the interleaved access group that \p Instr belongs to.
1271   const InterleaveGroup<Instruction> *
1272   getInterleavedAccessGroup(Instruction *Instr) {
1273     return InterleaveInfo.getInterleaveGroup(Instr);
1274   }
1275 
1276   /// Returns true if an interleaved group requires a scalar iteration
1277   /// to handle accesses with gaps, and there is nothing preventing us from
1278   /// creating a scalar epilogue.
1279   bool requiresScalarEpilogue() const {
1280     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1281   }
1282 
1283   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1284   /// loop hint annotation.
1285   bool isScalarEpilogueAllowed() const {
1286     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1287   }
1288 
1289   /// Returns true if all loop blocks should be masked to fold tail loop.
1290   bool foldTailByMasking() const { return FoldTailByMasking; }
1291 
1292   bool blockNeedsPredication(BasicBlock *BB) {
1293     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1294   }
1295 
1296   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1297   /// with factor VF.  Return the cost of the instruction, including
1298   /// scalarization overhead if it's needed.
1299   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1300 
1301   /// Estimate cost of a call instruction CI if it were vectorized with factor
1302   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e., either a vector version isn't available or it is too
  /// expensive.
1306   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1307 
1308 private:
1309   unsigned NumPredStores = 0;
1310 
1311   /// \return An upper bound for the vectorization factor, larger than zero.
1312   /// One is returned if vectorization should best be avoided due to cost.
1313   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1314 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1322   using VectorizationCostTy = std::pair<unsigned, bool>;
1323 
1324   /// Returns the expected execution cost. The unit of the cost does
1325   /// not matter because we use the 'cost' units to compare different
1326   /// vector widths. The cost that is returned is *not* normalized by
1327   /// the factor width.
1328   VectorizationCostTy expectedCost(unsigned VF);
1329 
1330   /// Returns the execution time cost of an instruction for a given vector
1331   /// width. Vector width of one means scalar.
1332   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1333 
1334   /// The cost-computation logic from getInstructionCost which provides
1335   /// the vector type as an output parameter.
1336   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1337 
1338   /// Calculate vectorization cost of memory instruction \p I.
1339   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1340 
1341   /// The cost computation for scalarized memory instruction.
1342   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1343 
1344   /// The cost computation for interleaving group of memory instructions.
1345   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1346 
1347   /// The cost computation for Gather/Scatter instruction.
1348   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1349 
1350   /// The cost computation for widening instruction \p I with consecutive
1351   /// memory access.
1352   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1353 
1354   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1355   /// Load: scalar load + broadcast.
1356   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1357   /// element)
1358   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1359 
1360   /// Estimate the overhead of scalarizing an instruction. This is a
1361   /// convenience wrapper for the type-based getScalarizationOverhead API.
1362   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1363 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1366   bool isConsecutiveLoadOrStore(Instruction *I);
1367 
1368   /// Returns true if an artificially high cost for emulated masked memrefs
1369   /// should be used.
1370   bool useEmulatedMaskMemRefHack(Instruction *I);
1371 
1372   /// Map of scalar integer values to the smallest bitwidth they can be legally
1373   /// represented as. The vector equivalents of these values should be truncated
1374   /// to this type.
1375   MapVector<Instruction *, uint64_t> MinBWs;
1376 
1377   /// A type representing the costs for instructions if they were to be
1378   /// scalarized rather than vectorized. The entries are Instruction-Cost
1379   /// pairs.
1380   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1381 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1384   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1385 
1386   /// Records whether it is allowed to have the original scalar loop execute at
1387   /// least once. This may be needed as a fallback loop in case runtime
1388   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or isn't a multiple of the VF,
1390   /// or as a peel-loop to handle gaps in interleave-groups.
1391   /// Under optsize and when the trip count is very small we don't allow any
1392   /// iterations to execute in the scalar loop.
1393   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1394 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1396   bool FoldTailByMasking = false;
1397 
1398   /// A map holding scalar costs for different vectorization factors. The
1399   /// presence of a cost for an instruction in the mapping indicates that the
1400   /// instruction will be scalarized when vectorizing with the associated
1401   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1402   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1403 
1404   /// Holds the instructions known to be uniform after vectorization.
1405   /// The data is collected per VF.
1406   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1407 
1408   /// Holds the instructions known to be scalar after vectorization.
1409   /// The data is collected per VF.
1410   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1411 
1412   /// Holds the instructions (address computations) that are forced to be
1413   /// scalarized.
1414   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1415 
1416   /// Returns the expected difference in cost from scalarizing the expression
1417   /// feeding a predicated instruction \p PredInst. The instructions to
1418   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1419   /// non-negative return value implies the expression will be scalarized.
1420   /// Currently, only single-use chains are considered for scalarization.
1421   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1422                               unsigned VF);
1423 
1424   /// Collect the instructions that are uniform after vectorization. An
1425   /// instruction is uniform if we represent it with a single scalar value in
1426   /// the vectorized loop corresponding to each vector iteration. Examples of
1427   /// uniform instructions include pointer operands of consecutive or
1428   /// interleaved memory accesses. Note that although uniformity implies an
1429   /// instruction will be scalar, the reverse is not true. In general, a
1430   /// scalarized instruction will be represented by VF scalar values in the
1431   /// vectorized loop, each corresponding to an iteration of the original
1432   /// scalar loop.
1433   void collectLoopUniforms(unsigned VF);
1434 
1435   /// Collect the instructions that are scalar after vectorization. An
1436   /// instruction is scalar if it is known to be uniform or will be scalarized
1437   /// during vectorization. Non-uniform scalarized instructions will be
1438   /// represented by VF values in the vectorized loop, each corresponding to an
1439   /// iteration of the original scalar loop.
1440   void collectLoopScalars(unsigned VF);
1441 
1442   /// Keeps cost model vectorization decision and cost for instructions.
1443   /// Right now it is used for memory instructions only.
1444   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1445                                 std::pair<InstWidening, unsigned>>;
1446 
1447   DecisionList WideningDecisions;
1448 
1449   /// Returns true if \p V is expected to be vectorized and it needs to be
1450   /// extracted.
1451   bool needsExtract(Value *V, unsigned VF) const {
1452     Instruction *I = dyn_cast<Instruction>(V);
1453     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1454       return false;
1455 
1456     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1458     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1459     // the scalars are collected. That should be a safe assumption in most
1460     // cases, because we check if the operands have vectorizable types
1461     // beforehand in LoopVectorizationLegality.
1462     return Scalars.find(VF) == Scalars.end() ||
1463            !isScalarAfterVectorization(I, VF);
1464   };
1465 
1466   /// Returns a range containing only operands needing to be extracted.
1467   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1468                                                    unsigned VF) {
1469     return SmallVector<Value *, 4>(make_filter_range(
1470         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1471   }
1472 
1473 public:
1474   /// The loop that we evaluate.
1475   Loop *TheLoop;
1476 
1477   /// Predicated scalar evolution analysis.
1478   PredicatedScalarEvolution &PSE;
1479 
1480   /// Loop Info analysis.
1481   LoopInfo *LI;
1482 
1483   /// Vectorization legality.
1484   LoopVectorizationLegality *Legal;
1485 
1486   /// Vector target information.
1487   const TargetTransformInfo &TTI;
1488 
1489   /// Target Library Info.
1490   const TargetLibraryInfo *TLI;
1491 
1492   /// Demanded bits analysis.
1493   DemandedBits *DB;
1494 
1495   /// Assumption cache.
1496   AssumptionCache *AC;
1497 
1498   /// Interface to emit optimization remarks.
1499   OptimizationRemarkEmitter *ORE;
1500 
1501   const Function *TheFunction;
1502 
1503   /// Loop Vectorize Hint.
1504   const LoopVectorizeHints *Hints;
1505 
  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1508   InterleavedAccessInfo &InterleaveInfo;
1509 
1510   /// Values to ignore in the cost model.
1511   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1512 
1513   /// Values to ignore in the cost model when VF > 1.
1514   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1515 };
1516 
1517 } // end namespace llvm
1518 
1519 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1520 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1522 // vector length information is not provided, vectorization is not considered
1523 // explicit. Interleave hints are not allowed either. These limitations will be
1524 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang loop
// vectorize' semantics. This pragma provides *auto-vectorization hints*
1527 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1528 // provides *explicit vectorization hints* (LV can bypass legal checks and
1529 // assume that vectorization is legal). However, both hints are implemented
1530 // using the same metadata (llvm.loop.vectorize, processed by
1531 // LoopVectorizeHints). This will be fixed in the future when the native IR
1532 // representation for pragma 'omp simd' is introduced.
1533 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1534                                    OptimizationRemarkEmitter *ORE) {
1535   assert(!OuterLp->empty() && "This is not an outer loop");
1536   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1537 
1538   // Only outer loops with an explicit vectorization hint are supported.
1539   // Unannotated outer loops are ignored.
1540   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1541     return false;
1542 
1543   Function *Fn = OuterLp->getHeader()->getParent();
1544   if (!Hints.allowVectorization(Fn, OuterLp,
1545                                 true /*VectorizeOnlyWhenForced*/)) {
1546     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1547     return false;
1548   }
1549 
1550   if (Hints.getInterleave() > 1) {
1551     // TODO: Interleave support is future work.
1552     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1553                          "outer loops.\n");
1554     Hints.emitRemarkWithHints();
1555     return false;
1556   }
1557 
1558   return true;
1559 }
1560 
1561 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1562                                   OptimizationRemarkEmitter *ORE,
1563                                   SmallVectorImpl<Loop *> &V) {
1564   // Collect inner loops and outer loops without irreducible control flow. For
1565   // now, only collect outer loops that have explicit vectorization hints. If we
1566   // are stress testing the VPlan H-CFG construction, we collect the outermost
1567   // loop of every loop nest.
1568   if (L.empty() || VPlanBuildStressTest ||
1569       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1570     LoopBlocksRPO RPOT(&L);
1571     RPOT.perform(LI);
1572     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1573       V.push_back(&L);
1574       // TODO: Collect inner loops inside marked outer loops in case
1575       // vectorization fails for the outer loop. Do not invoke
1576       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1577       // already known to be reducible. We can use an inherited attribute for
1578       // that.
1579       return;
1580     }
1581   }
1582   for (Loop *InnerL : L)
1583     collectSupportedLoops(*InnerL, LI, ORE, V);
1584 }
1585 
1586 namespace {
1587 
1588 /// The LoopVectorize Pass.
1589 struct LoopVectorize : public FunctionPass {
1590   /// Pass identification, replacement for typeid
1591   static char ID;
1592 
1593   LoopVectorizePass Impl;
1594 
1595   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1596                          bool VectorizeOnlyWhenForced = false)
1597       : FunctionPass(ID) {
1598     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1599     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1600     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1601   }
1602 
1603   bool runOnFunction(Function &F) override {
1604     if (skipFunction(F))
1605       return false;
1606 
1607     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1608     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1609     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1610     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1611     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1612     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1613     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1614     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1615     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1616     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1617     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1618     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1619     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1620 
1621     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1622         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1623 
1624     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1625                         GetLAA, *ORE, PSI);
1626   }
1627 
1628   void getAnalysisUsage(AnalysisUsage &AU) const override {
1629     AU.addRequired<AssumptionCacheTracker>();
1630     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1631     AU.addRequired<DominatorTreeWrapperPass>();
1632     AU.addRequired<LoopInfoWrapperPass>();
1633     AU.addRequired<ScalarEvolutionWrapperPass>();
1634     AU.addRequired<TargetTransformInfoWrapperPass>();
1635     AU.addRequired<AAResultsWrapperPass>();
1636     AU.addRequired<LoopAccessLegacyAnalysis>();
1637     AU.addRequired<DemandedBitsWrapperPass>();
1638     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1639     AU.addRequired<InjectTLIMappingsLegacy>();
1640 
1641     // We currently do not preserve loopinfo/dominator analyses with outer loop
1642     // vectorization. Until this is addressed, mark these analyses as preserved
1643     // only for non-VPlan-native path.
1644     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1645     if (!EnableVPlanNativePath) {
1646       AU.addPreserved<LoopInfoWrapperPass>();
1647       AU.addPreserved<DominatorTreeWrapperPass>();
1648     }
1649 
1650     AU.addPreserved<BasicAAWrapperPass>();
1651     AU.addPreserved<GlobalsAAWrapperPass>();
1652     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1653   }
1654 };
1655 
1656 } // end anonymous namespace
1657 
1658 //===----------------------------------------------------------------------===//
1659 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1660 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1661 //===----------------------------------------------------------------------===//
1662 
1663 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1664   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1667   Instruction *Instr = dyn_cast<Instruction>(V);
1668   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1669                      (!Instr ||
1670                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1671   // Place the code for broadcasting invariant variables in the new preheader.
1672   IRBuilder<>::InsertPointGuard Guard(Builder);
1673   if (SafeToHoist)
1674     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1675 
1676   // Broadcast the scalar into all locations in the vector.
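  // For example, with VF = 4 CreateVectorSplat conceptually emits:
  //   %bc.insert = insertelement <4 x Ty> undef, Ty %V, i32 0
  //   %bc.splat  = shufflevector <4 x Ty> %bc.insert, <4 x Ty> undef,
  //                              <4 x i32> zeroinitializer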
1677   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1678 
1679   return Shuf;
1680 }
1681 
1682 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1683     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1684   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1685          "Expected either an induction phi-node or a truncate of it!");
1686   Value *Start = II.getStartValue();
1687 
1688   // Construct the initial value of the vector IV in the vector loop preheader
1689   auto CurrIP = Builder.saveIP();
1690   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1691   if (isa<TruncInst>(EntryVal)) {
1692     assert(Start->getType()->isIntegerTy() &&
1693            "Truncation requires an integer type");
1694     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1695     Step = Builder.CreateTrunc(Step, TruncType);
1696     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1697   }
1698   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1699   Value *SteppedStart =
1700       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1701 
1702   // We create vector phi nodes for both integer and floating-point induction
1703   // variables. Here, we determine the kind of arithmetic we will perform.
1704   Instruction::BinaryOps AddOp;
1705   Instruction::BinaryOps MulOp;
1706   if (Step->getType()->isIntegerTy()) {
1707     AddOp = Instruction::Add;
1708     MulOp = Instruction::Mul;
1709   } else {
1710     AddOp = II.getInductionOpcode();
1711     MulOp = Instruction::FMul;
1712   }
1713 
1714   // Multiply the vectorization factor by the step using integer or
1715   // floating-point arithmetic as appropriate.
1716   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1717   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1718 
1719   // Create a vector splat to use in the induction update.
1720   //
1721   // FIXME: If the step is non-constant, we create the vector splat with
1722   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1723   //        handle a constant vector splat.
1724   Value *SplatVF =
1725       isa<Constant>(Mul)
1726           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1727           : Builder.CreateVectorSplat(VF, Mul);
1728   Builder.restoreIP(CurrIP);
1729 
1730   // We may need to add the step a number of times, depending on the unroll
1731   // factor. The last of those goes into the PHI.
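  // E.g., with VF = 4 and UF = 2, SplatVF is the splat of 4 * Step; part 0
  // uses the phi itself, part 1 uses phi + SplatVF, and the value that feeds
  // back into the phi through the latch is phi + 2 * SplatVF.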
1732   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1733                                     &*LoopVectorBody->getFirstInsertionPt());
1734   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1735   Instruction *LastInduction = VecInd;
1736   for (unsigned Part = 0; Part < UF; ++Part) {
1737     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1738 
1739     if (isa<TruncInst>(EntryVal))
1740       addMetadata(LastInduction, EntryVal);
1741     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1742 
1743     LastInduction = cast<Instruction>(addFastMathFlag(
1744         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1745     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1746   }
1747 
1748   // Move the last step to the end of the latch block. This ensures consistent
1749   // placement of all induction updates.
1750   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1751   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1752   auto *ICmp = cast<Instruction>(Br->getCondition());
1753   LastInduction->moveBefore(ICmp);
1754   LastInduction->setName("vec.ind.next");
1755 
1756   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1757   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1758 }
1759 
1760 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1761   return Cost->isScalarAfterVectorization(I, VF) ||
1762          Cost->isProfitableToScalarize(I, VF);
1763 }
1764 
1765 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1766   if (shouldScalarizeInstruction(IV))
1767     return true;
1768   auto isScalarInst = [&](User *U) -> bool {
1769     auto *I = cast<Instruction>(U);
1770     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1771   };
1772   return llvm::any_of(IV->users(), isScalarInst);
1773 }
1774 
1775 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1776     const InductionDescriptor &ID, const Instruction *EntryVal,
1777     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1778   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1779          "Expected either an induction phi-node or a truncate of it!");
1780 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1787   if (isa<TruncInst>(EntryVal))
1788     return;
1789 
1790   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1791   if (Casts.empty())
1792     return;
1793   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1795   // induction update chain itself.
1796   Instruction *CastInst = *Casts.begin();
1797   if (Lane < UINT_MAX)
1798     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1799   else
1800     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1801 }
1802 
1803 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1804   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1805          "Primary induction variable must have an integer type");
1806 
1807   auto II = Legal->getInductionVars().find(IV);
1808   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1809 
1810   auto ID = II->second;
1811   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1812 
1813   // The value from the original loop to which we are mapping the new induction
1814   // variable.
1815   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1816 
1817   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1818 
1819   // Generate code for the induction step. Note that induction steps are
1820   // required to be loop-invariant
1821   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1822     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1823            "Induction step should be loop invariant");
1824     if (PSE.getSE()->isSCEVable(IV->getType())) {
1825       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1826       return Exp.expandCodeFor(Step, Step->getType(),
1827                                LoopVectorPreHeader->getTerminator());
1828     }
1829     return cast<SCEVUnknown>(Step)->getValue();
1830   };
1831 
1832   // The scalar value to broadcast. This is derived from the canonical
1833   // induction variable. If a truncation type is given, truncate the canonical
1834   // induction variable and step. Otherwise, derive these values from the
1835   // induction descriptor.
1836   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1837     Value *ScalarIV = Induction;
1838     if (IV != OldInduction) {
1839       ScalarIV = IV->getType()->isIntegerTy()
1840                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1841                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1842                                           IV->getType());
1843       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1844       ScalarIV->setName("offset.idx");
1845     }
1846     if (Trunc) {
1847       auto *TruncType = cast<IntegerType>(Trunc->getType());
1848       assert(Step->getType()->isIntegerTy() &&
1849              "Truncation requires an integer step");
1850       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1851       Step = Builder.CreateTrunc(Step, TruncType);
1852     }
1853     return ScalarIV;
1854   };
1855 
1856   // Create the vector values from the scalar IV, in the absence of creating a
1857   // vector IV.
1858   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1859     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1860     for (unsigned Part = 0; Part < UF; ++Part) {
1861       Value *EntryPart =
1862           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1863       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1864       if (Trunc)
1865         addMetadata(EntryPart, Trunc);
1866       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1867     }
1868   };
1869 
1870   // Now do the actual transformations, and start with creating the step value.
1871   Value *Step = CreateStepValue(ID.getStep());
1872   if (VF <= 1) {
1873     Value *ScalarIV = CreateScalarIV(Step);
1874     CreateSplatIV(ScalarIV, Step);
1875     return;
1876   }
1877 
1878   // Determine if we want a scalar version of the induction variable. This is
1879   // true if the induction variable itself is not widened, or if it has at
1880   // least one user in the loop that is not widened.
1881   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1882   if (!NeedsScalarIV) {
1883     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1884     return;
1885   }
1886 
1887   // Try to create a new independent vector induction variable. If we can't
1888   // create the phi node, we will splat the scalar induction variable in each
1889   // loop iteration.
1890   if (!shouldScalarizeInstruction(EntryVal)) {
1891     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1892     Value *ScalarIV = CreateScalarIV(Step);
1893     // Create scalar steps that can be used by instructions we will later
1894     // scalarize. Note that the addition of the scalar steps will not increase
1895     // the number of instructions in the loop in the common case prior to
1896     // InstCombine. We will be trading one vector extract for each scalar step.
1897     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1898     return;
1899   }
1900 
1901   // If we haven't yet vectorized the induction variable, splat the scalar
1902   // induction variable, and build the necessary step vectors.
1903   // TODO: Don't do it unless the vectorized IV is really required.
1904   Value *ScalarIV = CreateScalarIV(Step);
1905   CreateSplatIV(ScalarIV, Step);
1906   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1907 }
1908 
1909 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1910                                           Instruction::BinaryOps BinOp) {
1911   // Create and check the types.
1912   assert(Val->getType()->isVectorTy() && "Must be a vector");
1913   int VLen = Val->getType()->getVectorNumElements();
1914 
1915   Type *STy = Val->getType()->getScalarType();
1916   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1917          "Induction Step must be an integer or FP");
1918   assert(Step->getType() == STy && "Step has wrong type");
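  // For example, with VLen = 4, StartIdx = 0 and an integer step S, the result
  // below is conceptually Val + <0, 1, 2, 3> * <S, S, S, S>; for FP inductions
  // the multiply and BinOp are the floating-point counterparts.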
1919 
1920   SmallVector<Constant *, 8> Indices;
1921 
1922   if (STy->isIntegerTy()) {
1923     // Create a vector of consecutive numbers from zero to VF.
1924     for (int i = 0; i < VLen; ++i)
1925       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1926 
1927     // Add the consecutive indices to the vector value.
1928     Constant *Cv = ConstantVector::get(Indices);
1929     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1930     Step = Builder.CreateVectorSplat(VLen, Step);
1931     assert(Step->getType() == Val->getType() && "Invalid step vec");
1932     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1933     // which can be found from the original scalar operations.
1934     Step = Builder.CreateMul(Cv, Step);
1935     return Builder.CreateAdd(Val, Step, "induction");
1936   }
1937 
1938   // Floating point induction.
1939   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1940          "Binary Opcode should be specified for FP induction");
1941   // Create a vector of consecutive numbers from zero to VF.
1942   for (int i = 0; i < VLen; ++i)
1943     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1944 
1945   // Add the consecutive indices to the vector value.
1946   Constant *Cv = ConstantVector::get(Indices);
1947 
1948   Step = Builder.CreateVectorSplat(VLen, Step);
1949 
1950   // Floating point operations had to be 'fast' to enable the induction.
1951   FastMathFlags Flags;
1952   Flags.setFast();
1953 
1954   Value *MulOp = Builder.CreateFMul(Cv, Step);
1955   if (isa<Instruction>(MulOp))
1956     // Have to check, MulOp may be a constant
1957     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1958 
1959   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1960   if (isa<Instruction>(BOp))
1961     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1962   return BOp;
1963 }
1964 
1965 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1966                                            Instruction *EntryVal,
1967                                            const InductionDescriptor &ID) {
1968   // We shouldn't have to build scalar steps if we aren't vectorizing.
1969   assert(VF > 1 && "VF should be greater than one");
1970 
1971   // Get the value type and ensure it and the step have the same integer type.
1972   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1973   assert(ScalarIVTy == Step->getType() &&
1974          "Val and Step should have the same type");
1975 
1976   // We build scalar steps for both integer and floating-point induction
1977   // variables. Here, we determine the kind of arithmetic we will perform.
1978   Instruction::BinaryOps AddOp;
1979   Instruction::BinaryOps MulOp;
1980   if (ScalarIVTy->isIntegerTy()) {
1981     AddOp = Instruction::Add;
1982     MulOp = Instruction::Mul;
1983   } else {
1984     AddOp = ID.getInductionOpcode();
1985     MulOp = Instruction::FMul;
1986   }
1987 
1988   // Determine the number of scalars we need to generate for each unroll
1989   // iteration. If EntryVal is uniform, we only need to generate the first
1990   // lane. Otherwise, we generate all VF values.
1991   unsigned Lanes =
1992       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1993                                                                          : VF;
1994   // Compute the scalar steps and save the results in VectorLoopValueMap.
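  // E.g., with VF = 4 and UF = 2, this produces the eight scalars
  // ScalarIV + (4 * Part + Lane) * Step for Part in [0, 2) and Lane in [0, 4),
  // unless EntryVal is uniform, in which case only lane 0 of each part is
  // generated.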
1995   for (unsigned Part = 0; Part < UF; ++Part) {
1996     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1997       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1998       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1999       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2000       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2001       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2002     }
2003   }
2004 }
2005 
2006 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2007   assert(V != Induction && "The new induction variable should not be used.");
2008   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2009   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2010 
2011   // If we have a stride that is replaced by one, do it here. Defer this for
2012   // the VPlan-native path until we start running Legal checks in that path.
2013   if (!EnableVPlanNativePath && Legal->hasStride(V))
2014     V = ConstantInt::get(V->getType(), 1);
2015 
2016   // If we have a vector mapped to this value, return it.
2017   if (VectorLoopValueMap.hasVectorValue(V, Part))
2018     return VectorLoopValueMap.getVectorValue(V, Part);
2019 
2020   // If the value has not been vectorized, check if it has been scalarized
2021   // instead. If it has been scalarized, and we actually need the value in
2022   // vector form, we will construct the vector values on demand.
2023   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2024     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2025 
2026     // If we've scalarized a value, that value should be an instruction.
2027     auto *I = cast<Instruction>(V);
2028 
2029     // If we aren't vectorizing, we can just copy the scalar map values over to
2030     // the vector map.
2031     if (VF == 1) {
2032       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2033       return ScalarValue;
2034     }
2035 
2036     // Get the last scalar instruction we generated for V and Part. If the value
2037     // is known to be uniform after vectorization, this corresponds to lane zero
2038     // of the Part unroll iteration. Otherwise, the last instruction is the one
2039     // we created for the last vector lane of the Part unroll iteration.
2040     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2041     auto *LastInst = cast<Instruction>(
2042         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2043 
2044     // Set the insert point after the last scalarized instruction. This ensures
2045     // the insertelement sequence will directly follow the scalar definitions.
2046     auto OldIP = Builder.saveIP();
2047     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2048     Builder.SetInsertPoint(&*NewIP);
2049 
2050     // However, if we are vectorizing, we need to construct the vector values.
2051     // If the value is known to be uniform after vectorization, we can just
2052     // broadcast the scalar value corresponding to lane zero for each unroll
2053     // iteration. Otherwise, we construct the vector values using insertelement
2054     // instructions. Since the resulting vectors are stored in
2055     // VectorLoopValueMap, we will only generate the insertelements once.
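    // E.g., for VF = 4 the packing produces a chain of the form (a sketch):
    //   %v0 = insertelement <4 x Ty> undef, Ty %s0, i32 0
    //   %v1 = insertelement <4 x Ty> %v0,   Ty %s1, i32 1
    //   ... and so on up to lane 3.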
2056     Value *VectorValue = nullptr;
2057     if (Cost->isUniformAfterVectorization(I, VF)) {
2058       VectorValue = getBroadcastInstrs(ScalarValue);
2059       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2060     } else {
2061       // Initialize packing with insertelements to start from undef.
2062       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2063       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2064       for (unsigned Lane = 0; Lane < VF; ++Lane)
2065         packScalarIntoVectorValue(V, {Part, Lane});
2066       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2067     }
2068     Builder.restoreIP(OldIP);
2069     return VectorValue;
2070   }
2071 
2072   // If this scalar is unknown, assume that it is a constant or that it is
2073   // loop invariant. Broadcast V and save the value for future uses.
2074   Value *B = getBroadcastInstrs(V);
2075   VectorLoopValueMap.setVectorValue(V, Part, B);
2076   return B;
2077 }
2078 
2079 Value *
2080 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2081                                             const VPIteration &Instance) {
2082   // If the value is not an instruction contained in the loop, it should
2083   // already be scalar.
2084   if (OrigLoop->isLoopInvariant(V))
2085     return V;
2086 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2090 
2091   // If the value from the original loop has not been vectorized, it is
2092   // represented by UF x VF scalar values in the new loop. Return the requested
2093   // scalar value.
2094   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2095     return VectorLoopValueMap.getScalarValue(V, Instance);
2096 
2097   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2098   // for the given unroll part. If this entry is not a vector type (i.e., the
2099   // vectorization factor is one), there is no need to generate an
2100   // extractelement instruction.
2101   auto *U = getOrCreateVectorValue(V, Instance.Part);
2102   if (!U->getType()->isVectorTy()) {
2103     assert(VF == 1 && "Value not scalarized has non-vector type");
2104     return U;
2105   }
2106 
2107   // Otherwise, the value from the original loop has been vectorized and is
2108   // represented by UF vector values. Extract and return the requested scalar
2109   // value from the appropriate vector lane.
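  // E.g., for VF = 4 and Instance.Lane = 2, this conceptually emits
  //   %scalar = extractelement <4 x Ty> %U, i32 2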
2110   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2111 }
2112 
2113 void InnerLoopVectorizer::packScalarIntoVectorValue(
2114     Value *V, const VPIteration &Instance) {
2115   assert(V != Induction && "The new induction variable should not be used.");
2116   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2117   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2118 
2119   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2120   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2121   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2122                                             Builder.getInt32(Instance.Lane));
2123   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2124 }
2125 
2126 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2127   assert(Vec->getType()->isVectorTy() && "Invalid type");
2128   SmallVector<Constant *, 8> ShuffleMask;
2129   for (unsigned i = 0; i < VF; ++i)
2130     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
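  // E.g., for VF = 4 the mask built above is <3, 2, 1, 0>.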
2131 
2132   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2133                                      ConstantVector::get(ShuffleMask),
2134                                      "reverse");
2135 }
2136 
2137 // Return whether we allow using masked interleave-groups (for dealing with
2138 // strided loads/stores that reside in predicated blocks, or for dealing
2139 // with gaps).
2140 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2141   // If an override option has been passed in for interleaved accesses, use it.
2142   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2143     return EnableMaskedInterleavedMemAccesses;
2144 
2145   return TTI.enableMaskedInterleavedAccessVectorization();
2146 }
2147 
2148 // Try to vectorize the interleave group that \p Instr belongs to.
2149 //
2150 // E.g. Translate following interleaved load group (factor = 3):
2151 //   for (i = 0; i < N; i+=3) {
2152 //     R = Pic[i];             // Member of index 0
2153 //     G = Pic[i+1];           // Member of index 1
2154 //     B = Pic[i+2];           // Member of index 2
2155 //     ... // do something to R, G, B
2156 //   }
2157 // To:
2158 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2159 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2160 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2161 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2162 //
2163 // Or translate following interleaved store group (factor = 3):
2164 //   for (i = 0; i < N; i+=3) {
2165 //     ... do something to R, G, B
2166 //     Pic[i]   = R;           // Member of index 0
2167 //     Pic[i+1] = G;           // Member of index 1
2168 //     Pic[i+2] = B;           // Member of index 2
2169 //   }
2170 // To:
2171 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2172 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2173 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2174 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2175 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2176 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2177                                                    VPTransformState &State,
2178                                                    VPValue *Addr,
2179                                                    VPValue *BlockInMask) {
2180   const InterleaveGroup<Instruction> *Group =
2181       Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Failed to get an interleaved access group.");
2183 
2184   // Skip if current instruction is not the insert position.
2185   if (Instr != Group->getInsertPos())
2186     return;
2187 
2188   const DataLayout &DL = Instr->getModule()->getDataLayout();
2189 
2190   // Prepare for the vector type of the interleaved load/store.
2191   Type *ScalarTy = getMemInstValueType(Instr);
2192   unsigned InterleaveFactor = Group->getFactor();
2193   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2194 
2195   // Prepare for the new pointers.
2196   SmallVector<Value *, 2> AddrParts;
2197   unsigned Index = Group->getIndex(Instr);
2198 
2199   // TODO: extend the masked interleaved-group support to reversed access.
2200   assert((!BlockInMask || !Group->isReverse()) &&
2201          "Reversed masked interleave-group not supported.");
2202 
2203   // If the group is reverse, adjust the index to refer to the last vector lane
2204   // instead of the first. We adjust the index from the first vector lane,
2205   // rather than directly getting the pointer for lane VF - 1, because the
2206   // pointer operand of the interleaved access is supposed to be uniform. For
2207   // uniform instructions, we're only required to generate a value for the
2208   // first vector lane in each unroll iteration.
2209   if (Group->isReverse())
2210     Index += (VF - 1) * Group->getFactor();
2211 
2212   for (unsigned Part = 0; Part < UF; Part++) {
2213     Value *AddrPart = State.get(Addr, {Part, 0});
2214     setDebugLocFromInst(Builder, AddrPart);
2215 
    // Note that the current instruction could be at any index of the group.
    // We need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2227 
2228     bool InBounds = false;
2229     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2230       InBounds = gep->isInBounds();
2231     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2232     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2233 
2234     // Cast to the vector pointer type.
2235     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2236     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2237     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2238   }
2239 
2240   setDebugLocFromInst(Builder, Instr);
2241   Value *UndefVec = UndefValue::get(VecTy);
2242 
2243   Value *MaskForGaps = nullptr;
2244   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2245     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2246     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2247   }
2248 
2249   // Vectorize the interleaved load group.
2250   if (isa<LoadInst>(Instr)) {
2251     // For each unroll part, create a wide load for the group.
2252     SmallVector<Value *, 2> NewLoads;
2253     for (unsigned Part = 0; Part < UF; Part++) {
2254       Instruction *NewLoad;
2255       if (BlockInMask || MaskForGaps) {
2256         assert(useMaskedInterleavedAccesses(*TTI) &&
2257                "masked interleaved groups are not allowed.");
2258         Value *GroupMask = MaskForGaps;
2259         if (BlockInMask) {
2260           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2261           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2262           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2263           Value *ShuffledMask = Builder.CreateShuffleVector(
2264               BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2265           GroupMask = MaskForGaps
2266                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2267                                                 MaskForGaps)
2268                           : ShuffledMask;
2269         }
2270         NewLoad =
2271             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2272                                      GroupMask, UndefVec, "wide.masked.vec");
2273       }
2274       else
2275         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2276                                             Group->getAlign(), "wide.vec");
2277       Group->addMetadata(NewLoad);
2278       NewLoads.push_back(NewLoad);
2279     }
2280 
2281     // For each member in the group, shuffle out the appropriate data from the
2282     // wide loads.
2283     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2284       Instruction *Member = Group->getMember(I);
2285 
2286       // Skip the gaps in the group.
2287       if (!Member)
2288         continue;
2289 
2290       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2291       for (unsigned Part = 0; Part < UF; Part++) {
2292         Value *StridedVec = Builder.CreateShuffleVector(
2293             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2294 
        // If this member has a different type, cast the result type.
2296         if (Member->getType() != ScalarTy) {
2297           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2298           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2299         }
2300 
2301         if (Group->isReverse())
2302           StridedVec = reverseVector(StridedVec);
2303 
2304         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2305       }
2306     }
2307     return;
2308   }
2309 
  // The sub-vector type for the current instruction.
2311   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2312 
2313   // Vectorize the interleaved store group.
2314   for (unsigned Part = 0; Part < UF; Part++) {
2315     // Collect the stored vector from each member.
2316     SmallVector<Value *, 4> StoredVecs;
2317     for (unsigned i = 0; i < InterleaveFactor; i++) {
2318       // Interleaved store group doesn't allow a gap, so each index has a member
2319       Instruction *Member = Group->getMember(i);
      assert(Member && "Failed to get a member of an interleaved store group");
2321 
2322       Value *StoredVec = getOrCreateVectorValue(
2323           cast<StoreInst>(Member)->getValueOperand(), Part);
2324       if (Group->isReverse())
2325         StoredVec = reverseVector(StoredVec);
2326 
      // If this member has a different type, cast it to a unified type.
2329       if (StoredVec->getType() != SubVT)
2330         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2331 
2332       StoredVecs.push_back(StoredVec);
2333     }
2334 
2335     // Concatenate all vectors into a wide vector.
2336     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2337 
2338     // Interleave the elements in the wide vector.
2339     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2340     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2341                                               "interleaved.vec");
2342 
2343     Instruction *NewStoreInstr;
2344     if (BlockInMask) {
2345       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2346       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2347       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2348       Value *ShuffledMask = Builder.CreateShuffleVector(
2349           BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2350       NewStoreInstr = Builder.CreateMaskedStore(
2351           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2352     }
2353     else
2354       NewStoreInstr =
2355           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2356 
2357     Group->addMetadata(NewStoreInstr);
2358   }
2359 }
2360 
2361 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2362                                                      VPTransformState &State,
2363                                                      VPValue *Addr,
2364                                                      VPValue *StoredValue,
2365                                                      VPValue *BlockInMask) {
2366   // Attempt to issue a wide load.
2367   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2368   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2369 
2370   assert((LI || SI) && "Invalid Load/Store instruction");
2371   assert((!SI || StoredValue) && "No stored value provided for widened store");
2372   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2373 
2374   LoopVectorizationCostModel::InstWidening Decision =
2375       Cost->getWideningDecision(Instr, VF);
2376   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2377          "CM decision should be taken at this point");
2378   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2379     return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2380 
2381   Type *ScalarDataTy = getMemInstValueType(Instr);
2382   Type *DataTy = VectorType::get(ScalarDataTy, VF);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2385   const DataLayout &DL = Instr->getModule()->getDataLayout();
2386   const Align Alignment =
2387       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2388 
2389   // Determine if the pointer operand of the access is either consecutive or
2390   // reverse consecutive.
2391   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2392   bool ConsecutiveStride =
2393       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2394   bool CreateGatherScatter =
2395       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2396 
2397   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2398   // gather/scatter. Otherwise Decision should have been to Scalarize.
2399   assert((ConsecutiveStride || CreateGatherScatter) &&
2400          "The instruction should be scalarized");
2401   (void)ConsecutiveStride;
2402 
2403   VectorParts BlockInMaskParts(UF);
2404   bool isMaskRequired = BlockInMask;
2405   if (isMaskRequired)
2406     for (unsigned Part = 0; Part < UF; ++Part)
2407       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2408 
2409   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2410     // Calculate the pointer for the specific unroll-part.
2411     GetElementPtrInst *PartPtr = nullptr;
2412 
2413     bool InBounds = false;
2414     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2415       InBounds = gep->isInBounds();
2416 
2417     if (Reverse) {
2418       // If the address is consecutive but reversed, then the
2419       // wide store needs to start at the last vector element.
2420       PartPtr = cast<GetElementPtrInst>(
2421           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2422       PartPtr->setIsInBounds(InBounds);
2423       PartPtr = cast<GetElementPtrInst>(
2424           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2425       PartPtr->setIsInBounds(InBounds);
2426       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2427         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2428     } else {
2429       PartPtr = cast<GetElementPtrInst>(
2430           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2431       PartPtr->setIsInBounds(InBounds);
2432     }
2433 
2434     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2435     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2436   };
2437 
2438   // Handle Stores:
2439   if (SI) {
2440     setDebugLocFromInst(Builder, SI);
2441 
2442     for (unsigned Part = 0; Part < UF; ++Part) {
2443       Instruction *NewSI = nullptr;
2444       Value *StoredVal = State.get(StoredValue, Part);
2445       if (CreateGatherScatter) {
2446         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2447         Value *VectorGep = State.get(Addr, Part);
2448         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2449                                             MaskPart);
2450       } else {
2451         if (Reverse) {
2452           // If we store to reverse consecutive memory locations, then we need
2453           // to reverse the order of elements in the stored value.
2454           StoredVal = reverseVector(StoredVal);
2455           // We don't want to update the value in the map as it might be used in
2456           // another expression. So don't call resetVectorValue(StoredVal).
2457         }
2458         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2459         if (isMaskRequired)
2460           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2461                                             BlockInMaskParts[Part]);
2462         else
2463           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2464       }
2465       addMetadata(NewSI, SI);
2466     }
2467     return;
2468   }
2469 
2470   // Handle loads.
2471   assert(LI && "Must have a load instruction");
2472   setDebugLocFromInst(Builder, LI);
2473   for (unsigned Part = 0; Part < UF; ++Part) {
2474     Value *NewLI;
2475     if (CreateGatherScatter) {
2476       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2477       Value *VectorGep = State.get(Addr, Part);
2478       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2479                                          nullptr, "wide.masked.gather");
2480       addMetadata(NewLI, LI);
2481     } else {
2482       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2483       if (isMaskRequired)
2484         NewLI = Builder.CreateMaskedLoad(
2485             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2486             "wide.masked.load");
2487       else
2488         NewLI =
2489             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2490 
2491       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2492       addMetadata(NewLI, LI);
2493       if (Reverse)
2494         NewLI = reverseVector(NewLI);
2495     }
2496     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2497   }
2498 }
2499 
2500 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2501                                                const VPIteration &Instance,
2502                                                bool IfPredicateInstr) {
2503   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2504 
2505   setDebugLocFromInst(Builder, Instr);
2506 
  // Does this instruction return a value?
2508   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2509 
2510   Instruction *Cloned = Instr->clone();
2511   if (!IsVoidRetTy)
2512     Cloned->setName(Instr->getName() + ".cloned");
2513 
2514   // Replace the operands of the cloned instructions with their scalar
2515   // equivalents in the new loop.
2516   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2517     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2518     Cloned->setOperand(op, NewOp);
2519   }
2520   addNewMetadata(Cloned, Instr);
2521 
2522   // Place the cloned scalar in the new loop.
2523   Builder.Insert(Cloned);
2524 
2525   // Add the cloned scalar to the scalar map entry.
2526   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2527 
  // If we just cloned a new assumption, add it to the assumption cache.
2529   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2530     if (II->getIntrinsicID() == Intrinsic::assume)
2531       AC->registerAssumption(II);
2532 
2533   // End if-block.
2534   if (IfPredicateInstr)
2535     PredicatedInstructions.push_back(Cloned);
2536 }
2537 
2538 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2539                                                       Value *End, Value *Step,
2540                                                       Instruction *DL) {
2541   BasicBlock *Header = L->getHeader();
2542   BasicBlock *Latch = L->getLoopLatch();
2543   // As we're just creating this loop, it's possible no latch exists
2544   // yet. If so, use the header as this will be a single block loop.
2545   if (!Latch)
2546     Latch = Header;
2547 
2548   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2549   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2550   setDebugLocFromInst(Builder, OldInst);
2551   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2552 
2553   Builder.SetInsertPoint(Latch->getTerminator());
2554   setDebugLocFromInst(Builder, OldInst);
2555 
2556   // Create i+1 and fill the PHINode.
2557   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2558   Induction->addIncoming(Start, L->getLoopPreheader());
2559   Induction->addIncoming(Next, Latch);
2560   // Create the compare.
2561   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2562   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2563 
2564   // Now we have two terminators. Remove the old one from the block.
2565   Latch->getTerminator()->eraseFromParent();
2566 
2567   return Induction;
2568 }
2569 
2570 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2571   if (TripCount)
2572     return TripCount;
2573 
2574   assert(L && "Create Trip Count for null loop.");
2575   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2576   // Find the loop boundaries.
2577   ScalarEvolution *SE = PSE.getSE();
2578   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2579   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2580          "Invalid loop count");
2581 
2582   Type *IdxTy = Legal->getWidestInductionType();
2583   assert(IdxTy && "No type for induction");
2584 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way we could get a backedge-taken count in that case is
  // if the induction variable was signed, in which case it will not overflow,
  // so the truncation is legal.
2590   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2591       IdxTy->getPrimitiveSizeInBits())
2592     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2593   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2594 
2595   // Get the total trip count from the count by adding 1.
2596   const SCEV *ExitCount = SE->getAddExpr(
2597       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
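  // For example (illustrative): for a loop 'for (i = 0; i < n; ++i)' the
  // backedge-taken count is n - 1 and the trip count computed here is n.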
2598 
2599   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2600 
2601   // Expand the trip count and place the new instructions in the preheader.
2602   // Notice that the pre-header does not change, only the loop body.
2603   SCEVExpander Exp(*SE, DL, "induction");
2604 
2605   // Count holds the overall loop count (N).
2606   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2607                                 L->getLoopPreheader()->getTerminator());
2608 
2609   if (TripCount->getType()->isPointerTy())
2610     TripCount =
2611         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2612                                     L->getLoopPreheader()->getTerminator());
2613 
2614   return TripCount;
2615 }
2616 
2617 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2618   if (VectorTripCount)
2619     return VectorTripCount;
2620 
2621   Value *TC = getOrCreateTripCount(L);
2622   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2623 
2624   Type *Ty = TC->getType();
2625   Constant *Step = ConstantInt::get(Ty, VF * UF);
2626 
2627   // If the tail is to be folded by masking, round the number of iterations N
2628   // up to a multiple of Step instead of rounding down. This is done by first
2629   // adding Step-1 and then rounding down. Note that it's ok if this addition
2630   // overflows: the vector induction variable will eventually wrap to zero given
2631   // that it starts at zero and its Step is a power of two; the loop will then
2632   // exit, with the last early-exit vector comparison also producing all-true.
2633   if (Cost->foldTailByMasking()) {
2634     assert(isPowerOf2_32(VF * UF) &&
2635            "VF*UF must be a power of 2 when folding tail by masking");
2636     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2637   }
2638 
2639   // Now we need to generate the expression for the part of the loop that the
2640   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2641   // iterations are not required for correctness, or N - Step, otherwise. Step
2642   // is equal to the vectorization factor (number of SIMD elements) times the
2643   // unroll factor (number of SIMD instructions).
2644   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
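  // Worked example (illustrative): with N = 10 and Step = VF * UF = 4,
  // R = 10 % 4 = 2 and the vector trip count becomes 8, leaving 2 scalar
  // iterations. With tail folding, N was first rounded up to 13 above, so
  // R = 1 and the vector trip count becomes 12, covering all 10 iterations
  // under the mask.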
2645 
2646   // If there is a non-reversed interleaved group that may speculatively access
2647   // memory out-of-bounds, we need to ensure that there will be at least one
2648   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2649   // the trip count, we set the remainder to be equal to the step. If the step
2650   // does not evenly divide the trip count, no adjustment is necessary since
2651   // there will already be scalar iterations. Note that the minimum iterations
2652   // check ensures that N >= Step.
2653   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2654     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2655     R = Builder.CreateSelect(IsZero, Step, R);
2656   }
2657 
2658   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2659 
2660   return VectorTripCount;
2661 }
2662 
2663 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2664                                                    const DataLayout &DL) {
2665   // Verify that V is a vector type with same number of elements as DstVTy.
2666   unsigned VF = DstVTy->getNumElements();
2667   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2668   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2669   Type *SrcElemTy = SrcVecTy->getElementType();
2670   Type *DstElemTy = DstVTy->getElementType();
2671   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2672          "Vector elements must have same size");
2673 
2674   // Do a direct cast if element types are castable.
2675   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2676     return Builder.CreateBitOrPointerCast(V, DstVTy);
2677   }
  // V cannot be directly cast to the desired vector type.
  // This may happen when V is a floating point vector but DstVTy is a vector
  // of pointers or vice-versa. Handle this with a two-step cast through an
  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
2682   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2683          "Only one type should be a pointer type");
2684   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2685          "Only one type should be a floating point type");
2686   Type *IntTy =
2687       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2688   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2689   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2690   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2691 }
2692 
2693 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2694                                                          BasicBlock *Bypass) {
2695   Value *Count = getOrCreateTripCount(L);
2696   // Reuse existing vector loop preheader for TC checks.
2697   // Note that new preheader block is generated for vector loop.
2698   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2699   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2700 
2701   // Generate code to check if the loop's trip count is less than VF * UF, or
2702   // equal to it in case a scalar epilogue is required; this implies that the
2703   // vector trip count is zero. This check also covers the case where adding one
2704   // to the backedge-taken count overflowed leading to an incorrect trip count
2705   // of zero. In this case we will also jump to the scalar loop.
2706   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2707                                           : ICmpInst::ICMP_ULT;
2708 
2709   // If tail is to be folded, vector loop takes care of all iterations.
2710   Value *CheckMinIters = Builder.getFalse();
2711   if (!Cost->foldTailByMasking())
2712     CheckMinIters = Builder.CreateICmp(
2713         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2714         "min.iters.check");
2715 
2716   // Create new preheader for vector loop.
2717   LoopVectorPreHeader =
2718       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2719                  "vector.ph");
2720 
2721   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2722                                DT->getNode(Bypass)->getIDom()) &&
2723          "TC check is expected to dominate Bypass");
2724 
2725   // Update dominator for Bypass & LoopExit.
2726   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2727   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2728 
2729   ReplaceInstWithInst(
2730       TCCheckBlock->getTerminator(),
2731       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2732   LoopBypassBlocks.push_back(TCCheckBlock);
2733 }
2734 
2735 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2736   // Reuse existing vector loop preheader for SCEV checks.
2737   // Note that new preheader block is generated for vector loop.
2738   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2739 
  // Generate the code to check that the SCEV assumptions we made hold.
2741   // We want the new basic block to start at the first instruction in a
2742   // sequence of instructions that form a check.
2743   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2744                    "scev.check");
2745   Value *SCEVCheck = Exp.expandCodeForPredicate(
2746       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2747 
2748   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2749     if (C->isZero())
2750       return;
2751 
2752   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2753          "Cannot SCEV check stride or overflow when optimizing for size");
2754 
2755   SCEVCheckBlock->setName("vector.scevcheck");
2756   // Create new preheader for vector loop.
2757   LoopVectorPreHeader =
2758       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2759                  nullptr, "vector.ph");
2760 
2761   // Update dominator only if this is first RT check.
2762   if (LoopBypassBlocks.empty()) {
2763     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2764     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2765   }
2766 
2767   ReplaceInstWithInst(
2768       SCEVCheckBlock->getTerminator(),
2769       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2770   LoopBypassBlocks.push_back(SCEVCheckBlock);
2771   AddedSafetyChecks = true;
2772 }
2773 
2774 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2775   // VPlan-native path does not do any analysis for runtime checks currently.
2776   if (EnableVPlanNativePath)
2777     return;
2778 
2779   // Reuse existing vector loop preheader for runtime memory checks.
2780   // Note that new preheader block is generated for vector loop.
2781   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2782 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2786   Instruction *FirstCheckInst;
2787   Instruction *MemRuntimeCheck;
2788   std::tie(FirstCheckInst, MemRuntimeCheck) =
2789       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2790   if (!MemRuntimeCheck)
2791     return;
2792 
2793   if (MemCheckBlock->getParent()->hasOptSize()) {
2794     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2795            "Cannot emit memory checks when optimizing for size, unless forced "
2796            "to vectorize.");
2797     ORE->emit([&]() {
2798       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2799                                         L->getStartLoc(), L->getHeader())
2800              << "Code-size may be reduced by not forcing "
2801                 "vectorization, or by source-code modifications "
2802                 "eliminating the need for runtime checks "
2803                 "(e.g., adding 'restrict').";
2804     });
2805   }
2806 
2807   MemCheckBlock->setName("vector.memcheck");
2808   // Create new preheader for vector loop.
2809   LoopVectorPreHeader =
2810       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2811                  "vector.ph");
2812 
2813   // Update dominator only if this is first RT check.
2814   if (LoopBypassBlocks.empty()) {
2815     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2816     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2817   }
2818 
2819   ReplaceInstWithInst(
2820       MemCheckBlock->getTerminator(),
2821       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2822   LoopBypassBlocks.push_back(MemCheckBlock);
2823   AddedSafetyChecks = true;
2824 
2825   // We currently don't use LoopVersioning for the actual loop cloning but we
2826   // still use it to add the noalias metadata.
2827   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2828                                           PSE.getSE());
2829   LVer->prepareNoAliasMetadata();
2830 }
2831 
2832 Value *InnerLoopVectorizer::emitTransformedIndex(
2833     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2834     const InductionDescriptor &ID) const {
2835 
2836   SCEVExpander Exp(*SE, DL, "induction");
2837   auto Step = ID.getStep();
2838   auto StartValue = ID.getStartValue();
2839   assert(Index->getType() == Step->getType() &&
2840          "Index type does not match StepValue type");
2841 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
2848   auto CreateAdd = [&B](Value *X, Value *Y) {
2849     assert(X->getType() == Y->getType() && "Types don't match!");
2850     if (auto *CX = dyn_cast<ConstantInt>(X))
2851       if (CX->isZero())
2852         return Y;
2853     if (auto *CY = dyn_cast<ConstantInt>(Y))
2854       if (CY->isZero())
2855         return X;
2856     return B.CreateAdd(X, Y);
2857   };
2858 
2859   auto CreateMul = [&B](Value *X, Value *Y) {
2860     assert(X->getType() == Y->getType() && "Types don't match!");
2861     if (auto *CX = dyn_cast<ConstantInt>(X))
2862       if (CX->isOne())
2863         return Y;
2864     if (auto *CY = dyn_cast<ConstantInt>(Y))
2865       if (CY->isOne())
2866         return X;
2867     return B.CreateMul(X, Y);
2868   };
2869 
2870   switch (ID.getKind()) {
2871   case InductionDescriptor::IK_IntInduction: {
2872     assert(Index->getType() == StartValue->getType() &&
2873            "Index type does not match StartValue type");
2874     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2875       return B.CreateSub(StartValue, Index);
2876     auto *Offset = CreateMul(
2877         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2878     return CreateAdd(StartValue, Offset);
2879   }
2880   case InductionDescriptor::IK_PtrInduction: {
2881     assert(isa<SCEVConstant>(Step) &&
2882            "Expected constant step for pointer induction");
2883     return B.CreateGEP(
2884         StartValue->getType()->getPointerElementType(), StartValue,
2885         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2886                                            &*B.GetInsertPoint())));
2887   }
2888   case InductionDescriptor::IK_FpInduction: {
2889     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2890     auto InductionBinOp = ID.getInductionBinOp();
2891     assert(InductionBinOp &&
2892            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2893             InductionBinOp->getOpcode() == Instruction::FSub) &&
2894            "Original bin op should be defined for FP induction");
2895 
2896     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2897 
2898     // Floating point operations had to be 'fast' to enable the induction.
2899     FastMathFlags Flags;
2900     Flags.setFast();
2901 
2902     Value *MulExp = B.CreateFMul(StepValue, Index);
2903     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2905       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2906 
2907     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2908                                "induction");
2909     if (isa<Instruction>(BOp))
2910       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2911 
2912     return BOp;
2913   }
2914   case InductionDescriptor::IK_NoInduction:
2915     return nullptr;
2916   }
2917   llvm_unreachable("invalid enum");
2918 }
2919 
2920 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2921   /*
2922    In this function we generate a new loop. The new loop will contain
2923    the vectorized instructions while the old loop will continue to run the
2924    scalar remainder.
2925 
2926        [ ] <-- loop iteration number check.
2927     /   |
2928    /    v
2929   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2930   |  /  |
2931   | /   v
2932   ||   [ ]     <-- vector pre header.
2933   |/    |
2934   |     v
2935   |    [  ] \
2936   |    [  ]_|   <-- vector loop.
2937   |     |
2938   |     v
2939   |   -[ ]   <--- middle-block.
2940   |  /  |
2941   | /   v
2942   -|- >[ ]     <--- new preheader.
2943    |    |
2944    |    v
2945    |   [ ] \
2946    |   [ ]_|   <-- old scalar loop to handle remainder.
2947     \   |
2948      \  v
2949       >[ ]     <-- exit block.
2950    ...
2951    */
2952 
2953   MDNode *OrigLoopID = OrigLoop->getLoopID();
2954 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
  // induction variables. In the code below we also support the case where we
  // don't have a single induction variable.
2959   //
2960   // We try to obtain an induction variable from the original loop as hard
2961   // as possible. However if we don't find one that:
2962   //   - is an integer
2963   //   - counts from zero, stepping by one
2964   //   - is the size of the widest induction variable type
2965   // then we create a new one.
2966   OldInduction = Legal->getPrimaryInduction();
2967   Type *IdxTy = Legal->getWidestInductionType();
2968 
2969   // Split the single block loop into the two loop structure described above.
2970   LoopScalarBody = OrigLoop->getHeader();
2971   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2972   LoopExitBlock = OrigLoop->getExitBlock();
2973   assert(LoopExitBlock && "Must have an exit block");
2974   assert(LoopVectorPreHeader && "Invalid loop structure");
2975 
2976   LoopMiddleBlock =
2977       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2978                  LI, nullptr, "middle.block");
2979   LoopScalarPreHeader =
2980       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2981                  nullptr, "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
2985   LoopVectorBody =
2986       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2987                  nullptr, nullptr, "vector.body");
2988 
2989   // Update dominator for loop exit.
2990   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2991 
2992   // Create and register the new vector loop.
2993   Loop *Lp = LI->AllocateLoop();
2994   Loop *ParentLoop = OrigLoop->getParentLoop();
2995 
2996   // Insert the new loop into the loop nest and register the new basic blocks
2997   // before calling any utilities such as SCEV that require valid LoopInfo.
2998   if (ParentLoop) {
2999     ParentLoop->addChildLoop(Lp);
3000   } else {
3001     LI->addTopLevelLoop(Lp);
3002   }
3003   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3004 
3005   // Find the loop boundaries.
3006   Value *Count = getOrCreateTripCount(Lp);
3007 
3008   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3009 
3010   // Now, compare the new count to zero. If it is zero skip the vector loop and
3011   // jump to the scalar loop. This check also covers the case where the
3012   // backedge-taken count is uint##_max: adding one to it will overflow leading
3013   // to an incorrect trip count of zero. In this (rare) case we will also jump
3014   // to the scalar loop.
3015   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3016 
3017   // Generate the code to check any assumptions that we've made for SCEV
3018   // expressions.
3019   emitSCEVChecks(Lp, LoopScalarPreHeader);
3020 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3024   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3025 
3026   // Generate the induction variable.
3027   // The loop step is equal to the vectorization factor (num of SIMD elements)
3028   // times the unroll factor (num of SIMD instructions).
3029   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3030   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3031   Induction =
3032       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3033                               getDebugLocFromInstOrOperands(OldInduction));
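  // For example (illustrative), with VF = 4 and UF = 2 the canonical
  // induction variable created here starts at 0 and advances by 8 on every
  // iteration of the vector loop, until it reaches CountRoundDown.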
3034 
3035   // We are going to resume the execution of the scalar loop.
3036   // Go over all of the induction variables that we found and fix the
3037   // PHIs that are left in the scalar version of the loop.
3038   // The starting values of PHI nodes depend on the counter of the last
3039   // iteration in the vectorized loop.
3040   // If we come from a bypass edge then we need to start from the original
3041   // start value.
3042 
3043   // This variable saves the new starting index for the scalar loop. It is used
3044   // to test if there are any tail iterations left once the vector loop has
3045   // completed.
3046   for (auto &InductionEntry : Legal->getInductionVars()) {
3047     PHINode *OrigPhi = InductionEntry.first;
3048     InductionDescriptor II = InductionEntry.second;
3049 
    // Create phi nodes to merge from the backedge-taken check block.
3051     PHINode *BCResumeVal =
3052         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3053                         LoopScalarPreHeader->getTerminator());
3054     // Copy original phi DL over to the new one.
3055     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3056     Value *&EndValue = IVEndValues[OrigPhi];
3057     if (OrigPhi == OldInduction) {
3058       // We know what the end value is.
3059       EndValue = CountRoundDown;
3060     } else {
3061       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3062       Type *StepType = II.getStep()->getType();
3063       Instruction::CastOps CastOp =
3064           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3065       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3066       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3067       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3068       EndValue->setName("ind.end");
3069     }
3070 
3071     // The new PHI merges the original incoming value, in case of a bypass,
3072     // or the value at the end of the vectorized loop.
3073     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3074 
3075     // Fix the scalar body counter (PHI node).
3076     // The old induction's phi node in the scalar body needs the truncated
3077     // value.
3078     for (BasicBlock *BB : LoopBypassBlocks)
3079       BCResumeVal->addIncoming(II.getStartValue(), BB);
3080     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3081   }
3082 
3083   // We need the OrigLoop (scalar loop part) latch terminator to help
3084   // produce correct debug info for the middle block BB instructions.
3085   // The legality check stage guarantees that the loop will have a single
3086   // latch.
3087   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3088          "Scalar loop latch terminator isn't a branch");
3089   BranchInst *ScalarLatchBr =
3090       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3091 
3092   // Add a check in the middle block to see if we have completed
3093   // all of the iterations in the first vector loop.
3094   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3095   // If tail is to be folded, we know we don't need to run the remainder.
3096   Value *CmpN = Builder.getTrue();
3097   if (!Cost->foldTailByMasking()) {
3098     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3099                            CountRoundDown, "cmp.n",
3100                            LoopMiddleBlock->getTerminator());
3101 
3102     // Here we use the same DebugLoc as the scalar loop latch branch instead
3103     // of the corresponding compare because they may have ended up with
3104     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3106     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3107   }
3108 
3109   BranchInst *BrInst =
3110       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3111   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3112   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3113 
3114   // Get ready to start creating new instructions into the vectorized body.
3115   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3116          "Inconsistent vector loop preheader");
3117   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3118 
3119   Optional<MDNode *> VectorizedLoopID =
3120       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3121                                       LLVMLoopVectorizeFollowupVectorized});
3122   if (VectorizedLoopID.hasValue()) {
3123     Lp->setLoopID(VectorizedLoopID.getValue());
3124 
3125     // Do not setAlreadyVectorized if loop attributes have been defined
3126     // explicitly.
3127     return LoopVectorPreHeader;
3128   }
3129 
3130   // Keep all loop hints from the original loop on the vector loop (we'll
3131   // replace the vectorizer-specific hints below).
3132   if (MDNode *LID = OrigLoop->getLoopID())
3133     Lp->setLoopID(LID);
3134 
3135   LoopVectorizeHints Hints(Lp, true, *ORE);
3136   Hints.setAlreadyVectorized();
3137 
3138 #ifdef EXPENSIVE_CHECKS
3139   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3140   LI->verify(*DT);
3141 #endif
3142 
3143   return LoopVectorPreHeader;
3144 }
3145 
3146 // Fix up external users of the induction variable. At this point, we are
3147 // in LCSSA form, with all external PHIs that use the IV having one input value,
3148 // coming from the remainder loop. We need those PHIs to also have a correct
3149 // value for the IV when arriving directly from the middle block.
3150 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3151                                        const InductionDescriptor &II,
3152                                        Value *CountRoundDown, Value *EndValue,
3153                                        BasicBlock *MiddleBlock) {
3154   // There are two kinds of external IV usages - those that use the value
3155   // computed in the last iteration (the PHI) and those that use the penultimate
3156   // value (the value that feeds into the phi from the loop latch).
3157   // We allow both, but they, obviously, have different values.
3158 
3159   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3160 
3161   DenseMap<Value *, Value *> MissingVals;
3162 
3163   // An external user of the last iteration's value should see the value that
3164   // the remainder loop uses to initialize its own IV.
3165   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3166   for (User *U : PostInc->users()) {
3167     Instruction *UI = cast<Instruction>(U);
3168     if (!OrigLoop->contains(UI)) {
3169       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3170       MissingVals[UI] = EndValue;
3171     }
3172   }
3173 
  // An external user of the penultimate value needs to see EndValue - Step.
3175   // The simplest way to get this is to recompute it from the constituent SCEVs,
3176   // that is Start + (Step * (CRD - 1)).
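  // For example (illustrative), for a canonical IV starting at 0 with step 1,
  // the penultimate value is simply CountRoundDown - 1.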
3177   for (User *U : OrigPhi->users()) {
3178     auto *UI = cast<Instruction>(U);
3179     if (!OrigLoop->contains(UI)) {
3180       const DataLayout &DL =
3181           OrigLoop->getHeader()->getModule()->getDataLayout();
3182       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3183 
3184       IRBuilder<> B(MiddleBlock->getTerminator());
3185       Value *CountMinusOne = B.CreateSub(
3186           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3187       Value *CMO =
3188           !II.getStep()->getType()->isIntegerTy()
3189               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3190                              II.getStep()->getType())
3191               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3192       CMO->setName("cast.cmo");
3193       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3194       Escape->setName("ind.escape");
3195       MissingVals[UI] = Escape;
3196     }
3197   }
3198 
3199   for (auto &I : MissingVals) {
3200     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3202     // that is %IV2 = phi [...], [ %IV1, %latch ]
3203     // In this case, if IV1 has an external use, we need to avoid adding both
3204     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3205     // don't already have an incoming value for the middle block.
3206     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3207       PHI->addIncoming(I.second, MiddleBlock);
3208   }
3209 }
3210 
3211 namespace {
3212 
3213 struct CSEDenseMapInfo {
3214   static bool canHandle(const Instruction *I) {
3215     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3216            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3217   }
3218 
3219   static inline Instruction *getEmptyKey() {
3220     return DenseMapInfo<Instruction *>::getEmptyKey();
3221   }
3222 
3223   static inline Instruction *getTombstoneKey() {
3224     return DenseMapInfo<Instruction *>::getTombstoneKey();
3225   }
3226 
3227   static unsigned getHashValue(const Instruction *I) {
3228     assert(canHandle(I) && "Unknown instruction!");
3229     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3230                                                            I->value_op_end()));
3231   }
3232 
3233   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3234     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3235         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3236       return LHS == RHS;
3237     return LHS->isIdenticalTo(RHS);
3238   }
3239 };
3240 
3241 } // end anonymous namespace
3242 
/// Perform CSE of induction variable instructions.
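/// For example (illustrative), identical getelementptr or extractelement
/// instructions emitted for different unroll parts of the induction are
/// folded into a single instruction here.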
3244 static void cse(BasicBlock *BB) {
3245   // Perform simple cse.
3246   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3247   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3248     Instruction *In = &*I++;
3249 
3250     if (!CSEDenseMapInfo::canHandle(In))
3251       continue;
3252 
3253     // Check if we can replace this instruction with any of the
3254     // visited instructions.
3255     if (Instruction *V = CSEMap.lookup(In)) {
3256       In->replaceAllUsesWith(V);
3257       In->eraseFromParent();
3258       continue;
3259     }
3260 
3261     CSEMap[In] = In;
3262   }
3263 }
3264 
3265 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3266                                                        unsigned VF,
3267                                                        bool &NeedToScalarize) {
3268   Function *F = CI->getCalledFunction();
3269   Type *ScalarRetTy = CI->getType();
3270   SmallVector<Type *, 4> Tys, ScalarTys;
3271   for (auto &ArgOp : CI->arg_operands())
3272     ScalarTys.push_back(ArgOp->getType());
3273 
3274   // Estimate cost of scalarized vector call. The source operands are assumed
3275   // to be vectors, so we need to extract individual elements from there,
3276   // execute VF scalar calls, and then gather the result into the vector return
3277   // value.
3278   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3279   if (VF == 1)
3280     return ScalarCallCost;
3281 
3282   // Compute corresponding vector type for return value and arguments.
3283   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3284   for (Type *ScalarTy : ScalarTys)
3285     Tys.push_back(ToVectorTy(ScalarTy, VF));
3286 
3287   // Compute costs of unpacking argument values for the scalar calls and
3288   // packing the return values to a vector.
3289   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3290 
3291   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
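  // Illustrative numbers only: with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 12, Cost is 4 * 10 + 12 = 52; a vector library
  // call with cost 20 would then be preferred below.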
3292 
3293   // If we can't emit a vector call for this function, then the currently found
3294   // cost is the cost we need to return.
3295   NeedToScalarize = true;
3296   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3297   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3298 
3299   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3300     return Cost;
3301 
3302   // If the corresponding vector cost is cheaper, return its cost.
3303   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3304   if (VectorCallCost < Cost) {
3305     NeedToScalarize = false;
3306     return VectorCallCost;
3307   }
3308   return Cost;
3309 }
3310 
3311 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3312                                                             unsigned VF) {
3313   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3314   assert(ID && "Expected intrinsic call!");
3315 
3316   FastMathFlags FMF;
3317   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3318     FMF = FPMO->getFastMathFlags();
3319 
3320   SmallVector<Value *, 4> Operands(CI->arg_operands());
3321   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI);
3322 }
3323 
3324 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3325   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3326   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3327   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3328 }
3329 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3330   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3331   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3332   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3333 }
3334 
3335 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3336   // For every instruction `I` in MinBWs, truncate the operands, create a
3337   // truncated version of `I` and reextend its result. InstCombine runs
3338   // later and will remove any ext/trunc pairs.
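  // For example (illustrative), if an i32 add is known to only need 8 bits,
  // its widened form is rewritten as trunc -> add on <VF x i8> -> zext, and
  // InstCombine is expected to clean up the redundant casts.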
3339   SmallPtrSet<Value *, 4> Erased;
3340   for (const auto &KV : Cost->getMinimalBitwidths()) {
3341     // If the value wasn't vectorized, we must maintain the original scalar
3342     // type. The absence of the value from VectorLoopValueMap indicates that it
3343     // wasn't vectorized.
3344     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3345       continue;
3346     for (unsigned Part = 0; Part < UF; ++Part) {
3347       Value *I = getOrCreateVectorValue(KV.first, Part);
3348       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3349           !isa<Instruction>(I))
3350         continue;
3351       Type *OriginalTy = I->getType();
3352       Type *ScalarTruncatedTy =
3353           IntegerType::get(OriginalTy->getContext(), KV.second);
3354       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3355                                           OriginalTy->getVectorNumElements());
3356       if (TruncatedTy == OriginalTy)
3357         continue;
3358 
3359       IRBuilder<> B(cast<Instruction>(I));
3360       auto ShrinkOperand = [&](Value *V) -> Value * {
3361         if (auto *ZI = dyn_cast<ZExtInst>(V))
3362           if (ZI->getSrcTy() == TruncatedTy)
3363             return ZI->getOperand(0);
3364         return B.CreateZExtOrTrunc(V, TruncatedTy);
3365       };
3366 
3367       // The actual instruction modification depends on the instruction type,
3368       // unfortunately.
3369       Value *NewI = nullptr;
3370       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3371         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3372                              ShrinkOperand(BO->getOperand(1)));
3373 
3374         // Any wrapping introduced by shrinking this operation shouldn't be
3375         // considered undefined behavior. So, we can't unconditionally copy
3376         // arithmetic wrapping flags to NewI.
3377         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3378       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3379         NewI =
3380             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3381                          ShrinkOperand(CI->getOperand(1)));
3382       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3383         NewI = B.CreateSelect(SI->getCondition(),
3384                               ShrinkOperand(SI->getTrueValue()),
3385                               ShrinkOperand(SI->getFalseValue()));
3386       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3387         switch (CI->getOpcode()) {
3388         default:
3389           llvm_unreachable("Unhandled cast!");
3390         case Instruction::Trunc:
3391           NewI = ShrinkOperand(CI->getOperand(0));
3392           break;
3393         case Instruction::SExt:
3394           NewI = B.CreateSExtOrTrunc(
3395               CI->getOperand(0),
3396               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3397           break;
3398         case Instruction::ZExt:
3399           NewI = B.CreateZExtOrTrunc(
3400               CI->getOperand(0),
3401               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3402           break;
3403         }
3404       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3405         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3406         auto *O0 = B.CreateZExtOrTrunc(
3407             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3408         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3409         auto *O1 = B.CreateZExtOrTrunc(
3410             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3411 
3412         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3413       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3414         // Don't do anything with the operands, just extend the result.
3415         continue;
3416       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3417         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3418         auto *O0 = B.CreateZExtOrTrunc(
3419             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3420         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3421         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3422       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3423         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3424         auto *O0 = B.CreateZExtOrTrunc(
3425             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3426         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3427       } else {
3428         // If we don't know what to do, be conservative and don't do anything.
3429         continue;
3430       }
3431 
3432       // Lastly, extend the result.
3433       NewI->takeName(cast<Instruction>(I));
3434       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3435       I->replaceAllUsesWith(Res);
3436       cast<Instruction>(I)->eraseFromParent();
3437       Erased.insert(I);
3438       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3439     }
3440   }
3441 
3442   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3443   for (const auto &KV : Cost->getMinimalBitwidths()) {
3444     // If the value wasn't vectorized, we must maintain the original scalar
3445     // type. The absence of the value from VectorLoopValueMap indicates that it
3446     // wasn't vectorized.
3447     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3448       continue;
3449     for (unsigned Part = 0; Part < UF; ++Part) {
3450       Value *I = getOrCreateVectorValue(KV.first, Part);
3451       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3452       if (Inst && Inst->use_empty()) {
3453         Value *NewI = Inst->getOperand(0);
3454         Inst->eraseFromParent();
3455         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3456       }
3457     }
3458   }
3459 }
3460 
3461 void InnerLoopVectorizer::fixVectorizedLoop() {
3462   // Insert truncates and extends for any truncated instructions as hints to
3463   // InstCombine.
3464   if (VF > 1)
3465     truncateToMinimalBitwidths();
3466 
3467   // Fix widened non-induction PHIs by setting up the PHI operands.
3468   if (OrigPHIsToFix.size()) {
3469     assert(EnableVPlanNativePath &&
3470            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3471     fixNonInductionPHIs();
3472   }
3473 
3474   // At this point every instruction in the original loop is widened to a
3475   // vector form. Now we need to fix the recurrences in the loop. These PHI
3476   // nodes are currently empty because we did not want to introduce cycles.
3477   // This is the second stage of vectorizing recurrences.
3478   fixCrossIterationPHIs();
3479 
3480   // Forget the original basic block.
3481   PSE.getSE()->forgetLoop(OrigLoop);
3482 
3483   // Fix-up external users of the induction variables.
3484   for (auto &Entry : Legal->getInductionVars())
3485     fixupIVUsers(Entry.first, Entry.second,
3486                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3487                  IVEndValues[Entry.first], LoopMiddleBlock);
3488 
3489   fixLCSSAPHIs();
3490   for (Instruction *PI : PredicatedInstructions)
3491     sinkScalarOperands(&*PI);
3492 
3493   // Remove redundant induction instructions.
3494   cse(LoopVectorBody);
3495 
3496   // Set/update profile weights for the vector and remainder loops as original
3497   // loop iterations are now distributed among them. Note that original loop
3498   // represented by LoopScalarBody becomes remainder loop after vectorization.
3499   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly inaccurate result, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3505   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3506                                LI->getLoopFor(LoopVectorBody),
3507                                LI->getLoopFor(LoopScalarBody), VF * UF);
3508 }
3509 
3510 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3511   // In order to support recurrences we need to be able to vectorize Phi nodes.
3512   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3513   // stage #2: We now need to fix the recurrences by adding incoming edges to
3514   // the currently empty PHI nodes. At this point every instruction in the
3515   // original loop is widened to a vector form so we can use them to construct
3516   // the incoming edges.
3517   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3518     // Handle first-order recurrences and reductions that need to be fixed.
3519     if (Legal->isFirstOrderRecurrence(&Phi))
3520       fixFirstOrderRecurrence(&Phi);
3521     else if (Legal->isReductionVariable(&Phi))
3522       fixReduction(&Phi);
3523   }
3524 }
3525 
3526 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3527   // This is the second phase of vectorizing first-order recurrences. An
3528   // overview of the transformation is described below. Suppose we have the
3529   // following loop.
3530   //
3531   //   for (int i = 0; i < n; ++i)
3532   //     b[i] = a[i] - a[i - 1];
3533   //
3534   // There is a first-order recurrence on "a". For this loop, the shorthand
3535   // scalar IR looks like:
3536   //
3537   //   scalar.ph:
3538   //     s_init = a[-1]
3539   //     br scalar.body
3540   //
3541   //   scalar.body:
3542   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3543   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3544   //     s2 = a[i]
3545   //     b[i] = s2 - s1
3546   //     br cond, scalar.body, ...
3547   //
  // In this example, s1 is a recurrence because its value depends on the
3549   // previous iteration. In the first phase of vectorization, we created a
3550   // temporary value for s1. We now complete the vectorization and produce the
3551   // shorthand vector IR shown below (for VF = 4, UF = 1).
3552   //
3553   //   vector.ph:
3554   //     v_init = vector(..., ..., ..., a[-1])
3555   //     br vector.body
3556   //
3557   //   vector.body
3558   //     i = phi [0, vector.ph], [i+4, vector.body]
3559   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3560   //     v2 = a[i, i+1, i+2, i+3];
3561   //     v3 = vector(v1(3), v2(0, 1, 2))
3562   //     b[i, i+1, i+2, i+3] = v2 - v3
3563   //     br cond, vector.body, middle.block
3564   //
3565   //   middle.block:
3566   //     x = v2(3)
3567   //     br scalar.ph
3568   //
3569   //   scalar.ph:
3570   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3571   //     br scalar.body
3572   //
  // After the vector loop completes execution, we extract the next value of
3574   // the recurrence (x) to use as the initial value in the scalar loop.
3575 
3576   // Get the original loop preheader and single loop latch.
3577   auto *Preheader = OrigLoop->getLoopPreheader();
3578   auto *Latch = OrigLoop->getLoopLatch();
3579 
3580   // Get the initial and previous values of the scalar recurrence.
3581   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3582   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3583 
3584   // Create a vector from the initial value.
3585   auto *VectorInit = ScalarInit;
3586   if (VF > 1) {
3587     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3588     VectorInit = Builder.CreateInsertElement(
3589         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3590         Builder.getInt32(VF - 1), "vector.recur.init");
3591   }
3592 
3593   // We constructed a temporary phi node in the first phase of vectorization.
3594   // This phi node will eventually be deleted.
3595   Builder.SetInsertPoint(
3596       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3597 
3598   // Create a phi node for the new recurrence. The current value will either be
3599   // the initial value inserted into a vector or loop-varying vector value.
3600   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3601   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3602 
3603   // Get the vectorized previous value of the last part UF - 1. It appears last
3604   // among all unrolled iterations, due to the order of their construction.
3605   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3606 
3607   // Find and set the insertion point after the previous value if it is an
3608   // instruction.
3609   BasicBlock::iterator InsertPt;
3610   // Note that the previous value may have been constant-folded so it is not
3611   // guaranteed to be an instruction in the vector loop.
3612   // FIXME: Loop invariant values do not form recurrences. We should deal with
3613   //        them earlier.
3614   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3615     InsertPt = LoopVectorBody->getFirstInsertionPt();
3616   else {
3617     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3618     if (isa<PHINode>(PreviousLastPart))
3619       // If the previous value is a phi node, we should insert after all the phi
3620       // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different from
3622       // LoopVectorBody, in case we predicate the loop.
3623       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3624     else
3625       InsertPt = ++PreviousInst->getIterator();
3626   }
3627   Builder.SetInsertPoint(&*InsertPt);
3628 
3629   // We will construct a vector for the recurrence by combining the values for
3630   // the current and previous iterations. This is the required shuffle mask.
3631   SmallVector<Constant *, 8> ShuffleMask(VF);
3632   ShuffleMask[0] = Builder.getInt32(VF - 1);
3633   for (unsigned I = 1; I < VF; ++I)
3634     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
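  // For illustration (assuming VF = 4): the mask is <3, 4, 5, 6>, i.e. the
  // last lane of the first shuffle operand followed by the first three lanes
  // of the second, which produces v3 = vector(v1(3), v2(0, 1, 2)) in the
  // example above.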
3635 
3636   // The vector from which to take the initial value for the current iteration
3637   // (actual or unrolled). Initially, this is the vector phi node.
3638   Value *Incoming = VecPhi;
3639 
3640   // Shuffle the current and previous vector and update the vector parts.
3641   for (unsigned Part = 0; Part < UF; ++Part) {
3642     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3643     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3644     auto *Shuffle =
3645         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3646                                              ConstantVector::get(ShuffleMask))
3647                : Incoming;
3648     PhiPart->replaceAllUsesWith(Shuffle);
3649     cast<Instruction>(PhiPart)->eraseFromParent();
3650     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3651     Incoming = PreviousPart;
3652   }
3653 
3654   // Fix the latch value of the new recurrence in the vector loop.
3655   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3656 
3657   // Extract the last vector element in the middle block. This will be the
3658   // initial value for the recurrence when jumping to the scalar loop.
3659   auto *ExtractForScalar = Incoming;
3660   if (VF > 1) {
3661     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3662     ExtractForScalar = Builder.CreateExtractElement(
3663         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3664   }
3665   // Extract the second last element in the middle block if the
3666   // Phi is used outside the loop. We need to extract the phi itself
3667   // and not the last element (the phi update in the current iteration). This
3668   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3669   // when the scalar loop is not run at all.
3670   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3671   if (VF > 1)
3672     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3673         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the part just prior to the unrolled value
  // of `Incoming`. This is analogous to the vectorized case above: extracting
  // the second-to-last element when VF > 1.
3678   else if (UF > 1)
3679     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
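  // For illustration (assuming VF = 4): ExtractForScalar is lane 3 of the
  // final combined vector, whereas ExtractForPhiUsedOutsideLoop is lane 2,
  // i.e. the value of the phi itself rather than its update. With VF = 1 and
  // UF = 4, the corresponding values are parts 3 and 2 of the unrolled
  // previous value.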
3680 
3681   // Fix the initial value of the original recurrence in the scalar loop.
3682   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3683   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3684   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3685     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3686     Start->addIncoming(Incoming, BB);
3687   }
3688 
3689   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3690   Phi->setName("scalar.recur");
3691 
3692   // Finally, fix users of the recurrence outside the loop. The users will need
3693   // either the last value of the scalar recurrence or the last value of the
3694   // vector recurrence we extracted in the middle block. Since the loop is in
3695   // LCSSA form, we just need to find all the phi nodes for the original scalar
3696   // recurrence in the exit block, and then add an edge for the middle block.
3697   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3698     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3699       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3700     }
3701   }
3702 }
3703 
3704 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3705   Constant *Zero = Builder.getInt32(0);
3706 
  // Get its reduction variable descriptor.
3708   assert(Legal->isReductionVariable(Phi) &&
3709          "Unable to find the reduction variable");
3710   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3711 
3712   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3713   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3714   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3715   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3716     RdxDesc.getMinMaxRecurrenceKind();
3717   setDebugLocFromInst(Builder, ReductionStartValue);
3718 
3719   // We need to generate a reduction vector from the incoming scalar.
3720   // To do so, we need to generate the 'identity' vector and override
3721   // one of the elements with the incoming scalar reduction. We need
3722   // to do it in the vector-loop preheader.
3723   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3724 
3725   // This is the vector-clone of the value that leaves the loop.
3726   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3727 
  // Find the reduction identity value: zero for addition, or and xor;
  // one for multiplication; -1 (all ones) for and.
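  // For illustration (assuming an integer add reduction with start value %s
  // and VF = 4): Identity is <0, 0, 0, 0> and VectorStart is <%s, 0, 0, 0>,
  // i.e. the identity splat with the incoming scalar inserted into lane 0;
  // for a min/max reduction both values are simply a splat of %s.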
3730   Value *Identity;
3731   Value *VectorStart;
3732   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3733       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3735     if (VF == 1) {
3736       VectorStart = Identity = ReductionStartValue;
3737     } else {
3738       VectorStart = Identity =
3739         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3740     }
3741   } else {
3742     // Handle other reduction kinds:
3743     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3744         RK, VecTy->getScalarType());
3745     if (VF == 1) {
3746       Identity = Iden;
3747       // This vector is the Identity vector where the first element is the
3748       // incoming scalar reduction.
3749       VectorStart = ReductionStartValue;
3750     } else {
3751       Identity = ConstantVector::getSplat({VF, false}, Iden);
3752 
3753       // This vector is the Identity vector where the first element is the
3754       // incoming scalar reduction.
3755       VectorStart =
3756         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3757     }
3758   }
3759 
3760   // Wrap flags are in general invalid after vectorization, clear them.
3761   clearReductionWrapFlags(RdxDesc);
3762 
3763   // Fix the vector-loop phi.
3764 
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
3767   BasicBlock *Latch = OrigLoop->getLoopLatch();
3768   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3769 
3770   for (unsigned Part = 0; Part < UF; ++Part) {
3771     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3772     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3773     // Make sure to add the reduction start value only to the
3774     // first unroll part.
3775     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3776     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3777     cast<PHINode>(VecRdxPhi)
3778       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3779   }
3780 
3781   // Before each round, move the insertion point right between
3782   // the PHIs and the values we are going to write.
3783   // This allows us to write both PHINodes and the extractelement
3784   // instructions.
3785   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3786 
3787   setDebugLocFromInst(Builder, LoopExitInst);
3788 
3789   // If tail is folded by masking, the vector value to leave the loop should be
3790   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3791   // instead of the former.
3792   if (Cost->foldTailByMasking()) {
3793     for (unsigned Part = 0; Part < UF; ++Part) {
3794       Value *VecLoopExitInst =
3795           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3796       Value *Sel = nullptr;
3797       for (User *U : VecLoopExitInst->users()) {
3798         if (isa<SelectInst>(U)) {
3799           assert(!Sel && "Reduction exit feeding two selects");
3800           Sel = U;
3801         } else
3802           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3803       }
3804       assert(Sel && "Reduction exit feeds no select");
3805       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3806     }
3807   }
3808 
3809   // If the vector reduction can be performed in a smaller type, we truncate
3810   // then extend the loop exit value to enable InstCombine to evaluate the
3811   // entire expression in the smaller type.
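  // For illustration (assuming the recurrence type is i8 while the phi was
  // widened to <VF x i32>): each part is truncated to <VF x i8> and
  // immediately sign- or zero-extended back, with users redirected to the
  // extended value; in the middle block the parts are truncated once more so
  // the final reduction is performed on <VF x i8>.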
3812   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3813     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3814     Builder.SetInsertPoint(
3815         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3816     VectorParts RdxParts(UF);
3817     for (unsigned Part = 0; Part < UF; ++Part) {
3818       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3819       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3820       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3821                                         : Builder.CreateZExt(Trunc, VecTy);
3822       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3823            UI != RdxParts[Part]->user_end();)
3824         if (*UI != Trunc) {
3825           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3826           RdxParts[Part] = Extnd;
3827         } else {
3828           ++UI;
3829         }
3830     }
3831     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3832     for (unsigned Part = 0; Part < UF; ++Part) {
3833       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3834       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3835     }
3836   }
3837 
3838   // Reduce all of the unrolled parts into a single vector.
3839   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3840   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3841 
3842   // The middle block terminator has already been assigned a DebugLoc here (the
3843   // OrigLoop's single latch terminator). We want the whole middle block to
3844   // appear to execute on this line because: (a) it is all compiler generated,
3845   // (b) these instructions are always executed after evaluating the latch
3846   // conditional branch, and (c) other passes may add new predecessors which
3847   // terminate on this line. This is the easiest way to ensure we don't
3848   // accidentally cause an extra step back into the loop while debugging.
3849   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3850   for (unsigned Part = 1; Part < UF; ++Part) {
3851     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3852     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3853       // Floating point operations had to be 'fast' to enable the reduction.
3854       ReducedPartRdx = addFastMathFlag(
3855           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3856                               ReducedPartRdx, "bin.rdx"),
3857           RdxDesc.getFastMathFlags());
3858     else
3859       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3860                                       RdxPart);
3861   }
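  // For illustration (assuming UF = 2 and an integer add reduction): the loop
  // above folds the two unrolled parts into a single "bin.rdx" vector add,
  // which createTargetReduction then reduces to a scalar below when VF > 1.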
3862 
3863   if (VF > 1) {
3864     bool NoNaN = Legal->hasFunNoNaNAttr();
3865     ReducedPartRdx =
3866         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3867     // If the reduction can be performed in a smaller type, we need to extend
3868     // the reduction to the wider type before we branch to the original loop.
3869     if (Phi->getType() != RdxDesc.getRecurrenceType())
3870       ReducedPartRdx =
3871         RdxDesc.isSigned()
3872         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3873         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3874   }
3875 
3876   // Create a phi node that merges control-flow from the backedge-taken check
3877   // block and the middle block.
3878   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3879                                         LoopScalarPreHeader->getTerminator());
3880   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3881     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3882   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3883 
3884   // Now, we need to fix the users of the reduction variable
3885   // inside and outside of the scalar remainder loop.
3886   // We know that the loop is in LCSSA form. We need to update the
3887   // PHI nodes in the exit blocks.
3888   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3889     // All PHINodes need to have a single entry edge, or two if
3890     // we already fixed them.
3891     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3892 
3893     // We found a reduction value exit-PHI. Update it with the
3894     // incoming bypass edge.
3895     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3896       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3897   } // end of the LCSSA phi scan.
3898 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3901   int IncomingEdgeBlockIdx =
3902     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3903   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3904   // Pick the other block.
3905   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3906   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3907   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3908 }
3909 
3910 void InnerLoopVectorizer::clearReductionWrapFlags(
3911     RecurrenceDescriptor &RdxDesc) {
3912   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3913   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3914       RK != RecurrenceDescriptor::RK_IntegerMult)
3915     return;
3916 
3917   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3918   assert(LoopExitInstr && "null loop exit instruction");
3919   SmallVector<Instruction *, 8> Worklist;
3920   SmallPtrSet<Instruction *, 8> Visited;
3921   Worklist.push_back(LoopExitInstr);
3922   Visited.insert(LoopExitInstr);
3923 
3924   while (!Worklist.empty()) {
3925     Instruction *Cur = Worklist.pop_back_val();
3926     if (isa<OverflowingBinaryOperator>(Cur))
3927       for (unsigned Part = 0; Part < UF; ++Part) {
3928         Value *V = getOrCreateVectorValue(Cur, Part);
3929         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3930       }
3931 
3932     for (User *U : Cur->users()) {
3933       Instruction *UI = cast<Instruction>(U);
3934       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3935           Visited.insert(UI).second)
3936         Worklist.push_back(UI);
3937     }
3938   }
3939 }
3940 
3941 void InnerLoopVectorizer::fixLCSSAPHIs() {
3942   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3943     if (LCSSAPhi.getNumIncomingValues() == 1) {
3944       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // A non-instruction incoming value has only one value, so lane zero is
      // used; for an instruction, use the last lane unless it is uniform
      // after vectorization.
      unsigned LastLane = 0;
      if (isa<Instruction>(IncomingValue))
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF - 1;
3952       // Can be a loop invariant incoming value or the last scalar value to be
3953       // extracted from the vectorized loop.
3954       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3955       Value *lastIncomingValue =
3956           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3957       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3958     }
3959   }
3960 }
3961 
3962 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3963   // The basic block and loop containing the predicated instruction.
3964   auto *PredBB = PredInst->getParent();
3965   auto *VectorLoop = LI->getLoopFor(PredBB);
3966 
3967   // Initialize a worklist with the operands of the predicated instruction.
3968   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3969 
3970   // Holds instructions that we need to analyze again. An instruction may be
3971   // reanalyzed if we don't yet know if we can sink it or not.
3972   SmallVector<Instruction *, 8> InstsToReanalyze;
3973 
3974   // Returns true if a given use occurs in the predicated block. Phi nodes use
3975   // their operands in their corresponding predecessor blocks.
3976   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3977     auto *I = cast<Instruction>(U.getUser());
3978     BasicBlock *BB = I->getParent();
3979     if (auto *Phi = dyn_cast<PHINode>(I))
3980       BB = Phi->getIncomingBlock(
3981           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3982     return BB == PredBB;
3983   };
3984 
3985   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass through the worklist fails to sink a single instruction.
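  // For illustration: a scalarized address computation (e.g. a getelementptr)
  // whose only use is a predicated store can be moved into the store's block,
  // and its own operands are then reconsidered in case they can follow it.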
3989   bool Changed;
3990   do {
3991     // Add the instructions that need to be reanalyzed to the worklist, and
3992     // reset the changed indicator.
3993     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3994     InstsToReanalyze.clear();
3995     Changed = false;
3996 
3997     while (!Worklist.empty()) {
3998       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3999 
4000       // We can't sink an instruction if it is a phi node, is already in the
4001       // predicated block, is not in the loop, or may have side effects.
4002       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4003           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4004         continue;
4005 
4006       // It's legal to sink the instruction if all its uses occur in the
4007       // predicated block. Otherwise, there's nothing to do yet, and we may
4008       // need to reanalyze the instruction.
4009       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4010         InstsToReanalyze.push_back(I);
4011         continue;
4012       }
4013 
4014       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4016       I->moveBefore(&*PredBB->getFirstInsertionPt());
4017       Worklist.insert(I->op_begin(), I->op_end());
4018 
4019       // The sinking may have enabled other instructions to be sunk, so we will
4020       // need to iterate.
4021       Changed = true;
4022     }
4023   } while (Changed);
4024 }
4025 
4026 void InnerLoopVectorizer::fixNonInductionPHIs() {
4027   for (PHINode *OrigPhi : OrigPHIsToFix) {
4028     PHINode *NewPhi =
4029         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4030     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4031 
4032     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4033         predecessors(OrigPhi->getParent()));
4034     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4035         predecessors(NewPhi->getParent()));
4036     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4037            "Scalar and Vector BB should have the same number of predecessors");
4038 
4039     // The insertion point in Builder may be invalidated by the time we get
4040     // here. Force the Builder insertion point to something valid so that we do
4041     // not run into issues during insertion point restore in
4042     // getOrCreateVectorValue calls below.
4043     Builder.SetInsertPoint(NewPhi);
4044 
4045     // The predecessor order is preserved and we can rely on mapping between
4046     // scalar and vector block predecessors.
4047     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4048       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4049 
4050       // When looking up the new scalar/vector values to fix up, use incoming
4051       // values from original phi.
4052       Value *ScIncV =
4053           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4054 
      // A scalar incoming value may need a broadcast.
4056       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4057       NewPhi->addIncoming(NewIncV, NewPredBB);
4058     }
4059   }
4060 }
4061 
4062 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4063                                    unsigned VF, bool IsPtrLoopInvariant,
4064                                    SmallBitVector &IsIndexLoopInvariant) {
4065   // Construct a vector GEP by widening the operands of the scalar GEP as
4066   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4067   // results in a vector of pointers when at least one operand of the GEP
4068   // is vector-typed. Thus, to keep the representation compact, we only use
4069   // vector-typed operands for loop-varying values.
4070 
4071   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4072     // If we are vectorizing, but the GEP has only loop-invariant operands,
4073     // the GEP we build (by only using vector-typed operands for
4074     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4075     // produce a vector of pointers, we need to either arbitrarily pick an
4076     // operand to broadcast, or broadcast a clone of the original GEP.
4077     // Here, we broadcast a clone of the original.
4078     //
4079     // TODO: If at some point we decide to scalarize instructions having
4080     //       loop-invariant operands, this special case will no longer be
4081     //       required. We would add the scalarization decision to
4082     //       collectLoopScalars() and teach getVectorValue() to broadcast
4083     //       the lane-zero scalar value.
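    // For illustration (assuming VF = 4 and a GEP such as
    // 'getelementptr i32, i32* %base, i64 %n' whose operands are all
    // loop-invariant): we emit one scalar clone of the GEP and splat it into
    // a <4 x i32*> value for each unroll part.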
4084     auto *Clone = Builder.Insert(GEP->clone());
4085     for (unsigned Part = 0; Part < UF; ++Part) {
4086       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4087       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4088       addMetadata(EntryPart, GEP);
4089     }
4090   } else {
4091     // If the GEP has at least one loop-varying operand, we are sure to
4092     // produce a vector of pointers. But if we are only unrolling, we want
4093     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4094     // produce with the code below will be scalar (if VF == 1) or vector
4095     // (otherwise). Note that for the unroll-only case, we still maintain
4096     // values in the vector mapping with initVector, as we do for other
4097     // instructions.
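    // For illustration (assuming VF = 4 and 'getelementptr i32, i32* %base,
    // i64 %iv' with loop-invariant %base and loop-varying %iv): %base stays
    // scalar, %iv becomes a <4 x i64> operand, and the resulting GEP is a
    // <4 x i32*> vector of pointers.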
4098     for (unsigned Part = 0; Part < UF; ++Part) {
4099       // The pointer operand of the new GEP. If it's loop-invariant, we
4100       // won't broadcast it.
4101       auto *Ptr = IsPtrLoopInvariant
4102                       ? GEP->getPointerOperand()
4103                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4104 
4105       // Collect all the indices for the new GEP. If any index is
4106       // loop-invariant, we won't broadcast it.
4107       SmallVector<Value *, 4> Indices;
4108       for (auto Index : enumerate(GEP->indices())) {
4109         Value *User = Index.value().get();
4110         if (IsIndexLoopInvariant[Index.index()])
4111           Indices.push_back(User);
4112         else
4113           Indices.push_back(getOrCreateVectorValue(User, Part));
4114       }
4115 
4116       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4117       // but it should be a vector, otherwise.
4118       auto *NewGEP =
4119           GEP->isInBounds()
4120               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4121                                           Indices)
4122               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4123       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4124              "NewGEP is not a pointer vector");
4125       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4126       addMetadata(NewGEP, GEP);
4127     }
4128   }
4129 }
4130 
4131 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4132                                               unsigned VF) {
4133   PHINode *P = cast<PHINode>(PN);
4134   if (EnableVPlanNativePath) {
4135     // Currently we enter here in the VPlan-native path for non-induction
4136     // PHIs where all control flow is uniform. We simply widen these PHIs.
4137     // Create a vector phi with no operands - the vector phi operands will be
4138     // set at the end of vector code generation.
4139     Type *VecTy =
4140         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4141     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4142     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4143     OrigPHIsToFix.push_back(P);
4144 
4145     return;
4146   }
4147 
4148   assert(PN->getParent() == OrigLoop->getHeader() &&
4149          "Non-header phis should have been handled elsewhere");
4150 
4151   // In order to support recurrences we need to be able to vectorize Phi nodes.
4152   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4153   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4154   // this value when we vectorize all of the instructions that use the PHI.
4155   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4156     for (unsigned Part = 0; Part < UF; ++Part) {
4157       // This is phase one of vectorizing PHIs.
4158       Type *VecTy =
4159           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4160       Value *EntryPart = PHINode::Create(
4161           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4162       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4163     }
4164     return;
4165   }
4166 
4167   setDebugLocFromInst(Builder, P);
4168 
4169   // This PHINode must be an induction variable.
4170   // Make sure that we know about it.
4171   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4172 
4173   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4174   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4175 
4176   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4177   // which can be found from the original scalar operations.
4178   switch (II.getKind()) {
4179   case InductionDescriptor::IK_NoInduction:
4180     llvm_unreachable("Unknown induction");
4181   case InductionDescriptor::IK_IntInduction:
4182   case InductionDescriptor::IK_FpInduction:
4183     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4184   case InductionDescriptor::IK_PtrInduction: {
4185     // Handle the pointer induction variable case.
4186     assert(P->getType()->isPointerTy() && "Unexpected type.");
4187     // This is the normalized GEP that starts counting at zero.
4188     Value *PtrInd = Induction;
4189     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4190     // Determine the number of scalars we need to generate for each unroll
4191     // iteration. If the instruction is uniform, we only need to generate the
4192     // first lane. Otherwise, we generate all VF values.
4193     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4194     // These are the scalar results. Notice that we don't generate vector GEPs
4195     // because scalar GEPs result in better code.
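    // For illustration (assuming VF = 4, UF = 2 and a non-uniform pointer
    // induction): eight scalar "next.gep" values are emitted, one per
    // (part, lane) pair, at offsets 0..7 from the normalized induction.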
4196     for (unsigned Part = 0; Part < UF; ++Part) {
4197       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4198         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4199         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4200         Value *SclrGep =
4201             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4202         SclrGep->setName("next.gep");
4203         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4204       }
4205     }
4206     return;
4207   }
4208   }
4209 }
4210 
4211 /// A helper function for checking whether an integer division-related
4212 /// instruction may divide by zero (in which case it must be predicated if
4213 /// executed conditionally in the scalar code).
4214 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4216 /// converted into multiplication, so we will still end up scalarizing
4217 /// the division, but can do so w/o predication.
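/// For example, 'udiv i32 %x, 3' cannot divide by zero and needs no
/// predication, whereas a division by a non-constant (or zero-constant)
/// divisor is conservatively treated as potentially dividing by zero.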
4218 static bool mayDivideByZero(Instruction &I) {
4219   assert((I.getOpcode() == Instruction::UDiv ||
4220           I.getOpcode() == Instruction::SDiv ||
4221           I.getOpcode() == Instruction::URem ||
4222           I.getOpcode() == Instruction::SRem) &&
4223          "Unexpected instruction");
4224   Value *Divisor = I.getOperand(1);
4225   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4226   return !CInt || CInt->isZero();
4227 }
4228 
4229 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4230   switch (I.getOpcode()) {
4231   case Instruction::Call:
4232   case Instruction::Br:
4233   case Instruction::PHI:
4234   case Instruction::GetElementPtr:
4235     llvm_unreachable("This instruction is handled by a different recipe.");
4236   case Instruction::UDiv:
4237   case Instruction::SDiv:
4238   case Instruction::SRem:
4239   case Instruction::URem:
4240   case Instruction::Add:
4241   case Instruction::FAdd:
4242   case Instruction::Sub:
4243   case Instruction::FSub:
4244   case Instruction::FNeg:
4245   case Instruction::Mul:
4246   case Instruction::FMul:
4247   case Instruction::FDiv:
4248   case Instruction::FRem:
4249   case Instruction::Shl:
4250   case Instruction::LShr:
4251   case Instruction::AShr:
4252   case Instruction::And:
4253   case Instruction::Or:
4254   case Instruction::Xor: {
4255     // Just widen unops and binops.
4256     setDebugLocFromInst(Builder, &I);
4257 
4258     for (unsigned Part = 0; Part < UF; ++Part) {
4259       SmallVector<Value *, 2> Ops;
4260       for (Value *Op : I.operands())
4261         Ops.push_back(getOrCreateVectorValue(Op, Part));
4262 
4263       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4264 
4265       if (auto *VecOp = dyn_cast<Instruction>(V))
4266         VecOp->copyIRFlags(&I);
4267 
4268       // Use this vector value for all users of the original instruction.
4269       VectorLoopValueMap.setVectorValue(&I, Part, V);
4270       addMetadata(V, &I);
4271     }
4272 
4273     break;
4274   }
4275   case Instruction::Select: {
4276     // Widen selects.
4277     // If the selector is loop invariant we can create a select
4278     // instruction with a scalar condition. Otherwise, use vector-select.
4279     auto *SE = PSE.getSE();
4280     bool InvariantCond =
4281         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4282     setDebugLocFromInst(Builder, &I);
4283 
    // The condition can be loop invariant but still defined inside the
    // loop. This means that we can't just use the original 'cond' value.
4286     // We have to take the 'vectorized' value and pick the first lane.
4287     // Instcombine will make this a no-op.
4288 
4289     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4290 
4291     for (unsigned Part = 0; Part < UF; ++Part) {
4292       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4293       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4294       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4295       Value *Sel =
4296           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4297       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4298       addMetadata(Sel, &I);
4299     }
4300 
4301     break;
4302   }
4303 
4304   case Instruction::ICmp:
4305   case Instruction::FCmp: {
4306     // Widen compares. Generate vector compares.
4307     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4308     auto *Cmp = cast<CmpInst>(&I);
4309     setDebugLocFromInst(Builder, Cmp);
4310     for (unsigned Part = 0; Part < UF; ++Part) {
4311       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4312       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4313       Value *C = nullptr;
4314       if (FCmp) {
4315         // Propagate fast math flags.
4316         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4317         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4318         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4319       } else {
4320         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4321       }
4322       VectorLoopValueMap.setVectorValue(&I, Part, C);
4323       addMetadata(C, &I);
4324     }
4325 
4326     break;
4327   }
4328 
4329   case Instruction::ZExt:
4330   case Instruction::SExt:
4331   case Instruction::FPToUI:
4332   case Instruction::FPToSI:
4333   case Instruction::FPExt:
4334   case Instruction::PtrToInt:
4335   case Instruction::IntToPtr:
4336   case Instruction::SIToFP:
4337   case Instruction::UIToFP:
4338   case Instruction::Trunc:
4339   case Instruction::FPTrunc:
4340   case Instruction::BitCast: {
4341     auto *CI = cast<CastInst>(&I);
4342     setDebugLocFromInst(Builder, CI);
4343 
    // Vectorize casts.
4345     Type *DestTy =
4346         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4347 
4348     for (unsigned Part = 0; Part < UF; ++Part) {
4349       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4350       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4351       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4352       addMetadata(Cast, &I);
4353     }
4354     break;
4355   }
4356   default:
4357     // This instruction is not vectorized by simple widening.
4358     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4359     llvm_unreachable("Unhandled instruction!");
4360   } // end of switch.
4361 }
4362 
4363 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4364                                                VPTransformState &State) {
4365   assert(!isa<DbgInfoIntrinsic>(I) &&
4366          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4367   setDebugLocFromInst(Builder, &I);
4368 
4369   Module *M = I.getParent()->getParent()->getParent();
4370   auto *CI = cast<CallInst>(&I);
4371 
4372   SmallVector<Type *, 4> Tys;
4373   for (Value *ArgOperand : CI->arg_operands())
4374     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4375 
4376   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4377 
  // The flag shows whether we use an intrinsic or a regular call for the
  // vectorized version of the instruction, i.e. whether the intrinsic call is
  // more beneficial than a library call.
4381   bool NeedToScalarize = false;
4382   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4383   bool UseVectorIntrinsic =
4384       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4385   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4386          "Instruction should be scalarized elsewhere.");
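  // For illustration: a scalar call that maps to 'llvm.sqrt.f32' may be
  // widened to the 'llvm.sqrt.v4f32' intrinsic (assuming VF = 4), or to a
  // vector library routine registered in the VFDatabase, whichever the cost
  // model found cheaper.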
4387 
4388   for (unsigned Part = 0; Part < UF; ++Part) {
4389     SmallVector<Value *, 4> Args;
4390     for (auto &I : enumerate(ArgOperands.operands())) {
4391       // Some intrinsics have a scalar argument - don't replace it with a
4392       // vector.
4393       Value *Arg;
4394       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4395         Arg = State.get(I.value(), Part);
4396       else
4397         Arg = State.get(I.value(), {0, 0});
4398       Args.push_back(Arg);
4399     }
4400 
4401     Function *VectorF;
4402     if (UseVectorIntrinsic) {
4403       // Use vector version of the intrinsic.
4404       Type *TysForDecl[] = {CI->getType()};
4405       if (VF > 1)
4406         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4407       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4408     } else {
4409       // Use vector version of the function call.
4410       const VFShape Shape =
4411           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
#ifndef NDEBUG
      const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
      assert(std::find_if(Infos.begin(), Infos.end(),
                          [&Shape](const VFInfo &Info) {
                            return Info.Shape == Shape;
                          }) != Infos.end() &&
             "Vector function shape is missing from the database.");
#endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    assert(VectorF && "Can't create vector function.");

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
  }
4434 }
4435 
4436 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4437   // We should not collect Scalars more than once per VF. Right now, this
4438   // function is called from collectUniformsAndScalars(), which already does
4439   // this check. Collecting Scalars for VF=1 does not make any sense.
4440   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4441          "This function should not be visited twice for the same VF");
4442 
4443   SmallSetVector<Instruction *, 8> Worklist;
4444 
4445   // These sets are used to seed the analysis with pointers used by memory
4446   // accesses that will remain scalar.
4447   SmallSetVector<Instruction *, 8> ScalarPtrs;
4448   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4449 
4450   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4451   // The pointer operands of loads and stores will be scalar as long as the
4452   // memory access is not a gather or scatter operation. The value operand of a
4453   // store will remain scalar if the store is scalarized.
4454   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4455     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4456     assert(WideningDecision != CM_Unknown &&
4457            "Widening decision should be ready at this moment");
4458     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4459       if (Ptr == Store->getValueOperand())
4460         return WideningDecision == CM_Scalarize;
4461     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4463     return WideningDecision != CM_GatherScatter;
4464   };
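  // For illustration: the pointer feeding a consecutive (widened) load or
  // store remains a scalar use, the same pointer feeding a gather or scatter
  // does not, and the value operand of a store is a scalar use only when the
  // store itself is scalarized.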
4465 
4466   // A helper that returns true if the given value is a bitcast or
4467   // getelementptr instruction contained in the loop.
4468   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4469     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4470             isa<GetElementPtrInst>(V)) &&
4471            !TheLoop->isLoopInvariant(V);
4472   };
4473 
4474   // A helper that evaluates a memory access's use of a pointer. If the use
4475   // will be a scalar use, and the pointer is only used by memory accesses, we
4476   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4477   // PossibleNonScalarPtrs.
4478   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4479     // We only care about bitcast and getelementptr instructions contained in
4480     // the loop.
4481     if (!isLoopVaryingBitCastOrGEP(Ptr))
4482       return;
4483 
4484     // If the pointer has already been identified as scalar (e.g., if it was
4485     // also identified as uniform), there's nothing to do.
4486     auto *I = cast<Instruction>(Ptr);
4487     if (Worklist.count(I))
4488       return;
4489 
4490     // If the use of the pointer will be a scalar use, and all users of the
4491     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4492     // place the pointer in PossibleNonScalarPtrs.
4493     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4494           return isa<LoadInst>(U) || isa<StoreInst>(U);
4495         }))
4496       ScalarPtrs.insert(I);
4497     else
4498       PossibleNonScalarPtrs.insert(I);
4499   };
4500 
4501   // We seed the scalars analysis with three classes of instructions: (1)
4502   // instructions marked uniform-after-vectorization, (2) bitcast and
4503   // getelementptr instructions used by memory accesses requiring a scalar use,
4504   // and (3) pointer induction variables and their update instructions (we
4505   // currently only scalarize these).
4506   //
4507   // (1) Add to the worklist all instructions that have been identified as
4508   // uniform-after-vectorization.
4509   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4510 
4511   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4512   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4514   // scatter operation. The value operand of a store will remain scalar if the
4515   // store is scalarized.
4516   for (auto *BB : TheLoop->blocks())
4517     for (auto &I : *BB) {
4518       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4519         evaluatePtrUse(Load, Load->getPointerOperand());
4520       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4521         evaluatePtrUse(Store, Store->getPointerOperand());
4522         evaluatePtrUse(Store, Store->getValueOperand());
4523       }
4524     }
4525   for (auto *I : ScalarPtrs)
4526     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4527       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4528       Worklist.insert(I);
4529     }
4530 
4531   // (3) Add to the worklist all pointer induction variables and their update
4532   // instructions.
4533   //
4534   // TODO: Once we are able to vectorize pointer induction variables we should
4535   //       no longer insert them into the worklist here.
4536   auto *Latch = TheLoop->getLoopLatch();
4537   for (auto &Induction : Legal->getInductionVars()) {
4538     auto *Ind = Induction.first;
4539     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4540     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4541       continue;
4542     Worklist.insert(Ind);
4543     Worklist.insert(IndUpdate);
4544     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4545     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4546                       << "\n");
4547   }
4548 
4549   // Insert the forced scalars.
4550   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4551   // induction variable when the PHI user is scalarized.
4552   auto ForcedScalar = ForcedScalars.find(VF);
4553   if (ForcedScalar != ForcedScalars.end())
4554     for (auto *I : ForcedScalar->second)
4555       Worklist.insert(I);
4556 
4557   // Expand the worklist by looking through any bitcasts and getelementptr
4558   // instructions we've already identified as scalar. This is similar to the
4559   // expansion step in collectLoopUniforms(); however, here we're only
4560   // expanding to include additional bitcasts and getelementptr instructions.
4561   unsigned Idx = 0;
4562   while (Idx != Worklist.size()) {
4563     Instruction *Dst = Worklist[Idx++];
4564     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4565       continue;
4566     auto *Src = cast<Instruction>(Dst->getOperand(0));
4567     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4568           auto *J = cast<Instruction>(U);
4569           return !TheLoop->contains(J) || Worklist.count(J) ||
4570                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4571                   isScalarUse(J, Src));
4572         })) {
4573       Worklist.insert(Src);
4574       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4575     }
4576   }
4577 
4578   // An induction variable will remain scalar if all users of the induction
4579   // variable and induction variable update remain scalar.
4580   for (auto &Induction : Legal->getInductionVars()) {
4581     auto *Ind = Induction.first;
4582     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4583 
4584     // We already considered pointer induction variables, so there's no reason
4585     // to look at their users again.
4586     //
4587     // TODO: Once we are able to vectorize pointer induction variables we
4588     //       should no longer skip over them here.
4589     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4590       continue;
4591 
4592     // Determine if all users of the induction variable are scalar after
4593     // vectorization.
4594     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4595       auto *I = cast<Instruction>(U);
4596       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4597     });
4598     if (!ScalarInd)
4599       continue;
4600 
4601     // Determine if all users of the induction variable update instruction are
4602     // scalar after vectorization.
4603     auto ScalarIndUpdate =
4604         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4605           auto *I = cast<Instruction>(U);
4606           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4607         });
4608     if (!ScalarIndUpdate)
4609       continue;
4610 
4611     // The induction variable and its update instruction will remain scalar.
4612     Worklist.insert(Ind);
4613     Worklist.insert(IndUpdate);
4614     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4615     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4616                       << "\n");
4617   }
4618 
4619   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4620 }
4621 
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
                                                         unsigned VF) {
  if (!blockNeedsPredication(I->getParent()))
    return false;
  switch (I->getOpcode()) {
4626   default:
4627     break;
4628   case Instruction::Load:
4629   case Instruction::Store: {
4630     if (!Legal->isMaskRequired(I))
4631       return false;
4632     auto *Ptr = getLoadStorePointerOperand(I);
4633     auto *Ty = getMemInstValueType(I);
4634     // We have already decided how to vectorize this instruction, get that
4635     // result.
4636     if (VF > 1) {
4637       InstWidening WideningDecision = getWideningDecision(I, VF);
4638       assert(WideningDecision != CM_Unknown &&
4639              "Widening decision should be ready at this moment");
4640       return WideningDecision == CM_Scalarize;
4641     }
4642     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4643     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4644                                 isLegalMaskedGather(Ty, Alignment))
4645                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4646                                 isLegalMaskedScatter(Ty, Alignment));
4647   }
4648   case Instruction::UDiv:
4649   case Instruction::SDiv:
4650   case Instruction::SRem:
4651   case Instruction::URem:
4652     return mayDivideByZero(*I);
4653   }
4654   return false;
4655 }
4656 
4657 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4658                                                                unsigned VF) {
4659   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4660   assert(getWideningDecision(I, VF) == CM_Unknown &&
4661          "Decision should not be set yet.");
4662   auto *Group = getInterleavedAccessGroup(I);
4663   assert(Group && "Must have a group.");
4664 
  // If the instruction's allocated size doesn't equal its type size, it
4666   // requires padding and will be scalarized.
4667   auto &DL = I->getModule()->getDataLayout();
4668   auto *ScalarTy = getMemInstValueType(I);
4669   if (hasIrregularType(ScalarTy, DL, VF))
4670     return false;
4671 
4672   // Check if masking is required.
4673   // A Group may need masking for one of two reasons: it resides in a block that
4674   // needs predication, or it was decided to use masking to deal with gaps.
4675   bool PredicatedAccessRequiresMasking =
4676       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4677   bool AccessWithGapsRequiresMasking =
4678       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4679   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4680     return true;
4681 
4682   // If masked interleaving is required, we expect that the user/target had
4683   // enabled it, because otherwise it either wouldn't have been created or
4684   // it should have been invalidated by the CostModel.
4685   assert(useMaskedInterleavedAccesses(TTI) &&
4686          "Masked interleave-groups for predicated accesses are not enabled.");
4687 
4688   auto *Ty = getMemInstValueType(I);
4689   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4690   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4691                           : TTI.isLegalMaskedStore(Ty, Alignment);
4692 }
4693 
4694 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4695                                                                unsigned VF) {
4696   // Get and ensure we have a valid memory instruction.
4697   LoadInst *LI = dyn_cast<LoadInst>(I);
4698   StoreInst *SI = dyn_cast<StoreInst>(I);
4699   assert((LI || SI) && "Invalid memory instruction");
4700 
4701   auto *Ptr = getLoadStorePointerOperand(I);
4702 
4703   // In order to be widened, the pointer should be consecutive, first of all.
4704   if (!Legal->isConsecutivePtr(Ptr))
4705     return false;
4706 
4707   // If the instruction is a store located in a predicated block, it will be
4708   // scalarized.
4709   if (isScalarWithPredication(I))
4710     return false;
4711 
  // If the instruction's allocated size doesn't equal its type size, it
4713   // requires padding and will be scalarized.
4714   auto &DL = I->getModule()->getDataLayout();
4715   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4716   if (hasIrregularType(ScalarTy, DL, VF))
4717     return false;
4718 
4719   return true;
4720 }
4721 
4722 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4723   // We should not collect Uniforms more than once per VF. Right now,
4724   // this function is called from collectUniformsAndScalars(), which
4725   // already does this check. Collecting Uniforms for VF=1 does not make any
4726   // sense.
4727 
4728   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4729          "This function should not be visited twice for the same VF");
4730 
  // Create the entry for this VF up front. Even if we do not find any uniform
  // value, we will not analyze it again: Uniforms.count(VF) will return 1.
4733   Uniforms[VF].clear();
4734 
4735   // We now know that the loop is vectorizable!
4736   // Collect instructions inside the loop that will remain uniform after
4737   // vectorization.
4738 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
4741   auto isOutOfScope = [&](Value *V) -> bool {
4742     Instruction *I = dyn_cast<Instruction>(V);
4743     return (!I || !TheLoop->contains(I));
4744   };
4745 
4746   SetVector<Instruction *> Worklist;
4747   BasicBlock *Latch = TheLoop->getLoopLatch();
4748 
4749   // Instructions that are scalar with predication must not be considered
4750   // uniform after vectorization, because that would create an erroneous
4751   // replicating region where only a single instance out of VF should be formed.
4752   // TODO: optimize such seldom cases if found important, see PR40816.
4753   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4754     if (isScalarWithPredication(I, VF)) {
4755       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4756                         << *I << "\n");
4757       return;
4758     }
4759     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4760     Worklist.insert(I);
4761   };
4762 
4763   // Start with the conditional branch. If the branch condition is an
4764   // instruction contained in the loop that is only used by the branch, it is
4765   // uniform.
4766   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4767   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4768     addToWorklistIfAllowed(Cmp);
4769 
4770   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4771   // are pointers that are treated like consecutive pointers during
4772   // vectorization. The pointer operands of interleaved accesses are an
4773   // example.
4774   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4775 
4776   // Holds pointer operands of instructions that are possibly non-uniform.
4777   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4778 
4779   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4780     InstWidening WideningDecision = getWideningDecision(I, VF);
4781     assert(WideningDecision != CM_Unknown &&
4782            "Widening decision should be ready at this moment");
4783 
4784     return (WideningDecision == CM_Widen ||
4785             WideningDecision == CM_Widen_Reverse ||
4786             WideningDecision == CM_Interleave);
4787   };
4788   // Iterate over the instructions in the loop, and collect all
4789   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4790   // that a consecutive-like pointer operand will be scalarized, we collect it
4791   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4792   // getelementptr instruction can be used by both vectorized and scalarized
4793   // memory instructions. For example, if a loop loads and stores from the same
4794   // location, but the store is conditional, the store will be scalarized, and
4795   // the getelementptr won't remain uniform.
4796   for (auto *BB : TheLoop->blocks())
4797     for (auto &I : *BB) {
4798       // If there's no pointer operand, there's nothing to do.
4799       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4800       if (!Ptr)
4801         continue;
4802 
4803       // True if all users of Ptr are memory accesses that have Ptr as their
4804       // pointer operand.
4805       auto UsersAreMemAccesses =
4806           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4807             return getLoadStorePointerOperand(U) == Ptr;
4808           });
4809 
4810       // Ensure the memory instruction will not be scalarized or used by
4811       // gather/scatter, making its pointer operand non-uniform. If the pointer
4812       // operand is used by any instruction other than a memory access, we
4813       // conservatively assume the pointer operand may be non-uniform.
4814       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4815         PossibleNonUniformPtrs.insert(Ptr);
4816 
4817       // If the memory instruction will be vectorized and its pointer operand
4818       // is consecutive-like, or interleaving - the pointer operand should
4819       // remain uniform.
4820       else
4821         ConsecutiveLikePtrs.insert(Ptr);
4822     }
4823 
4824   // Add to the Worklist all consecutive and consecutive-like pointers that
4825   // aren't also identified as possibly non-uniform.
4826   for (auto *V : ConsecutiveLikePtrs)
4827     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4828       addToWorklistIfAllowed(V);
4829 
4830   // Expand Worklist in topological order: whenever a new instruction
4831   // is added, its users should already be inside Worklist. This ensures
4832   // a uniform instruction will only be used by uniform instructions.
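  // For example, an index computation that is used only by a uniform
  // getelementptr already in the worklist becomes uniform as well.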
4833   unsigned idx = 0;
4834   while (idx != Worklist.size()) {
4835     Instruction *I = Worklist[idx++];
4836 
4837     for (auto OV : I->operand_values()) {
4838       // isOutOfScope operands cannot be uniform instructions.
4839       if (isOutOfScope(OV))
4840         continue;
4841       // First-order recurrence phis should typically be considered
4842       // non-uniform.
4843       auto *OP = dyn_cast<PHINode>(OV);
4844       if (OP && Legal->isFirstOrderRecurrence(OP))
4845         continue;
4846       // If all the users of the operand are uniform, then add the
4847       // operand into the uniform worklist.
4848       auto *OI = cast<Instruction>(OV);
4849       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4850             auto *J = cast<Instruction>(U);
4851             return Worklist.count(J) ||
4852                    (OI == getLoadStorePointerOperand(J) &&
4853                     isUniformDecision(J, VF));
4854           }))
4855         addToWorklistIfAllowed(OI);
4856     }
4857   }
4858 
4859   // Returns true if Ptr is the pointer operand of a memory access instruction
4860   // I, and I is known to not require scalarization.
4861   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4862     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4863   };
4864 
4865   // For an instruction to be added into Worklist above, all its users inside
4866   // the loop should also be in Worklist. However, this condition cannot be
4867   // true for phi nodes that form a cyclic dependence. We must process phi
4868   // nodes separately. An induction variable will remain uniform if all users
4869   // of the induction variable and induction variable update remain uniform.
4870   // The code below handles both pointer and non-pointer induction variables.
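  // For example, a pointer induction used only by consecutive, widened memory
  // accesses remains uniform, while one that also feeds a scalarized access
  // does not.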
4871   for (auto &Induction : Legal->getInductionVars()) {
4872     auto *Ind = Induction.first;
4873     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4874 
4875     // Determine if all users of the induction variable are uniform after
4876     // vectorization.
4877     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4878       auto *I = cast<Instruction>(U);
4879       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4880              isVectorizedMemAccessUse(I, Ind);
4881     });
4882     if (!UniformInd)
4883       continue;
4884 
4885     // Determine if all users of the induction variable update instruction are
4886     // uniform after vectorization.
4887     auto UniformIndUpdate =
4888         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4889           auto *I = cast<Instruction>(U);
4890           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4891                  isVectorizedMemAccessUse(I, IndUpdate);
4892         });
4893     if (!UniformIndUpdate)
4894       continue;
4895 
4896     // The induction variable and its update instruction will remain uniform.
4897     addToWorklistIfAllowed(Ind);
4898     addToWorklistIfAllowed(IndUpdate);
4899   }
4900 
4901   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4902 }
4903 
4904 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4905   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4906 
4907   if (Legal->getRuntimePointerChecking()->Need) {
4908     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4909         "runtime pointer checks needed. Enable vectorization of this "
4910         "loop with '#pragma clang loop vectorize(enable)' when "
4911         "compiling with -Os/-Oz",
4912         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4913     return true;
4914   }
4915 
4916   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4917     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4918         "runtime SCEV checks needed. Enable vectorization of this "
4919         "loop with '#pragma clang loop vectorize(enable)' when "
4920         "compiling with -Os/-Oz",
4921         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4922     return true;
4923   }
4924 
4925   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4926   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4927     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4928         "runtime stride == 1 checks needed. Enable vectorization of "
4929         "this loop with '#pragma clang loop vectorize(enable)' when "
4930         "compiling with -Os/-Oz",
4931         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4932     return true;
4933   }
4934 
4935   return false;
4936 }
4937 
4938 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4939   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4940     // TODO: It may be useful to do this, since the check is still likely to
4941     // be dynamically uniform if the target can skip it.
4942     reportVectorizationFailure(
4943         "Not inserting runtime ptr check for divergent target",
4944         "runtime pointer checks needed. Not enabled for divergent target",
4945         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4946     return None;
4947   }
4948 
4949   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4950   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4951   if (TC == 1) {
4952     reportVectorizationFailure("Single iteration (non) loop",
4953         "loop trip count is one, irrelevant for vectorization",
4954         "SingleIterationLoop", ORE, TheLoop);
4955     return None;
4956   }
4957 
4958   switch (ScalarEpilogueStatus) {
4959   case CM_ScalarEpilogueAllowed:
4960     return computeFeasibleMaxVF(TC);
4961   case CM_ScalarEpilogueNotNeededUsePredicate:
4962     LLVM_DEBUG(
4963         dbgs() << "LV: vector predicate hint/switch found.\n"
4964                << "LV: Not allowing scalar epilogue, creating predicated "
4965                << "vector loop.\n");
4966     break;
4967   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4968     // fallthrough as a special case of OptForSize
4969   case CM_ScalarEpilogueNotAllowedOptSize:
4970     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4971       LLVM_DEBUG(
4972           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4973     else
4974       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4975                         << "count.\n");
4976 
4977     // Bail if runtime checks are required, which are not good when optimising
4978     // for size.
4979     if (runtimeChecksRequired())
4980       return None;
4981     break;
4982   }
4983 
4984   // Now try to fold the tail by masking.
4985 
4986   // Invalidate interleave groups that require an epilogue if we can't mask
4987   // the interleave-group.
4988   if (!useMaskedInterleavedAccesses(TTI))
4989     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4990 
4991   unsigned MaxVF = computeFeasibleMaxVF(TC);
4992   if (TC > 0 && TC % MaxVF == 0) {
4993     // Accept MaxVF if we do not have a tail.
4994     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4995     return MaxVF;
4996   }
4997 
4998   // If we don't know the precise trip count, or if the trip count that we
4999   // found modulo the vectorization factor is not zero, try to fold the tail
5000   // by masking.
5001   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5002   if (Legal->prepareToFoldTailByMasking()) {
5003     FoldTailByMasking = true;
5004     return MaxVF;
5005   }
5006 
5007   if (TC == 0) {
5008     reportVectorizationFailure(
5009         "Unable to calculate the loop count due to complex control flow",
5010         "unable to calculate the loop count due to complex control flow",
5011         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5012     return None;
5013   }
5014 
5015   reportVectorizationFailure(
5016       "Cannot optimize for size and vectorize at the same time.",
5017       "cannot optimize for size and vectorize at the same time. "
5018       "Enable vectorization of this loop with '#pragma clang loop "
5019       "vectorize(enable)' when compiling with -Os/-Oz",
5020       "NoTailLoopWithOptForSize", ORE, TheLoop);
5021   return None;
5022 }
5023 
5024 unsigned
5025 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5026   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5027   unsigned SmallestType, WidestType;
5028   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5029   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5030 
5031   // Get the maximum safe dependence distance in bits computed by LAA.
5032   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5033   // the memory access that is most restrictive (involved in the smallest
5034   // dependence distance).
5035   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5036 
5037   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5038 
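  // For example, a 256-bit target register clamped to a 128-bit safe width,
  // with a widest type of 32 bits, yields MaxVectorSize = 128 / 32 = 4.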
5039   unsigned MaxVectorSize = WidestRegister / WidestType;
5040 
5041   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5042                     << " / " << WidestType << " bits.\n");
5043   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5044                     << WidestRegister << " bits.\n");
5045 
5046   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5047                                  " into one vector!");
5048   if (MaxVectorSize == 0) {
5049     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5050     MaxVectorSize = 1;
5051     return MaxVectorSize;
5052   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5053              isPowerOf2_32(ConstTripCount)) {
5054     // We need to clamp the VF to be the ConstTripCount. There is no point in
5055     // choosing a higher viable VF as done in the loop below.
5056     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5057                       << ConstTripCount << "\n");
5058     MaxVectorSize = ConstTripCount;
5059     return MaxVectorSize;
5060   }
5061 
5062   unsigned MaxVF = MaxVectorSize;
5063   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5064       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5065     // Collect all viable vectorization factors larger than the default MaxVF
5066     // (i.e. MaxVectorSize).
5067     SmallVector<unsigned, 8> VFs;
5068     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5069     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5070       VFs.push_back(VS);
5071 
5072     // For each VF calculate its register usage.
5073     auto RUs = calculateRegisterUsage(VFs);
5074 
5075     // Select the largest VF which doesn't require more registers than existing
5076     // ones.
5077     for (int i = RUs.size() - 1; i >= 0; --i) {
5078       bool Selected = true;
5079       for (auto& pair : RUs[i].MaxLocalUsers) {
5080         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5081         if (pair.second > TargetNumRegisters)
5082           Selected = false;
5083       }
5084       if (Selected) {
5085         MaxVF = VFs[i];
5086         break;
5087       }
5088     }
5089     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5090       if (MaxVF < MinVF) {
5091         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5092                           << ") with target's minimum: " << MinVF << '\n');
5093         MaxVF = MinVF;
5094       }
5095     }
5096   }
5097   return MaxVF;
5098 }
5099 
5100 VectorizationFactor
5101 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5102   float Cost = expectedCost(1).first;
5103   const float ScalarCost = Cost;
5104   unsigned Width = 1;
5105   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5106 
5107   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5108   if (ForceVectorization && MaxVF > 1) {
5109     // Ignore scalar width, because the user explicitly wants vectorization.
5110     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5111     // evaluation.
5112     Cost = std::numeric_limits<float>::max();
5113   }
5114 
5115   for (unsigned i = 2; i <= MaxVF; i *= 2) {
5116     // Notice that the vector loop needs to be executed fewer times, so
5117     // we need to divide the cost of the vector loop by the width of
5118     // the vector elements.
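    // For example, a VF-of-4 loop with a total cost of 20 has a per-iteration
    // cost of 5, which is then compared against the scalar cost computed above.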
5119     VectorizationCostTy C = expectedCost(i);
5120     float VectorCost = C.first / (float)i;
5121     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5122                       << " costs: " << (int)VectorCost << ".\n");
5123     if (!C.second && !ForceVectorization) {
5124       LLVM_DEBUG(
5125           dbgs() << "LV: Not considering vector loop of width " << i
5126                  << " because it will not generate any vector instructions.\n");
5127       continue;
5128     }
5129     if (VectorCost < Cost) {
5130       Cost = VectorCost;
5131       Width = i;
5132     }
5133   }
5134 
5135   if (!EnableCondStoresVectorization && NumPredStores) {
5136     reportVectorizationFailure("There are conditional stores.",
5137         "store that is conditionally executed prevents vectorization",
5138         "ConditionalStore", ORE, TheLoop);
5139     Width = 1;
5140     Cost = ScalarCost;
5141   }
5142 
5143   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5144              << "LV: Vectorization seems to be not beneficial, "
5145              << "but was forced by a user.\n");
5146   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5147   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5148   return Factor;
5149 }
5150 
5151 std::pair<unsigned, unsigned>
5152 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5153   unsigned MinWidth = -1U;
5154   unsigned MaxWidth = 8;
5155   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5156 
5157   // For each block.
5158   for (BasicBlock *BB : TheLoop->blocks()) {
5159     // For each instruction in the loop.
5160     for (Instruction &I : BB->instructionsWithoutDebug()) {
5161       Type *T = I.getType();
5162 
5163       // Skip ignored values.
5164       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5165         continue;
5166 
5167       // Only examine Loads, Stores and PHINodes.
5168       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5169         continue;
5170 
5171       // Examine PHI nodes that are reduction variables. Update the type to
5172       // account for the recurrence type.
5173       if (auto *PN = dyn_cast<PHINode>(&I)) {
5174         if (!Legal->isReductionVariable(PN))
5175           continue;
5176         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5177         T = RdxDesc.getRecurrenceType();
5178       }
5179 
5180       // Examine the stored values.
5181       if (auto *ST = dyn_cast<StoreInst>(&I))
5182         T = ST->getValueOperand()->getType();
5183 
5184       // Ignore loaded pointer types and stored pointer types that are not
5185       // vectorizable.
5186       //
5187       // FIXME: The check here attempts to predict whether a load or store will
5188       //        be vectorized. We only know this for certain after a VF has
5189       //        been selected. Here, we assume that if an access can be
5190       //        vectorized, it will be. We should also look at extending this
5191       //        optimization to non-pointer types.
5192       //
5193       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5194           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5195         continue;
5196 
5197       MinWidth = std::min(MinWidth,
5198                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5199       MaxWidth = std::max(MaxWidth,
5200                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5201     }
5202   }
5203 
5204   return {MinWidth, MaxWidth};
5205 }
5206 
5207 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5208                                                            unsigned LoopCost) {
5209   // -- The interleave heuristics --
5210   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5211   // There are many micro-architectural considerations that we can't predict
5212   // at this level. For example, frontend pressure (on decode or fetch) due to
5213   // code size, or the number and capabilities of the execution ports.
5214   //
5215   // We use the following heuristics to select the interleave count:
5216   // 1. If the code has reductions, then we interleave to break the cross
5217   // iteration dependency.
5218   // 2. If the loop is really small, then we interleave to reduce the loop
5219   // overhead.
5220   // 3. We don't interleave if we think that we will spill registers to memory
5221   // due to the increased register pressure.
5222 
5223   if (!isScalarEpilogueAllowed())
5224     return 1;
5225 
5226   // We used the distance for the interleave count.
5227   if (Legal->getMaxSafeDepDistBytes() != -1U)
5228     return 1;
5229 
5230   // Do not interleave loops with a relatively small known or estimated trip
5231   // count.
5232   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5233   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5234     return 1;
5235 
5236   RegisterUsage R = calculateRegisterUsage({VF})[0];
5237   // We divide by these counts below, so make sure each is at least one, i.e.
5238   // assume that at least one instruction uses at least one register.
5239   for (auto& pair : R.MaxLocalUsers) {
5240     pair.second = std::max(pair.second, 1U);
5241   }
5242 
5243   // We calculate the interleave count using the following formula.
5244   // Subtract the number of loop invariants from the number of available
5245   // registers. These registers are used by all of the interleaved instances.
5246   // Next, divide the remaining registers by the number of registers that is
5247   // required by the loop, in order to estimate how many parallel instances
5248   // fit without causing spills. All of this is rounded down if necessary to be
5249   // a power of two. We want power of two interleave count to simplify any
5250   // addressing operations or alignment considerations.
5251   // We also want power of two interleave counts to ensure that the induction
5252   // variable of the vector loop wraps to zero, when tail is folded by masking;
5253   // this currently happens when OptForSize, in which case IC is set to 1 above.
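  // For example, with 32 available registers, 2 loop-invariant values and 5
  // local users in a register class, the estimate below for that class is
  // PowerOf2Floor((32 - 2) / 5) = PowerOf2Floor(6) = 4 (the induction-variable
  // heuristic then refines this).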
5254   unsigned IC = UINT_MAX;
5255 
5256   for (auto& pair : R.MaxLocalUsers) {
5257     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5258     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5259                       << " registers of "
5260                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5261     if (VF == 1) {
5262       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5263         TargetNumRegisters = ForceTargetNumScalarRegs;
5264     } else {
5265       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5266         TargetNumRegisters = ForceTargetNumVectorRegs;
5267     }
5268     unsigned MaxLocalUsers = pair.second;
5269     unsigned LoopInvariantRegs = 0;
5270     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5271       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5272 
5273     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5274     // Don't count the induction variable as interleaved.
5275     if (EnableIndVarRegisterHeur) {
5276       TmpIC =
5277           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5278                         std::max(1U, (MaxLocalUsers - 1)));
5279     }
5280 
5281     IC = std::min(IC, TmpIC);
5282   }
5283 
5284   // Clamp the interleave ranges to reasonable counts.
5285   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5286 
5287   // Check if the user has overridden the max.
5288   if (VF == 1) {
5289     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5290       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5291   } else {
5292     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5293       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5294   }
5295 
5296   // If the trip count is a known or estimated compile-time constant, limit
5297   // the interleave count so it does not exceed the trip count divided by VF.
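  // For example, a best known trip count of 12 at VF = 4 limits the interleave
  // count to at most 3.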
5298   if (BestKnownTC) {
5299     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5300   }
5301 
5302   // If we did not calculate the cost for VF (because the user selected the VF)
5303   // then we calculate the cost of VF here.
5304   if (LoopCost == 0)
5305     LoopCost = expectedCost(VF).first;
5306 
5307   assert(LoopCost && "Non-zero loop cost expected");
5308 
5309   // Clamp the calculated IC to be between 1 and the max interleave count
5310   // that the target and trip count allow.
5311   if (IC > MaxInterleaveCount)
5312     IC = MaxInterleaveCount;
5313   else if (IC < 1)
5314     IC = 1;
5315 
5316   // Interleave if we vectorized this loop and there is a reduction that could
5317   // benefit from interleaving.
5318   if (VF > 1 && !Legal->getReductionVars().empty()) {
5319     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5320     return IC;
5321   }
5322 
5323   // Note that if we've already vectorized the loop we will have done the
5324   // runtime check and so interleaving won't require further checks.
5325   bool InterleavingRequiresRuntimePointerCheck =
5326       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5327 
5328   // We want to interleave small loops in order to reduce the loop overhead and
5329   // potentially expose ILP opportunities.
5330   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5331   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5332     // We assume that the cost overhead is 1 and we use the cost model
5333     // to estimate the cost of the loop and interleave until the cost of the
5334     // loop overhead is about 5% of the cost of the loop.
5335     unsigned SmallIC =
5336         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
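    // For example, if SmallLoopCost is, say, 20 and the loop cost is 6, this
    // limits SmallIC to PowerOf2Floor(20 / 6) = 2.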
5337 
5338     // Interleave until store/load ports (estimated by max interleave count) are
5339     // saturated.
5340     unsigned NumStores = Legal->getNumStores();
5341     unsigned NumLoads = Legal->getNumLoads();
5342     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5343     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5344 
5345     // If we have a scalar reduction (vector reductions are already dealt with
5346     // by this point), we can increase the critical path length if the loop
5347     // we're interleaving is inside another loop. Limit, by default, to 2, so
5348     // the critical path only gets increased by one reduction operation.
5349     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5350       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5351       SmallIC = std::min(SmallIC, F);
5352       StoresIC = std::min(StoresIC, F);
5353       LoadsIC = std::min(LoadsIC, F);
5354     }
5355 
5356     if (EnableLoadStoreRuntimeInterleave &&
5357         std::max(StoresIC, LoadsIC) > SmallIC) {
5358       LLVM_DEBUG(
5359           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5360       return std::max(StoresIC, LoadsIC);
5361     }
5362 
5363     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5364     return SmallIC;
5365   }
5366 
5367   // Interleave if this is a large loop (small loops are already dealt with by
5368   // this point) that could benefit from interleaving.
5369   bool HasReductions = !Legal->getReductionVars().empty();
5370   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5371     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5372     return IC;
5373   }
5374 
5375   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5376   return 1;
5377 }
5378 
5379 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5380 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5381   // This function calculates the register usage by measuring the highest number
5382   // of values that are alive at a single location. Obviously, this is a very
5383   // rough estimation. We scan the loop in topological order and
5384   // assign a number to each instruction. We use RPO to ensure that defs are
5385   // met before their users. We assume that each instruction that has in-loop
5386   // users starts an interval. We record every time that an in-loop value is
5387   // used, so we have a list of the first and last occurrences of each
5388   // instruction. Next, we transpose this data structure into a multi map that
5389   // holds the list of intervals that *end* at a specific location. This multi
5390   // map allows us to perform a linear search. We scan the instructions linearly
5391   // and record each time that a new interval starts, by placing it in a set.
5392   // If we find this value in the multi-map then we remove it from the set.
5393   // The max register usage is the maximum size of the set.
5394   // We also search for instructions that are defined outside the loop, but are
5395   // used inside the loop. We need this number separately from the max-interval
5396   // usage number because when we unroll, loop-invariant values do not take
5397   // more registers.
5398   LoopBlocksDFS DFS(TheLoop);
5399   DFS.perform(LI);
5400 
5401   RegisterUsage RU;
5402 
5403   // Each 'key' in the map opens a new interval. The values
5404   // of the map are the index of the 'last seen' usage of the
5405   // instruction that is the key.
5406   using IntervalMap = DenseMap<Instruction *, unsigned>;
5407 
5408   // Maps instruction to its index.
5409   SmallVector<Instruction *, 64> IdxToInstr;
5410   // Marks the end of each interval.
5411   IntervalMap EndPoint;
5412   // Saves the set of instructions that are used in the loop.
5413   SmallPtrSet<Instruction *, 8> Ends;
5414   // Saves the list of values that are used in the loop but are
5415   // defined outside the loop, such as arguments and constants.
5416   SmallPtrSet<Value *, 8> LoopInvariants;
5417 
5418   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5419     for (Instruction &I : BB->instructionsWithoutDebug()) {
5420       IdxToInstr.push_back(&I);
5421 
5422       // Save the end location of each USE.
5423       for (Value *U : I.operands()) {
5424         auto *Instr = dyn_cast<Instruction>(U);
5425 
5426         // Ignore non-instruction values such as arguments, constants, etc.
5427         if (!Instr)
5428           continue;
5429 
5430         // If this instruction is outside the loop then record it and continue.
5431         if (!TheLoop->contains(Instr)) {
5432           LoopInvariants.insert(Instr);
5433           continue;
5434         }
5435 
5436         // Overwrite previous end points.
5437         EndPoint[Instr] = IdxToInstr.size();
5438         Ends.insert(Instr);
5439       }
5440     }
5441   }
5442 
5443   // Saves the list of intervals that end with the index in 'key'.
5444   using InstrList = SmallVector<Instruction *, 2>;
5445   DenseMap<unsigned, InstrList> TransposeEnds;
5446 
5447   // Transpose the EndPoints to a list of values that end at each index.
5448   for (auto &Interval : EndPoint)
5449     TransposeEnds[Interval.second].push_back(Interval.first);
5450 
5451   SmallPtrSet<Instruction *, 8> OpenIntervals;
5452 
5453   // Get the size of the widest register.
5454   unsigned MaxSafeDepDist = -1U;
5455   if (Legal->getMaxSafeDepDistBytes() != -1U)
5456     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5457   unsigned WidestRegister =
5458       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5459   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5460 
5461   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5462   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5463 
5464   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5465 
5466   // A lambda that gets the register usage for the given type and VF.
5467   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5468     if (Ty->isTokenTy())
5469       return 0U;
5470     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5471     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5472   };
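  // For example, a 32-bit element type at VF = 8 on a 128-bit wide register
  // needs max(1, 8 * 32 / 128) = 2 registers.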
5473 
5474   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5475     Instruction *I = IdxToInstr[i];
5476 
5477     // Remove all of the instructions that end at this location.
5478     InstrList &List = TransposeEnds[i];
5479     for (Instruction *ToRemove : List)
5480       OpenIntervals.erase(ToRemove);
5481 
5482     // Ignore instructions that are never used within the loop.
5483     if (Ends.find(I) == Ends.end())
5484       continue;
5485 
5486     // Skip ignored values.
5487     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5488       continue;
5489 
5490     // For each VF find the maximum usage of registers.
5491     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5492       // Count the number of live intervals.
5493       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5494 
5495       if (VFs[j] == 1) {
5496         for (auto Inst : OpenIntervals) {
5497           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5498           if (RegUsage.find(ClassID) == RegUsage.end())
5499             RegUsage[ClassID] = 1;
5500           else
5501             RegUsage[ClassID] += 1;
5502         }
5503       } else {
5504         collectUniformsAndScalars(VFs[j]);
5505         for (auto Inst : OpenIntervals) {
5506           // Skip ignored values for VF > 1.
5507           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5508             continue;
5509           if (isScalarAfterVectorization(Inst, VFs[j])) {
5510             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5511             if (RegUsage.find(ClassID) == RegUsage.end())
5512               RegUsage[ClassID] = 1;
5513             else
5514               RegUsage[ClassID] += 1;
5515           } else {
5516             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5517             if (RegUsage.find(ClassID) == RegUsage.end())
5518               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5519             else
5520               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5521           }
5522         }
5523       }
5524 
5525       for (auto& pair : RegUsage) {
5526         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5527           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5528         else
5529           MaxUsages[j][pair.first] = pair.second;
5530       }
5531     }
5532 
5533     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5534                       << OpenIntervals.size() << '\n');
5535 
5536     // Add the current instruction to the list of open intervals.
5537     OpenIntervals.insert(I);
5538   }
5539 
5540   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5541     SmallMapVector<unsigned, unsigned, 4> Invariant;
5542 
5543     for (auto Inst : LoopInvariants) {
5544       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5545       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5546       if (Invariant.find(ClassID) == Invariant.end())
5547         Invariant[ClassID] = Usage;
5548       else
5549         Invariant[ClassID] += Usage;
5550     }
5551 
5552     LLVM_DEBUG({
5553       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5554       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5555              << " item\n";
5556       for (const auto &pair : MaxUsages[i]) {
5557         dbgs() << "LV(REG): RegisterClass: "
5558                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5559                << " registers\n";
5560       }
5561       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5562              << " item\n";
5563       for (const auto &pair : Invariant) {
5564         dbgs() << "LV(REG): RegisterClass: "
5565                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5566                << " registers\n";
5567       }
5568     });
5569 
5570     RU.LoopInvariantRegs = Invariant;
5571     RU.MaxLocalUsers = MaxUsages[i];
5572     RUs[i] = RU;
5573   }
5574 
5575   return RUs;
5576 }
5577 
5578 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5579   // TODO: Cost model for emulated masked load/store is completely
5580   // broken. This hack guides the cost model to use an artificially
5581   // high enough value to practically disable vectorization with such
5582   // operations, except where previously deployed legality hack allowed
5583   // using very low cost values. This is to avoid regressions coming simply
5584   // from moving "masked load/store" check from legality to cost model.
5585   // Masked Load/Gather emulation was previously never allowed.
5586   // Only a limited number of Masked Store/Scatter emulations were allowed.
5587   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5588   return isa<LoadInst>(I) ||
5589          (isa<StoreInst>(I) &&
5590           NumPredStores > NumberOfStoresToPredicate);
5591 }
5592 
5593 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5594   // If we aren't vectorizing the loop, or if we've already collected the
5595   // instructions to scalarize, there's nothing to do. Collection may already
5596   // have occurred if we have a user-selected VF and are now computing the
5597   // expected cost for interleaving.
5598   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5599     return;
5600 
5601   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5602   // not profitable to scalarize any instructions, the presence of VF in the
5603   // map will indicate that we've analyzed it already.
5604   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5605 
5606   // Find all the instructions that are scalar with predication in the loop and
5607   // determine if it would be better to not if-convert the blocks they are in.
5608   // If so, we also record the instructions to scalarize.
5609   for (BasicBlock *BB : TheLoop->blocks()) {
5610     if (!blockNeedsPredication(BB))
5611       continue;
5612     for (Instruction &I : *BB)
5613       if (isScalarWithPredication(&I)) {
5614         ScalarCostsTy ScalarCosts;
5615         // Do not apply discount logic if hacked cost is needed
5616         // for emulated masked memrefs.
5617         if (!useEmulatedMaskMemRefHack(&I) &&
5618             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5619           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5620         // Remember that BB will remain after vectorization.
5621         PredicatedBBsAfterVectorization.insert(BB);
5622       }
5623   }
5624 }
5625 
5626 int LoopVectorizationCostModel::computePredInstDiscount(
5627     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5628     unsigned VF) {
5629   assert(!isUniformAfterVectorization(PredInst, VF) &&
5630          "Instruction marked uniform-after-vectorization will be predicated");
5631 
5632   // Initialize the discount to zero, meaning that the scalar version and the
5633   // vector version cost the same.
5634   int Discount = 0;
5635 
5636   // Holds instructions to analyze. The instructions we visit are mapped in
5637   // ScalarCosts. Those instructions are the ones that would be scalarized if
5638   // we find that the scalar version costs less.
5639   SmallVector<Instruction *, 8> Worklist;
5640 
5641   // Returns true if the given instruction can be scalarized.
5642   auto canBeScalarized = [&](Instruction *I) -> bool {
5643     // We only attempt to scalarize instructions forming a single-use chain
5644     // from the original predicated block that would otherwise be vectorized.
5645     // Although not strictly necessary, we give up on instructions we know will
5646     // already be scalar to avoid traversing chains that are unlikely to be
5647     // beneficial.
5648     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5649         isScalarAfterVectorization(I, VF))
5650       return false;
5651 
5652     // If the instruction is scalar with predication, it will be analyzed
5653     // separately. We ignore it within the context of PredInst.
5654     if (isScalarWithPredication(I))
5655       return false;
5656 
5657     // If any of the instruction's operands are uniform after vectorization,
5658     // the instruction cannot be scalarized. This prevents, for example, a
5659     // masked load from being scalarized.
5660     //
5661     // We assume we will only emit a value for lane zero of an instruction
5662     // marked uniform after vectorization, rather than VF identical values.
5663     // Thus, if we scalarize an instruction that uses a uniform, we would
5664     // create uses of values corresponding to the lanes we aren't emitting code
5665     // for. This behavior can be changed by allowing getScalarValue to clone
5666     // the lane zero values for uniforms rather than asserting.
5667     for (Use &U : I->operands())
5668       if (auto *J = dyn_cast<Instruction>(U.get()))
5669         if (isUniformAfterVectorization(J, VF))
5670           return false;
5671 
5672     // Otherwise, we can scalarize the instruction.
5673     return true;
5674   };
5675 
5676   // Compute the expected cost discount from scalarizing the entire expression
5677   // feeding the predicated instruction. We currently only consider expressions
5678   // that are single-use instruction chains.
5679   Worklist.push_back(PredInst);
5680   while (!Worklist.empty()) {
5681     Instruction *I = Worklist.pop_back_val();
5682 
5683     // If we've already analyzed the instruction, there's nothing to do.
5684     if (ScalarCosts.find(I) != ScalarCosts.end())
5685       continue;
5686 
5687     // Compute the cost of the vector instruction. Note that this cost already
5688     // includes the scalarization overhead of the predicated instruction.
5689     unsigned VectorCost = getInstructionCost(I, VF).first;
5690 
5691     // Compute the cost of the scalarized instruction. This cost is the cost of
5692     // the instruction as if it wasn't if-converted and instead remained in the
5693     // predicated block. We will scale this cost by block probability after
5694     // computing the scalarization overhead.
5695     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5696 
5697     // Compute the scalarization overhead of needed insertelement instructions
5698     // and phi nodes.
5699     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5700       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5701                                                  true, false);
5702       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5703     }
5704 
5705     // Compute the scalarization overhead of needed extractelement
5706     // instructions. For each of the instruction's operands, if the operand can
5707     // be scalarized, add it to the worklist; otherwise, account for the
5708     // overhead.
5709     for (Use &U : I->operands())
5710       if (auto *J = dyn_cast<Instruction>(U.get())) {
5711         assert(VectorType::isValidElementType(J->getType()) &&
5712                "Instruction has non-scalar type");
5713         if (canBeScalarized(J))
5714           Worklist.push_back(J);
5715         else if (needsExtract(J, VF))
5716           ScalarCost += TTI.getScalarizationOverhead(
5717                               ToVectorTy(J->getType(),VF), false, true);
5718       }
5719 
5720     // Scale the total scalar cost by block probability.
5721     ScalarCost /= getReciprocalPredBlockProb();
5722 
5723     // Compute the discount. A non-negative discount means the vector version
5724     // of the instruction costs more, and scalarizing would be beneficial.
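    // For example, a vector cost of 12 against a probability-scaled scalar
    // cost of 8 adds 4 to the discount, in favor of scalarization.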
5725     Discount += VectorCost - ScalarCost;
5726     ScalarCosts[I] = ScalarCost;
5727   }
5728 
5729   return Discount;
5730 }
5731 
5732 LoopVectorizationCostModel::VectorizationCostTy
5733 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5734   VectorizationCostTy Cost;
5735 
5736   // For each block.
5737   for (BasicBlock *BB : TheLoop->blocks()) {
5738     VectorizationCostTy BlockCost;
5739 
5740     // For each instruction in the old loop.
5741     for (Instruction &I : BB->instructionsWithoutDebug()) {
5742       // Skip ignored values.
5743       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5744           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5745         continue;
5746 
5747       VectorizationCostTy C = getInstructionCost(&I, VF);
5748 
5749       // Check if we should override the cost.
5750       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5751         C.first = ForceTargetInstructionCost;
5752 
5753       BlockCost.first += C.first;
5754       BlockCost.second |= C.second;
5755       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5756                         << " for VF " << VF << " For instruction: " << I
5757                         << '\n');
5758     }
5759 
5760     // If we are vectorizing a predicated block, it will have been
5761     // if-converted. This means that the block's instructions (aside from
5762     // stores and instructions that may divide by zero) will now be
5763     // unconditionally executed. For the scalar case, we may not always execute
5764     // the predicated block. Thus, scale the block's cost by the probability of
5765     // executing it.
5766     if (VF == 1 && blockNeedsPredication(BB))
5767       BlockCost.first /= getReciprocalPredBlockProb();
5768 
5769     Cost.first += BlockCost.first;
5770     Cost.second |= BlockCost.second;
5771   }
5772 
5773   return Cost;
5774 }
5775 
5776 /// Gets Address Access SCEV after verifying that the access pattern
5777 /// is loop invariant except for the induction variable dependence.
5778 ///
5779 /// This SCEV can be sent to the Target in order to estimate the address
5780 /// calculation cost.
5781 static const SCEV *getAddressAccessSCEV(
5782               Value *Ptr,
5783               LoopVectorizationLegality *Legal,
5784               PredicatedScalarEvolution &PSE,
5785               const Loop *TheLoop) {
5786 
5787   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5788   if (!Gep)
5789     return nullptr;
5790 
5791   // We are looking for a gep with all loop invariant indices except for one
5792   // which should be an induction variable.
5793   auto SE = PSE.getSE();
5794   unsigned NumOperands = Gep->getNumOperands();
5795   for (unsigned i = 1; i < NumOperands; ++i) {
5796     Value *Opd = Gep->getOperand(i);
5797     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5798         !Legal->isInductionVariable(Opd))
5799       return nullptr;
5800   }
5801 
5802   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5803   return PSE.getSCEV(Ptr);
5804 }
5805 
5806 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5807   return Legal->hasStride(I->getOperand(0)) ||
5808          Legal->hasStride(I->getOperand(1));
5809 }
5810 
5811 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5812                                                                  unsigned VF) {
5813   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5814   Type *ValTy = getMemInstValueType(I);
5815   auto SE = PSE.getSE();
5816 
5817   unsigned AS = getLoadStoreAddressSpace(I);
5818   Value *Ptr = getLoadStorePointerOperand(I);
5819   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5820 
5821   // Figure out whether the access is strided and get the stride value
5822   // if it's known at compile time.
5823   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5824 
5825   // Get the cost of the scalar memory instruction and address computation.
5826   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5827 
5828   // Don't pass *I here, since it is scalar but will actually be part of a
5829   // vectorized loop where the user of it is a vectorized instruction.
5830   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5831   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5832                                    Alignment, AS);
5833 
5834   // Get the overhead of the extractelement and insertelement instructions
5835   // we might create due to scalarization.
5836   Cost += getScalarizationOverhead(I, VF);
5837 
5838   // If we have a predicated store, it may not be executed for each vector
5839   // lane. Scale the cost by the probability of executing the predicated
5840   // block.
5841   if (isPredicatedInst(I)) {
5842     Cost /= getReciprocalPredBlockProb();
5843 
5844     if (useEmulatedMaskMemRefHack(I))
5845       // Artificially setting to a high enough value to practically disable
5846       // vectorization with such operations.
5847       Cost = 3000000;
5848   }
5849 
5850   return Cost;
5851 }
5852 
5853 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5854                                                              unsigned VF) {
5855   Type *ValTy = getMemInstValueType(I);
5856   Type *VectorTy = ToVectorTy(ValTy, VF);
5857   Value *Ptr = getLoadStorePointerOperand(I);
5858   unsigned AS = getLoadStoreAddressSpace(I);
5859   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5860 
5861   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5862          "Stride should be 1 or -1 for consecutive memory access");
5863   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5864   unsigned Cost = 0;
5865   if (Legal->isMaskRequired(I))
5866     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5867                                       Alignment ? Alignment->value() : 0, AS);
5868   else
5869     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5870 
5871   bool Reverse = ConsecutiveStride < 0;
5872   if (Reverse)
5873     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5874   return Cost;
5875 }
5876 
5877 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5878                                                          unsigned VF) {
5879   Type *ValTy = getMemInstValueType(I);
5880   Type *VectorTy = ToVectorTy(ValTy, VF);
5881   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5882   unsigned AS = getLoadStoreAddressSpace(I);
5883   if (isa<LoadInst>(I)) {
5884     return TTI.getAddressComputationCost(ValTy) +
5885            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5886            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5887   }
5888   StoreInst *SI = cast<StoreInst>(I);
5889 
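  // A store of a loop-varying value to a uniform address only needs the value
  // from the last vector lane, hence the extract of element VF - 1 below.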
5890   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5891   return TTI.getAddressComputationCost(ValTy) +
5892          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5893          (isLoopInvariantStoreValue
5894               ? 0
5895               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5896                                        VF - 1));
5897 }
5898 
5899 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5900                                                           unsigned VF) {
5901   Type *ValTy = getMemInstValueType(I);
5902   Type *VectorTy = ToVectorTy(ValTy, VF);
5903   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5904   Value *Ptr = getLoadStorePointerOperand(I);
5905 
5906   return TTI.getAddressComputationCost(VectorTy) +
5907          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5908                                     Legal->isMaskRequired(I),
5909                                     Alignment ? Alignment->value() : 0, I);
5910 }
5911 
5912 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5913                                                             unsigned VF) {
5914   Type *ValTy = getMemInstValueType(I);
5915   Type *VectorTy = ToVectorTy(ValTy, VF);
5916   unsigned AS = getLoadStoreAddressSpace(I);
5917 
5918   auto Group = getInterleavedAccessGroup(I);
5919   assert(Group && "Fail to get an interleaved access group.");
5920 
5921   unsigned InterleaveFactor = Group->getFactor();
5922   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5923 
5924   // Holds the indices of existing members in an interleaved load group.
5925   // An interleaved store group doesn't need this as it doesn't allow gaps.
5926   SmallVector<unsigned, 4> Indices;
5927   if (isa<LoadInst>(I)) {
5928     for (unsigned i = 0; i < InterleaveFactor; i++)
5929       if (Group->getMember(i))
5930         Indices.push_back(i);
5931   }
5932 
5933   // Calculate the cost of the whole interleaved group.
5934   bool UseMaskForGaps =
5935       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5936   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5937       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5938       Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5939 
5940   if (Group->isReverse()) {
5941     // TODO: Add support for reversed masked interleaved access.
5942     assert(!Legal->isMaskRequired(I) &&
5943            "Reverse masked interleaved access not supported.");
5944     Cost += Group->getNumMembers() *
5945             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5946   }
5947   return Cost;
5948 }
5949 
5950 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5951                                                               unsigned VF) {
5952   // Calculate scalar cost only. Vectorization cost should be ready at this
5953   // moment.
5954   if (VF == 1) {
5955     Type *ValTy = getMemInstValueType(I);
5956     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5957     unsigned AS = getLoadStoreAddressSpace(I);
5958 
5959     return TTI.getAddressComputationCost(ValTy) +
5960            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5961   }
5962   return getWideningCost(I, VF);
5963 }
5964 
5965 LoopVectorizationCostModel::VectorizationCostTy
5966 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5967   // If we know that this instruction will remain uniform, check the cost of
5968   // the scalar version.
5969   if (isUniformAfterVectorization(I, VF))
5970     VF = 1;
5971 
5972   if (VF > 1 && isProfitableToScalarize(I, VF))
5973     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5974 
5975   // Forced scalars do not have any scalarization overhead.
5976   auto ForcedScalar = ForcedScalars.find(VF);
5977   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5978     auto InstSet = ForcedScalar->second;
5979     if (InstSet.find(I) != InstSet.end())
5980       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5981   }
5982 
5983   Type *VectorTy;
5984   unsigned C = getInstructionCost(I, VF, VectorTy);
5985 
5986   bool TypeNotScalarized =
5987       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5988   return VectorizationCostTy(C, TypeNotScalarized);
5989 }
5990 
5991 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5992                                                               unsigned VF) {
5993 
5994   if (VF == 1)
5995     return 0;
5996 
5997   unsigned Cost = 0;
5998   Type *RetTy = ToVectorTy(I->getType(), VF);
5999   if (!RetTy->isVoidTy() &&
6000       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6001     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
6002 
6003   // Some targets keep addresses scalar.
6004   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6005     return Cost;
6006 
6007   // Some targets support efficient element stores.
6008   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6009     return Cost;
6010 
6011   // Collect operands to consider.
6012   CallInst *CI = dyn_cast<CallInst>(I);
6013   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6014 
6015   // Skip operands that do not require extraction/scalarization and do not incur
6016   // any overhead.
6017   return Cost + TTI.getOperandsScalarizationOverhead(
6018                     filterExtractingOperands(Ops, VF), VF);
6019 }
6020 
6021 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6022   if (VF == 1)
6023     return;
6024   NumPredStores = 0;
6025   for (BasicBlock *BB : TheLoop->blocks()) {
6026     // For each instruction in the old loop.
6027     for (Instruction &I : *BB) {
6028       Value *Ptr = getLoadStorePointerOperand(&I);
6029       if (!Ptr)
6030         continue;
6031 
6032       // TODO: We should generate better code and update the cost model for
6033       // predicated uniform stores. Today they are treated as any other
6034       // predicated store (see added test cases in
6035       // invariant-store-vectorization.ll).
6036       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6037         NumPredStores++;
6038 
6039       if (Legal->isUniform(Ptr) &&
6040           // Conditional loads and stores should be scalarized and predicated.
6041           // isScalarWithPredication cannot be used here since masked
6042           // gather/scatters are not considered scalar with predication.
6043           !Legal->blockNeedsPredication(I.getParent())) {
6044         // TODO: Avoid replicating loads and stores instead of
6045         // relying on instcombine to remove them.
6046         // Load: Scalar load + broadcast
6047         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6048         unsigned Cost = getUniformMemOpCost(&I, VF);
6049         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6050         continue;
6051       }
6052 
6053       // We assume that widening is the best solution when possible.
6054       if (memoryInstructionCanBeWidened(&I, VF)) {
6055         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6056         int ConsecutiveStride =
6057                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6058         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6059                "Expected consecutive stride.");
6060         InstWidening Decision =
6061             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6062         setWideningDecision(&I, VF, Decision, Cost);
6063         continue;
6064       }
6065 
6066       // Choose between Interleaving, Gather/Scatter or Scalarization.
6067       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6068       unsigned NumAccesses = 1;
6069       if (isAccessInterleaved(&I)) {
6070         auto Group = getInterleavedAccessGroup(&I);
6071         assert(Group && "Fail to get an interleaved access group.");
6072 
6073         // Make one decision for the whole group.
6074         if (getWideningDecision(&I, VF) != CM_Unknown)
6075           continue;
6076 
6077         NumAccesses = Group->getNumMembers();
6078         if (interleavedAccessCanBeWidened(&I, VF))
6079           InterleaveCost = getInterleaveGroupCost(&I, VF);
6080       }
6081 
6082       unsigned GatherScatterCost =
6083           isLegalGatherOrScatter(&I)
6084               ? getGatherScatterCost(&I, VF) * NumAccesses
6085               : std::numeric_limits<unsigned>::max();
6086 
6087       unsigned ScalarizationCost =
6088           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6089 
6090       // Choose better solution for the current VF,
6091       // write down this decision and use it during vectorization.
6092       unsigned Cost;
6093       InstWidening Decision;
6094       if (InterleaveCost <= GatherScatterCost &&
6095           InterleaveCost < ScalarizationCost) {
6096         Decision = CM_Interleave;
6097         Cost = InterleaveCost;
6098       } else if (GatherScatterCost < ScalarizationCost) {
6099         Decision = CM_GatherScatter;
6100         Cost = GatherScatterCost;
6101       } else {
6102         Decision = CM_Scalarize;
6103         Cost = ScalarizationCost;
6104       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost is recorded on a single member of the group.
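      // For example, loads of A[2*i] and A[2*i+1] may form a group with factor
      // 2; if interleaving wins, the group's insert position carries the cost
      // of the single wide load plus the shuffles for both members.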
6108       if (auto Group = getInterleavedAccessGroup(&I))
6109         setWideningDecision(Group, VF, Decision, Cost);
6110       else
6111         setWideningDecision(&I, VF, Decision, Cost);
6112     }
6113   }
6114 
  // Make sure that any load of an address, and any other address computation,
  // remains scalar unless there is gather/scatter support. This avoids
  // otherwise-inevitable extracts into address registers, and also has the
  // benefit of enabling LSR more often, since that pass can't optimize
  // vectorized addresses.
6120   if (TTI.prefersVectorizedAddressing())
6121     return;
6122 
6123   // Start with all scalar pointer uses.
6124   SmallPtrSet<Instruction *, 8> AddrDefs;
6125   for (BasicBlock *BB : TheLoop->blocks())
6126     for (Instruction &I : *BB) {
6127       Instruction *PtrDef =
6128         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6129       if (PtrDef && TheLoop->contains(PtrDef) &&
6130           getWideningDecision(&I, VF) != CM_GatherScatter)
6131         AddrDefs.insert(PtrDef);
6132     }
6133 
6134   // Add all instructions used to generate the addresses.
6135   SmallVector<Instruction *, 4> Worklist;
6136   for (auto *I : AddrDefs)
6137     Worklist.push_back(I);
6138   while (!Worklist.empty()) {
6139     Instruction *I = Worklist.pop_back_val();
6140     for (auto &Op : I->operands())
6141       if (auto *InstOp = dyn_cast<Instruction>(Op))
6142         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6143             AddrDefs.insert(InstOp).second)
6144           Worklist.push_back(InstOp);
6145   }
6146 
6147   for (auto *I : AddrDefs) {
6148     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by the
      // cost functions, but since that would require finding out whether the
      // loaded value is involved in an address computation, the decision is
      // instead changed here, where this is known.
6153       InstWidening Decision = getWideningDecision(I, VF);
6154       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6155         // Scalarize a widened load of address.
6156         setWideningDecision(I, VF, CM_Scalarize,
6157                             (VF * getMemoryInstructionCost(I, 1)));
6158       else if (auto Group = getInterleavedAccessGroup(I)) {
6159         // Scalarize an interleave group of address loads.
6160         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6161           if (Instruction *Member = Group->getMember(I))
6162             setWideningDecision(Member, VF, CM_Scalarize,
6163                                 (VF * getMemoryInstructionCost(Member, 1)));
6164         }
6165       }
6166     } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
6169       ForcedScalars[VF].insert(I);
6170   }
6171 }
6172 
6173 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6174                                                         unsigned VF,
6175                                                         Type *&VectorTy) {
6176   Type *RetTy = I->getType();
6177   if (canTruncateToMinimalBitwidth(I, VF))
6178     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6179   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6180   auto SE = PSE.getSE();
6181 
6182   // TODO: We need to estimate the cost of intrinsic calls.
6183   switch (I->getOpcode()) {
6184   case Instruction::GetElementPtr:
6185     // We mark this instruction as zero-cost because the cost of GEPs in
6186     // vectorized code depends on whether the corresponding memory instruction
6187     // is scalarized or not. Therefore, we handle GEPs with the memory
6188     // instruction cost.
6189     return 0;
6190   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its i1 vector-compare element.
6194     bool ScalarPredicatedBB = false;
6195     BranchInst *BI = cast<BranchInst>(I);
6196     if (VF > 1 && BI->isConditional() &&
6197         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6198              PredicatedBBsAfterVectorization.end() ||
6199          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6200              PredicatedBBsAfterVectorization.end()))
6201       ScalarPredicatedBB = true;
6202 
6203     if (ScalarPredicatedBB) {
6204       // Return cost for branches around scalarized and predicated blocks.
6205       Type *Vec_i1Ty =
6206           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6207       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6208               (TTI.getCFInstrCost(Instruction::Br) * VF));
6209     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6210       // The back-edge branch will remain, as will all scalar branches.
6211       return TTI.getCFInstrCost(Instruction::Br);
6212     else
6213       // This branch will be eliminated by if-conversion.
6214       return 0;
6215     // Note: We currently assume zero cost for an unconditional branch inside
6216     // a predicated block since it will become a fall-through, although we
6217     // may decide in the future to call TTI for all branches.
6218   }
6219   case Instruction::PHI: {
6220     auto *Phi = cast<PHINode>(I);
6221 
6222     // First-order recurrences are replaced by vector shuffles inside the loop.
6223     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6224     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6225       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6226                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6227 
6228     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6229     // converted into select instructions. We require N - 1 selects per phi
6230     // node, where N is the number of incoming values.
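    // For example, a phi merging three incoming values is lowered to two
    // chained vector selects, so its cost is 2 * (cost of one vector select).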
6231     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6232       return (Phi->getNumIncomingValues() - 1) *
6233              TTI.getCmpSelInstrCost(
6234                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6235                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6236 
6237     return TTI.getCFInstrCost(Instruction::PHI);
6238   }
6239   case Instruction::UDiv:
6240   case Instruction::SDiv:
6241   case Instruction::URem:
6242   case Instruction::SRem:
6243     // If we have a predicated instruction, it may not be executed for each
6244     // vector lane. Get the scalarization cost and scale this amount by the
6245     // probability of executing the predicated block. If the instruction is not
6246     // predicated, we fall through to the next case.
6247     if (VF > 1 && isScalarWithPredication(I)) {
6248       unsigned Cost = 0;
6249 
6250       // These instructions have a non-void type, so account for the phi nodes
6251       // that we will create. This cost is likely to be zero. The phi node
6252       // cost, if any, should be scaled by the block probability because it
6253       // models a copy at the end of each predicated block.
6254       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6255 
6256       // The cost of the non-predicated instruction.
6257       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6258 
6259       // The cost of insertelement and extractelement instructions needed for
6260       // scalarization.
6261       Cost += getScalarizationOverhead(I, VF);
6262 
6263       // Scale the cost by the probability of executing the predicated blocks.
6264       // This assumes the predicated block for each vector lane is equally
6265       // likely.
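      // For example (a rough sketch): assuming the predicated block executes
      // for half of the lanes on average, getReciprocalPredBlockProb() is 2
      // and the scalarized cost computed above is halved.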
6266       return Cost / getReciprocalPredBlockProb();
6267     }
6268     LLVM_FALLTHROUGH;
6269   case Instruction::Add:
6270   case Instruction::FAdd:
6271   case Instruction::Sub:
6272   case Instruction::FSub:
6273   case Instruction::Mul:
6274   case Instruction::FMul:
6275   case Instruction::FDiv:
6276   case Instruction::FRem:
6277   case Instruction::Shl:
6278   case Instruction::LShr:
6279   case Instruction::AShr:
6280   case Instruction::And:
6281   case Instruction::Or:
6282   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
6284     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6285       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
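    // E.g., on some targets a shift by a uniform or constant amount, such as
    // "shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>", can be cheaper than a
    // shift by a per-lane variable amount.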
6288     Value *Op2 = I->getOperand(1);
6289     TargetTransformInfo::OperandValueProperties Op2VP;
6290     TargetTransformInfo::OperandValueKind Op2VK =
6291         TTI.getOperandInfo(Op2, Op2VP);
6292     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6293       Op2VK = TargetTransformInfo::OK_UniformValue;
6294 
6295     SmallVector<const Value *, 4> Operands(I->operand_values());
6296     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6297     return N * TTI.getArithmeticInstrCost(
6298                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6299                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6300   }
6301   case Instruction::FNeg: {
6302     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6303     return N * TTI.getArithmeticInstrCost(
6304                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6305                    TargetTransformInfo::OK_AnyValue,
6306                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6307                    I->getOperand(0), I);
6308   }
6309   case Instruction::Select: {
6310     SelectInst *SI = cast<SelectInst>(I);
6311     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6312     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6313     Type *CondTy = SI->getCondition()->getType();
6314     if (!ScalarCond)
6315       CondTy = VectorType::get(CondTy, VF);
6316 
6317     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6318   }
6319   case Instruction::ICmp:
6320   case Instruction::FCmp: {
6321     Type *ValTy = I->getOperand(0)->getType();
6322     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6323     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6324       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6325     VectorTy = ToVectorTy(ValTy, VF);
6326     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6327   }
6328   case Instruction::Store:
6329   case Instruction::Load: {
6330     unsigned Width = VF;
6331     if (Width > 1) {
6332       InstWidening Decision = getWideningDecision(I, Width);
6333       assert(Decision != CM_Unknown &&
6334              "CM decision should be taken at this point");
6335       if (Decision == CM_Scalarize)
6336         Width = 1;
6337     }
6338     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6339     return getMemoryInstructionCost(I, VF);
6340   }
6341   case Instruction::ZExt:
6342   case Instruction::SExt:
6343   case Instruction::FPToUI:
6344   case Instruction::FPToSI:
6345   case Instruction::FPExt:
6346   case Instruction::PtrToInt:
6347   case Instruction::IntToPtr:
6348   case Instruction::SIToFP:
6349   case Instruction::UIToFP:
6350   case Instruction::Trunc:
6351   case Instruction::FPTrunc:
6352   case Instruction::BitCast: {
6353     // We optimize the truncation of induction variables having constant
6354     // integer steps. The cost of these truncations is the same as the scalar
6355     // operation.
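    // For example, "trunc i64 %iv to i32" of an induction with a constant step
    // can be generated directly as an i32 induction, so only the cost of the
    // scalar truncation is charged here.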
6356     if (isOptimizableIVTruncate(I, VF)) {
6357       auto *Trunc = cast<TruncInst>(I);
6358       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6359                                   Trunc->getSrcTy(), Trunc);
6360     }
6361 
6362     Type *SrcScalarTy = I->getOperand(0)->getType();
6363     Type *SrcVecTy =
6364         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6365     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or turn it
      // into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6369       //
6370       // Calculate the modified src and dest types.
6371       Type *MinVecTy = VectorTy;
6372       if (I->getOpcode() == Instruction::Trunc) {
6373         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6374         VectorTy =
6375             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6376       } else if (I->getOpcode() == Instruction::ZExt ||
6377                  I->getOpcode() == Instruction::SExt) {
6378         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6379         VectorTy =
6380             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6381       }
6382     }
6383 
6384     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6385     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6386   }
6387   case Instruction::Call: {
6388     bool NeedToScalarize;
6389     CallInst *CI = cast<CallInst>(I);
6390     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6391     if (getVectorIntrinsicIDForCall(CI, TLI))
6392       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6393     return CallCost;
6394   }
6395   default:
6396     // The cost of executing VF copies of the scalar instruction. This opcode
6397     // is unknown. Assume that it is the same as 'mul'.
6398     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6399            getScalarizationOverhead(I, VF);
6400   } // end of switch.
6401 }
6402 
6403 char LoopVectorize::ID = 0;
6404 
6405 static const char lv_name[] = "Loop Vectorization";
6406 
6407 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6408 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6409 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6410 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6411 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6412 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6413 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6414 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6415 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6416 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6417 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6418 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6419 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6420 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6421 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6422 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6423 
6424 namespace llvm {
6425 
6426 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6427 
6428 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6429                               bool VectorizeOnlyWhenForced) {
6430   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6431 }
6432 
6433 } // end namespace llvm
6434 
6435 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6436   // Check if the pointer operand of a load or store instruction is
6437   // consecutive.
6438   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6439     return Legal->isConsecutivePtr(Ptr);
6440   return false;
6441 }
6442 
6443 void LoopVectorizationCostModel::collectValuesToIgnore() {
6444   // Ignore ephemeral values.
6445   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6446 
6447   // Ignore type-promoting instructions we identified during reduction
6448   // detection.
6449   for (auto &Reduction : Legal->getReductionVars()) {
6450     RecurrenceDescriptor &RedDes = Reduction.second;
6451     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6452     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6453   }
6454   // Ignore type-casting instructions we identified during induction
6455   // detection.
6456   for (auto &Induction : Legal->getInductionVars()) {
6457     InductionDescriptor &IndDes = Induction.second;
6458     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6459     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6460   }
6461 }
6462 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do that yet because VPlan currently
// doesn't have a cost model that can choose which plan to execute when
// more than one is generated.
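// For example, with an assumed 256-bit widest vector register and a widest
// element type of 32 bits, this returns a VF of 256 / 32 = 8.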
6468 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6469                                  LoopVectorizationCostModel &CM) {
6470   unsigned WidestType;
6471   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6472   return WidestVectorRegBits / WidestType;
6473 }
6474 
6475 VectorizationFactor
6476 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6477   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
6482   if (!OrigLoop->empty()) {
6483     // If the user doesn't provide a vectorization factor, determine a
6484     // reasonable one.
6485     if (!UserVF) {
6486       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6487       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6488 
6489       // Make sure we have a VF > 1 for stress testing.
6490       if (VPlanBuildStressTest && VF < 2) {
6491         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6492                           << "overriding computed VF.\n");
6493         VF = 4;
6494       }
6495     }
6496     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6497     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6498     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6499                       << " to build VPlans.\n");
6500     buildVPlans(VF, VF);
6501 
6502     // For VPlan build stress testing, we bail out after VPlan construction.
6503     if (VPlanBuildStressTest)
6504       return VectorizationFactor::Disabled();
6505 
6506     return {VF, 0};
6507   }
6508 
6509   LLVM_DEBUG(
6510       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6511                 "VPlan-native path.\n");
6512   return VectorizationFactor::Disabled();
6513 }
6514 
6515 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6516   assert(OrigLoop->empty() && "Inner loop expected.");
6517   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6519     return None;
6520 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
6522   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6523       !useMaskedInterleavedAccesses(*TTI)) {
6524     LLVM_DEBUG(
6525         dbgs()
6526         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6527            "which requires masked-interleaved support.\n");
6528     CM.InterleaveInfo.reset();
6529   }
6530 
6531   if (UserVF) {
6532     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6533     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6534     // Collect the instructions (and their associated costs) that will be more
6535     // profitable to scalarize.
6536     CM.selectUserVectorizationFactor(UserVF);
6537     buildVPlansWithVPRecipes(UserVF, UserVF);
6538     LLVM_DEBUG(printPlans(dbgs()));
6539     return {{UserVF, 0}};
6540   }
6541 
6542   unsigned MaxVF = MaybeMaxVF.getValue();
6543   assert(MaxVF != 0 && "MaxVF is zero.");
6544 
6545   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6546     // Collect Uniform and Scalar instructions after vectorization with VF.
6547     CM.collectUniformsAndScalars(VF);
6548 
6549     // Collect the instructions (and their associated costs) that will be more
6550     // profitable to scalarize.
6551     if (VF > 1)
6552       CM.collectInstsToScalarize(VF);
6553   }
6554 
6555   buildVPlansWithVPRecipes(1, MaxVF);
6556   LLVM_DEBUG(printPlans(dbgs()));
6557   if (MaxVF == 1)
6558     return VectorizationFactor::Disabled();
6559 
6560   // Select the optimal vectorization factor.
6561   return CM.selectVectorizationFactor(MaxVF);
6562 }
6563 
6564 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6565   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6566                     << '\n');
6567   BestVF = VF;
6568   BestUF = UF;
6569 
6570   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6571     return !Plan->hasVF(VF);
6572   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6574 }
6575 
6576 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6577                                            DominatorTree *DT) {
6578   // Perform the actual loop transformation.
6579 
6580   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6581   VPCallbackILV CallbackILV(ILV);
6582 
6583   VPTransformState State{BestVF, BestUF,      LI,
6584                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6585                          &ILV,   CallbackILV};
6586   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6587   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6588   State.CanonicalIV = ILV.Induction;
6589 
6590   //===------------------------------------------------===//
6591   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost model.
6595   //
6596   //===------------------------------------------------===//
6597 
6598   // 2. Copy and widen instructions from the old loop into the new loop.
6599   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6600   VPlans.front()->execute(&State);
6601 
6602   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6603   //    predication, updating analyses.
6604   ILV.fixVectorizedLoop();
6605 }
6606 
6607 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6608     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6609   BasicBlock *Latch = OrigLoop->getLoopLatch();
6610 
6611   // We create new control-flow for the vectorized loop, so the original
6612   // condition will be dead after vectorization if it's only used by the
6613   // branch.
6614   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6615   if (Cmp && Cmp->hasOneUse())
6616     DeadInstructions.insert(Cmp);
6617 
6618   // We create new "steps" for induction variable updates to which the original
6619   // induction variables map. An original update instruction will be dead if
6620   // all its users except the induction variable are dead.
6621   for (auto &Induction : Legal->getInductionVars()) {
6622     PHINode *Ind = Induction.first;
6623     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6624     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6625           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6626                                  DeadInstructions.end();
6627         }))
6628       DeadInstructions.insert(IndUpdate);
6629 
    // We also record as "Dead" the type-casting instructions we identified
    // during induction analysis. We don't need any handling for them in the
6632     // vectorized loop because we have proven that, under a proper runtime
6633     // test guarding the vectorized loop, the value of the phi, and the casted
6634     // value of the phi, are the same. The last instruction in this casting chain
6635     // will get its scalar/vector/widened def from the scalar/vector/widened def
6636     // of the respective phi node. Any other casts in the induction def-use chain
6637     // have no other uses outside the phi update chain, and will be ignored.
6638     InductionDescriptor &IndDes = Induction.second;
6639     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6640     DeadInstructions.insert(Casts.begin(), Casts.end());
6641   }
6642 }
6643 
6644 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6645 
6646 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6647 
6648 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6649                                         Instruction::BinaryOps BinOp) {
6650   // When unrolling and the VF is 1, we only need to add a simple scalar.
6651   Type *Ty = Val->getType();
6652   assert(!Ty->isVectorTy() && "Val must be a scalar");
6653 
6654   if (Ty->isFloatingPointTy()) {
6655     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6656 
6657     // Floating point operations had to be 'fast' to enable the unrolling.
6658     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6659     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6660   }
6661   Constant *C = ConstantInt::get(Ty, StartIdx);
6662   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6663 }
6664 
6665 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6666   SmallVector<Metadata *, 4> MDs;
6667   // Reserve first location for self reference to the LoopID metadata node.
6668   MDs.push_back(nullptr);
6669   bool IsUnrollMetadata = false;
6670   MDNode *LoopID = L->getLoopID();
6671   if (LoopID) {
6672     // First find existing loop unrolling disable metadata.
6673     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6674       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6675       if (MD) {
6676         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6677         IsUnrollMetadata =
6678             S && S->getString().startswith("llvm.loop.unroll.disable");
6679       }
6680       MDs.push_back(LoopID->getOperand(i));
6681     }
6682   }
6683 
6684   if (!IsUnrollMetadata) {
6685     // Add runtime unroll disable metadata.
6686     LLVMContext &Context = L->getHeader()->getContext();
6687     SmallVector<Metadata *, 1> DisableOperands;
6688     DisableOperands.push_back(
6689         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6690     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6691     MDs.push_back(DisableNode);
6692     MDNode *NewLoopID = MDNode::get(Context, MDs);
6693     // Set operand 0 to refer to the loop id itself.
6694     NewLoopID->replaceOperandWith(0, NewLoopID);
6695     L->setLoopID(NewLoopID);
6696   }
6697 }
6698 
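// A sketch of the clamping behavior: for an assumed Range = {2, 16} (i.e.
// candidate VFs 2, 4 and 8) and a Predicate that holds for VF 2 and 4 but not
// for VF 8, the loop below shrinks Range.End to 8 and the function returns
// true, so the caller builds a single recipe that is valid for VFs {2, 4}.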
6699 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6700     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6701   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6702   bool PredicateAtRangeStart = Predicate(Range.Start);
6703 
6704   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6705     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6706       Range.End = TmpVF;
6707       break;
6708     }
6709 
6710   return PredicateAtRangeStart;
6711 }
6712 
6713 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6714 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6715 /// of VF's starting at a given VF and extending it as much as possible. Each
6716 /// vectorization decision can potentially shorten this sub-range during
6717 /// buildVPlan().
6718 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6719   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6720     VFRange SubRange = {VF, MaxVF + 1};
6721     VPlans.push_back(buildVPlan(SubRange));
6722     VF = SubRange.End;
6723   }
6724 }
6725 
6726 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6727                                          VPlanPtr &Plan) {
6728   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6729 
6730   // Look for cached value.
6731   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6732   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6733   if (ECEntryIt != EdgeMaskCache.end())
6734     return ECEntryIt->second;
6735 
6736   VPValue *SrcMask = createBlockInMask(Src, Plan);
6737 
6738   // The terminator has to be a branch inst!
6739   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6740   assert(BI && "Unexpected terminator found");
6741 
6742   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6743     return EdgeMaskCache[Edge] = SrcMask;
6744 
6745   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6746   assert(EdgeMask && "No Edge Mask found for condition");
6747 
6748   if (BI->getSuccessor(0) != Dst)
6749     EdgeMask = Builder.createNot(EdgeMask);
6750 
6751   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6752     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6753 
6754   return EdgeMaskCache[Edge] = EdgeMask;
6755 }
6756 
6757 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6758   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6759 
6760   // Look for cached value.
6761   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6762   if (BCEntryIt != BlockMaskCache.end())
6763     return BCEntryIt->second;
6764 
6765   // All-one mask is modelled as no-mask following the convention for masked
6766   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6767   VPValue *BlockMask = nullptr;
6768 
6769   if (OrigLoop->getHeader() == BB) {
6770     if (!CM.blockNeedsPredication(BB))
6771       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6772 
6773     // Introduce the early-exit compare IV <= BTC to form header block mask.
6774     // This is used instead of IV < TC because TC may wrap, unlike BTC.
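    // For example (an assumed 8-bit counted loop of 256 iterations): TC == 256
    // wraps to 0 in i8, whereas BTC == 255 does not, so "IV <= BTC" yields the
    // correct mask for every lane.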
6775     // Start by constructing the desired canonical IV.
6776     VPValue *IV = nullptr;
6777     if (Legal->getPrimaryInduction())
6778       IV = Plan->getVPValue(Legal->getPrimaryInduction());
6779     else {
6780       auto IVRecipe = new VPWidenCanonicalIVRecipe();
6781       Builder.getInsertBlock()->appendRecipe(IVRecipe);
6782       IV = IVRecipe->getVPValue();
6783     }
6784     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6785     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6786     return BlockMaskCache[BB] = BlockMask;
6787   }
6788 
6789   // This is the block mask. We OR all incoming edges.
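  // E.g., for a block with two predecessors P1 and P2 reached under conditions
  // c1 and c2, the resulting mask is roughly (mask(P1) & c1) | (mask(P2) & c2),
  // where each conjunct is the corresponding edge mask computed below.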
6790   for (auto *Predecessor : predecessors(BB)) {
6791     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6792     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6793       return BlockMaskCache[BB] = EdgeMask;
6794 
6795     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6796       BlockMask = EdgeMask;
6797       continue;
6798     }
6799 
6800     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6801   }
6802 
6803   return BlockMaskCache[BB] = BlockMask;
6804 }
6805 
6806 VPWidenMemoryInstructionRecipe *
6807 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6808                                   VPlanPtr &Plan) {
6809   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6810     return nullptr;
6811 
6812   auto willWiden = [&](unsigned VF) -> bool {
6813     if (VF == 1)
6814       return false;
6815     LoopVectorizationCostModel::InstWidening Decision =
6816         CM.getWideningDecision(I, VF);
6817     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6818            "CM decision should be taken at this point.");
6819     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6820       return true;
6821     if (CM.isScalarAfterVectorization(I, VF) ||
6822         CM.isProfitableToScalarize(I, VF))
6823       return false;
6824     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6825   };
6826 
6827   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6828     return nullptr;
6829 
6830   VPValue *Mask = nullptr;
6831   if (Legal->isMaskRequired(I))
6832     Mask = createBlockInMask(I->getParent(), Plan);
6833 
6834   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6835   if (LoadInst *Load = dyn_cast<LoadInst>(I))
6836     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
6837 
6838   StoreInst *Store = cast<StoreInst>(I);
6839   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
6840   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
6841 }
6842 
6843 VPWidenIntOrFpInductionRecipe *
6844 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6845   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6846     // Check if this is an integer or fp induction. If so, build the recipe that
6847     // produces its scalar and vector values.
6848     InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6849     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6850         II.getKind() == InductionDescriptor::IK_FpInduction)
6851       return new VPWidenIntOrFpInductionRecipe(Phi);
6852 
6853     return nullptr;
6854   }
6855 
6856   // Optimize the special case where the source is a constant integer
6857   // induction variable. Notice that we can only optimize the 'trunc' case
6858   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6859   // (c) other casts depend on pointer size.
6860 
6861   // Determine whether \p K is a truncation based on an induction variable that
6862   // can be optimized.
6863   auto isOptimizableIVTruncate =
6864       [&](Instruction *K) -> std::function<bool(unsigned)> {
6865     return
6866         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6867   };
6868 
6869   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6870                                isOptimizableIVTruncate(I), Range))
6871     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6872                                              cast<TruncInst>(I));
6873   return nullptr;
6874 }
6875 
6876 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6877   PHINode *Phi = dyn_cast<PHINode>(I);
6878   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6879     return nullptr;
6880 
6881   // We know that all PHIs in non-header blocks are converted into selects, so
6882   // we don't have to worry about the insertion order and we can just use the
6883   // builder. At this point we generate the predication tree. There may be
6884   // duplications since this is a simple recursive scan, but future
6885   // optimizations will clean it up.
6886 
6887   SmallVector<VPValue *, 2> Operands;
6888   unsigned NumIncoming = Phi->getNumIncomingValues();
6889   for (unsigned In = 0; In < NumIncoming; In++) {
6890     VPValue *EdgeMask =
6891       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6892     assert((EdgeMask || NumIncoming == 1) &&
6893            "Multiple predecessors with one having a full mask");
6894     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
6895     if (EdgeMask)
6896       Operands.push_back(EdgeMask);
6897   }
6898   return new VPBlendRecipe(Phi, Operands);
6899 }
6900 
6901 VPWidenCallRecipe *
6902 VPRecipeBuilder::tryToWidenCall(Instruction *I, VFRange &Range, VPlan &Plan) {
6903 
6904   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6905       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6906 
6907   CallInst *CI = dyn_cast<CallInst>(I);
6908   if (IsPredicated || !CI)
6909     return nullptr;
6910 
6911   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6912   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6913              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6914     return nullptr;
6915 
6916   auto willWiden = [&](unsigned VF) -> bool {
6917     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // NeedToScalarize reports whether the vectorized version of the call would
    // have to be scalarized rather than emitted as a vector library call; we
    // also check whether a vector intrinsic call is cheaper than a lib call.
6922     bool NeedToScalarize = false;
6923     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6924     bool UseVectorIntrinsic =
6925         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6926     return UseVectorIntrinsic || !NeedToScalarize;
6927   };
6928 
6929   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6930     return nullptr;
6931 
6932   // Success: widen this call.
6933   auto VPValues = map_range(CI->arg_operands(), [&Plan](Value *Op) {
6934     return Plan.getOrAddVPValue(Op);
6935   });
6936 
6937   return new VPWidenCallRecipe(*CI, VPValues);
6938 }
6939 
6940 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
6941   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6942       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6943 
6944   if (IsPredicated)
6945     return nullptr;
6946 
6947   auto IsVectorizableOpcode = [](unsigned Opcode) {
6948     switch (Opcode) {
6949     case Instruction::Add:
6950     case Instruction::And:
6951     case Instruction::AShr:
6952     case Instruction::BitCast:
6953     case Instruction::Br:
6954     case Instruction::FAdd:
6955     case Instruction::FCmp:
6956     case Instruction::FDiv:
6957     case Instruction::FMul:
6958     case Instruction::FNeg:
6959     case Instruction::FPExt:
6960     case Instruction::FPToSI:
6961     case Instruction::FPToUI:
6962     case Instruction::FPTrunc:
6963     case Instruction::FRem:
6964     case Instruction::FSub:
6965     case Instruction::ICmp:
6966     case Instruction::IntToPtr:
6967     case Instruction::Load:
6968     case Instruction::LShr:
6969     case Instruction::Mul:
6970     case Instruction::Or:
6971     case Instruction::PHI:
6972     case Instruction::PtrToInt:
6973     case Instruction::SDiv:
6974     case Instruction::Select:
6975     case Instruction::SExt:
6976     case Instruction::Shl:
6977     case Instruction::SIToFP:
6978     case Instruction::SRem:
6979     case Instruction::Store:
6980     case Instruction::Sub:
6981     case Instruction::Trunc:
6982     case Instruction::UDiv:
6983     case Instruction::UIToFP:
6984     case Instruction::URem:
6985     case Instruction::Xor:
6986     case Instruction::ZExt:
6987       return true;
6988     }
6989     return false;
6990   };
6991 
6992   if (!IsVectorizableOpcode(I->getOpcode()))
6993     return nullptr;
6994 
6995   auto willWiden = [&](unsigned VF) -> bool {
6996     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6997                              CM.isProfitableToScalarize(I, VF)))
6998       return false;
6999     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
7000       assert(CM.getWideningDecision(I, VF) ==
7001                  LoopVectorizationCostModel::CM_Scalarize &&
7002              "Memory widening decisions should have been taken care by now");
7003       return false;
7004     }
7005     return true;
7006   };
7007 
7008   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7009     return nullptr;
7010 
7011   // Success: widen this instruction.
7012   return new VPWidenRecipe(*I);
7013 }
7014 
7015 VPBasicBlock *VPRecipeBuilder::handleReplication(
7016     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7017     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7018     VPlanPtr &Plan) {
7019   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7020       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7021       Range);
7022 
7023   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7024       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7025 
7026   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
7027   setRecipe(I, Recipe);
7028 
7029   // Find if I uses a predicated instruction. If so, it will use its scalar
7030   // value. Avoid hoisting the insert-element which packs the scalar value into
7031   // a vector value, as that happens iff all users use the vector value.
7032   for (auto &Op : I->operands())
7033     if (auto *PredInst = dyn_cast<Instruction>(Op))
7034       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7035         PredInst2Recipe[PredInst]->setAlsoPack(false);
7036 
  // Finalize the recipe for Instr, handling the non-predicated case first.
7038   if (!IsPredicated) {
7039     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7040     VPBB->appendRecipe(Recipe);
7041     return VPBB;
7042   }
7043   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7044   assert(VPBB->getSuccessors().empty() &&
7045          "VPBB has successors when handling predicated replication.");
7046   // Record predicated instructions for above packing optimizations.
7047   PredInst2Recipe[I] = Recipe;
7048   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7049   VPBlockUtils::insertBlockAfter(Region, VPBB);
7050   auto *RegSucc = new VPBasicBlock();
7051   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7052   return RegSucc;
7053 }
7054 
7055 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7056                                                       VPRecipeBase *PredRecipe,
7057                                                       VPlanPtr &Plan) {
7058   // Instructions marked for predication are replicated and placed under an
7059   // if-then construct to prevent side-effects.
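  // The resulting region is, roughly:
  //
  //   pred.<opcode>.entry      (VPBranchOnMaskRecipe on the block-in mask)
  //      |          \
  //      |       pred.<opcode>.if        (the replicated instruction)
  //      |          /
  //   pred.<opcode>.continue   (VPPredInstPHIRecipe, if Instr has a result)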
7060 
7061   // Generate recipes to compute the block mask for this region.
7062   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7063 
7064   // Build the triangular if-then region.
7065   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7066   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7067   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7068   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7069   auto *PHIRecipe =
7070       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7071   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7072   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7073   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7074 
7075   // Note: first set Entry as region entry and then connect successors starting
7076   // from it in order, to propagate the "parent" of each VPBasicBlock.
7077   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7078   VPBlockUtils::connectBlocks(Pred, Exit);
7079 
7080   return Region;
7081 }
7082 
7083 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7084                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7085   VPRecipeBase *Recipe = nullptr;
7086 
7087   // First, check for specific widening recipes that deal with calls, memory
7088   // operations, inductions and Phi nodes.
7089   if ((Recipe = tryToWidenCall(Instr, Range, *Plan)) ||
7090       (Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7091       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7092       (Recipe = tryToBlend(Instr, Plan)) ||
7093       (isa<PHINode>(Instr) &&
7094        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7095     setRecipe(Instr, Recipe);
7096     VPBB->appendRecipe(Recipe);
7097     return true;
7098   }
7099 
7100   // Handle GEP widening.
7101   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7102     auto Scalarize = [&](unsigned VF) {
7103       return CM.isScalarWithPredication(Instr, VF) ||
7104              CM.isScalarAfterVectorization(Instr, VF) ||
7105              CM.isProfitableToScalarize(Instr, VF);
7106     };
7107     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7108       return false;
7109     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7110     setRecipe(Instr, Recipe);
7111     VPBB->appendRecipe(Recipe);
7112     return true;
7113   }
7114 
7115   // Check if Instr is to be widened by a general VPWidenRecipe, after
7116   // having first checked for specific widening recipes.
7117   if ((Recipe = tryToWiden(Instr, Range))) {
7118     setRecipe(Instr, Recipe);
7119     VPBB->appendRecipe(Recipe);
7120     return true;
7121   }
7122 
7123   return false;
7124 }
7125 
7126 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7127                                                         unsigned MaxVF) {
7128   assert(OrigLoop->empty() && "Inner loop expected.");
7129 
7130   // Collect conditions feeding internal conditional branches; they need to be
7131   // represented in VPlan for it to model masking.
7132   SmallPtrSet<Value *, 1> NeedDef;
7133 
7134   auto *Latch = OrigLoop->getLoopLatch();
7135   for (BasicBlock *BB : OrigLoop->blocks()) {
7136     if (BB == Latch)
7137       continue;
7138     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7139     if (Branch && Branch->isConditional())
7140       NeedDef.insert(Branch->getCondition());
7141   }
7142 
  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan to model early-exit masking.
7145   // Also, both the Phi and the live-out instruction of each reduction are
7146   // required in order to introduce a select between them in VPlan.
7147   if (CM.foldTailByMasking()) {
7148     if (Legal->getPrimaryInduction())
7149       NeedDef.insert(Legal->getPrimaryInduction());
7150     for (auto &Reduction : Legal->getReductionVars()) {
7151       NeedDef.insert(Reduction.first);
7152       NeedDef.insert(Reduction.second.getLoopExitInstr());
7153     }
7154   }
7155 
7156   // Collect instructions from the original loop that will become trivially dead
7157   // in the vectorized loop. We don't need to vectorize these instructions. For
7158   // example, original induction update instructions can become dead because we
7159   // separately emit induction "steps" when generating code for the new loop.
7160   // Similarly, we create a new latch condition when setting up the structure
7161   // of the new loop, so the old one can become dead.
7162   SmallPtrSet<Instruction *, 4> DeadInstructions;
7163   collectTriviallyDeadInstructions(DeadInstructions);
7164 
7165   // Add assume instructions we need to drop to DeadInstructions, to prevent
7166   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7168   // control flow is preserved, we should keep them.
7169   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7170   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7171 
7172   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7173   // Dead instructions do not need sinking. Remove them from SinkAfter.
7174   for (Instruction *I : DeadInstructions)
7175     SinkAfter.erase(I);
7176 
7177   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7178     VFRange SubRange = {VF, MaxVF + 1};
7179     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7180                                              DeadInstructions, SinkAfter));
7181     VF = SubRange.End;
7182   }
7183 }
7184 
7185 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7186     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7187     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7188     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7189 
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7193   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7194 
7195   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7196 
7197   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7198 
7199   // ---------------------------------------------------------------------------
7200   // Pre-construction: record ingredients whose recipes we'll need to further
7201   // process after constructing the initial VPlan.
7202   // ---------------------------------------------------------------------------
7203 
7204   // Mark instructions we'll need to sink later and their targets as
7205   // ingredients whose recipe we'll need to record.
7206   for (auto &Entry : SinkAfter) {
7207     RecipeBuilder.recordRecipeOf(Entry.first);
7208     RecipeBuilder.recordRecipeOf(Entry.second);
7209   }
7210 
7211   // For each interleave group which is relevant for this (possibly trimmed)
7212   // Range, add it to the set of groups to be later applied to the VPlan and add
7213   // placeholders for its members' Recipes which we'll be replacing with a
7214   // single VPInterleaveRecipe.
7215   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7216     auto applyIG = [IG, this](unsigned VF) -> bool {
7217       return (VF >= 2 && // Query is illegal for VF == 1
7218               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7219                   LoopVectorizationCostModel::CM_Interleave);
7220     };
7221     if (!getDecisionAndClampRange(applyIG, Range))
7222       continue;
7223     InterleaveGroups.insert(IG);
7224     for (unsigned i = 0; i < IG->getFactor(); i++)
7225       if (Instruction *Member = IG->getMember(i))
7226         RecipeBuilder.recordRecipeOf(Member);
  }
7228 
7229   // ---------------------------------------------------------------------------
7230   // Build initial VPlan: Scan the body of the loop in a topological order to
7231   // visit each basic block after having visited its predecessor basic blocks.
7232   // ---------------------------------------------------------------------------
7233 
7234   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7235   auto Plan = std::make_unique<VPlan>();
7236   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7237   Plan->setEntry(VPBB);
7238 
7239   // Represent values that will have defs inside VPlan.
7240   for (Value *V : NeedDef)
7241     Plan->addVPValue(V);
7242 
7243   // Scan the body of the loop in a topological order to visit each basic block
7244   // after having visited its predecessor basic blocks.
7245   LoopBlocksDFS DFS(OrigLoop);
7246   DFS.perform(LI);
7247 
7248   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7249     // Relevant instructions from basic block BB will be grouped into VPRecipe
7250     // ingredients and fill a new VPBasicBlock.
7251     unsigned VPBBsForBB = 0;
7252     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7253     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7254     VPBB = FirstVPBBForBB;
7255     Builder.setInsertPoint(VPBB);
7256 
7257     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7259     for (Instruction &I : BB->instructionsWithoutDebug()) {
7260       Instruction *Instr = &I;
7261 
7262       // First filter out irrelevant instructions, to ensure no recipes are
7263       // built for them.
7264       if (isa<BranchInst>(Instr) ||
7265           DeadInstructions.find(Instr) != DeadInstructions.end())
7266         continue;
7267 
7268       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7269         continue;
7270 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7273       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7274           Instr, Range, VPBB, PredInst2Recipe, Plan);
7275       if (NextVPBB != VPBB) {
7276         VPBB = NextVPBB;
7277         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7278                                     : "");
7279       }
7280     }
7281   }
7282 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7286   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7287   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7288   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7289   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7290   delete PreEntry;
7291 
7292   // ---------------------------------------------------------------------------
7293   // Transform initial VPlan: Apply previously taken decisions, in order, to
7294   // bring the VPlan to its final state.
7295   // ---------------------------------------------------------------------------
7296 
7297   // Apply Sink-After legal constraints.
7298   for (auto &Entry : SinkAfter) {
7299     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7300     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7301     Sink->moveAfter(Target);
7302   }
7303 
7304   // Interleave memory: for each Interleave Group we marked earlier as relevant
7305   // for this VPlan, replace the Recipes widening its memory instructions with a
7306   // single VPInterleaveRecipe at its insertion point.
7307   for (auto IG : InterleaveGroups) {
7308     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7309         RecipeBuilder.getRecipe(IG->getInsertPos()));
7310     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7311         ->insertBefore(Recipe);
7312 
7313     for (unsigned i = 0; i < IG->getFactor(); ++i)
7314       if (Instruction *Member = IG->getMember(i)) {
7315         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7316       }
7317   }
7318 
7319   // Finally, if tail is folded by masking, introduce selects between the phi
7320   // and the live-out instruction of each reduction, at the end of the latch.
7321   if (CM.foldTailByMasking()) {
7322     Builder.setInsertPoint(VPBB);
7323     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7324     for (auto &Reduction : Legal->getReductionVars()) {
7325       VPValue *Phi = Plan->getVPValue(Reduction.first);
7326       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7327       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7328     }
7329   }
7330 
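  // Name the plan after the VFs it covers; e.g. for Range [4, 16) the name
  // becomes "Initial VPlan for VF={4,8},UF>=1".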
7331   std::string PlanName;
7332   raw_string_ostream RSO(PlanName);
7333   unsigned VF = Range.Start;
7334   Plan->addVF(VF);
7335   RSO << "Initial VPlan for VF={" << VF;
7336   for (VF *= 2; VF < Range.End; VF *= 2) {
7337     Plan->addVF(VF);
7338     RSO << "," << VF;
7339   }
7340   RSO << "},UF>=1";
7341   RSO.flush();
7342   Plan->setName(PlanName);
7343 
7344   return Plan;
7345 }
7346 
7347 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  assert(!OrigLoop->empty() && "Expecting an outer loop.");
7353   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7354 
7355   // Create new empty VPlan
7356   auto Plan = std::make_unique<VPlan>();
7357 
7358   // Build hierarchical CFG
7359   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7360   HCFGBuilder.buildHierarchicalCFG();
7361 
7362   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7363     Plan->addVF(VF);
7364 
7365   if (EnableVPlanPredication) {
7366     VPlanPredicator VPP(*Plan);
7367     VPP.predicate();
7368 
7369     // Avoid running transformation to recipes until masked code generation in
7370     // VPlan-native path is in place.
7371     return Plan;
7372   }
7373 
7374   SmallPtrSet<Instruction *, 1> DeadInstructions;
7375   VPlanTransforms::VPInstructionsToVPRecipes(
7376       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7377   return Plan;
7378 }
7379 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7384 
7385 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7386     Value *V, const VPIteration &Instance) {
7387   return ILV.getOrCreateScalarValue(V, Instance);
7388 }
7389 
7390 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7391                                VPSlotTracker &SlotTracker) const {
7392   O << " +\n"
7393     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7394   IG->getInsertPos()->printAsOperand(O, false);
7395   O << ", ";
7396   getAddr()->printAsOperand(O, SlotTracker);
7397   VPValue *Mask = getMask();
7398   if (Mask) {
7399     O << ", ";
7400     Mask->printAsOperand(O, SlotTracker);
7401   }
7402   O << "\\l\"";
7403   for (unsigned i = 0; i < IG->getFactor(); ++i)
7404     if (Instruction *I = IG->getMember(i))
7405       O << " +\n"
7406         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7407 }
7408 
7409 void VPWidenCallRecipe::execute(VPTransformState &State) {
7410   State.ILV->widenCallInstruction(Ingredient, User, State);
7411 }
7412 
7413 void VPWidenRecipe::execute(VPTransformState &State) {
7414   State.ILV->widenInstruction(Ingredient);
7415 }
7416 
7417 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7418   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7419                       IsIndexLoopInvariant);
7420 }
7421 
7422 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7423   assert(!State.Instance && "Int or FP induction being replicated.");
7424   State.ILV->widenIntOrFpInduction(IV, Trunc);
7425 }
7426 
7427 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7428   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7429 }
7430 
7431 void VPBlendRecipe::execute(VPTransformState &State) {
7432   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7433   // We know that all PHIs in non-header blocks are converted into
7434   // selects, so we don't have to worry about the insertion order and we
7435   // can just use the builder.
7436   // At this point we generate the predication tree. There may be
7437   // duplications since this is a simple recursive scan, but future
7438   // optimizations will clean it up.
7439 
7440   unsigned NumIncoming = getNumIncomingValues();
7441 
7442   // Generate a sequence of selects of the form:
7443   // SELECT(Mask3, In3,
7444   //      SELECT(Mask2, In2,
7445   //                   ( ...)))
7446   InnerLoopVectorizer::VectorParts Entry(State.UF);
7447   for (unsigned In = 0; In < NumIncoming; ++In) {
7448     for (unsigned Part = 0; Part < State.UF; ++Part) {
7449       // We might have single edge PHIs (blocks) - use an identity
7450       // 'select' for the first PHI operand.
7451       Value *In0 = State.get(getIncomingValue(In), Part);
7452       if (In == 0)
7453         Entry[Part] = In0; // Initialize with the first incoming value.
7454       else {
7455         // Select between the current value and the previous incoming edge
7456         // based on the incoming mask.
7457         Value *Cond = State.get(getMask(In), Part);
7458         Entry[Part] =
7459             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7460       }
7461     }
7462   }
7463   for (unsigned Part = 0; Part < State.UF; ++Part)
7464     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7465 }
7466 
7467 void VPInterleaveRecipe::execute(VPTransformState &State) {
7468   assert(!State.Instance && "Interleave group being replicated.");
7469   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7470                                       getMask());
7471 }
7472 
7473 void VPReplicateRecipe::execute(VPTransformState &State) {
7474   if (State.Instance) { // Generate a single instance.
7475     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7476     // Insert scalar instance packing it into a vector.
7477     if (AlsoPack && State.VF > 1) {
7478       // If we're constructing lane 0, initialize to start from undef.
7479       if (State.Instance->Lane == 0) {
7480         Value *Undef =
7481             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7482         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7483       }
7484       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7485     }
7486     return;
7487   }
7488 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
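  // For example, with UF = 2 and VF = 4 a non-uniform instruction gets eight
  // scalar clones (parts 0..1 x lanes 0..3), while a uniform one gets only
  // two, one for lane 0 of each part.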
7492   unsigned EndLane = IsUniform ? 1 : State.VF;
7493   for (unsigned Part = 0; Part < State.UF; ++Part)
7494     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7495       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7496 }
7497 
7498 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7499   assert(State.Instance && "Branch on Mask works only on single instance.");
7500 
7501   unsigned Part = State.Instance->Part;
7502   unsigned Lane = State.Instance->Lane;
7503 
7504   Value *ConditionBit = nullptr;
7505   if (!User) // Block in mask is all-one.
7506     ConditionBit = State.Builder.getTrue();
7507   else {
7508     VPValue *BlockInMask = User->getOperand(0);
7509     ConditionBit = State.get(BlockInMask, Part);
7510     if (ConditionBit->getType()->isVectorTy())
7511       ConditionBit = State.Builder.CreateExtractElement(
7512           ConditionBit, State.Builder.getInt32(Lane));
7513   }
7514 
7515   // Replace the temporary unreachable terminator with a new conditional branch,
7516   // whose two destinations will be set later when they are created.
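  // Illustrative shape of the emitted IR (value and block names hypothetical;
  // the branch destinations are filled in later):
  //   %c = extractelement <VF x i1> %block.mask, i32 Lane
  //   br i1 %c, label %pred.if, label %pred.continue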
7517   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7518   assert(isa<UnreachableInst>(CurrentTerminator) &&
7519          "Expected to replace unreachable terminator with conditional branch.");
7520   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7521   CondBr->setSuccessor(0, nullptr);
7522   ReplaceInstWithInst(CurrentTerminator, CondBr);
7523 }
7524 
7525 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7526   assert(State.Instance && "Predicated instruction PHI works per instance.");
7527   Instruction *ScalarPredInst = cast<Instruction>(
7528       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7529   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7530   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7531   assert(PredicatingBB && "Predicated block has no single predecessor.");
7532 
7533   // By current pack/unpack logic we need to generate only a single phi node: if
7534   // a vector value for the predicated instruction exists at this point it means
7535   // the instruction has vector users only, and a phi for the vector value is
7536   // needed. In this case the recipe of the predicated instruction is marked to
7537   // also do that packing, thereby "hoisting" the insert-element sequence.
7538   // Otherwise, a phi node for the scalar value is needed.
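  // Illustrative sketch of the vector case (names hypothetical):
  //   PredicatingBB:
  //     br i1 %cond, label %PredicatedBB, label %ContinueBB
  //   PredicatedBB:
  //     %v.new = insertelement <VF x Ty> %v.old, Ty %scalar, i32 Lane
  //   ContinueBB:
  //     %vphi = phi <VF x Ty> [ %v.old, %PredicatingBB ],
  //                           [ %v.new, %PredicatedBB ]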
7539   unsigned Part = State.Instance->Part;
7540   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7541     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7542     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7543     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7544     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7545     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7546     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7547   } else {
7548     Type *PredInstType = PredInst->getType();
7549     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7550     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7551     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7552     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7553   }
7554 }
7555 
7556 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7557   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7558   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7559                                         getMask());
7560 }
7561 
// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predication compiler options, 3) loop hints
// forcing predication, and 4) a TTI hook that analyses whether the loop is
// suitable for predication.
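// For example (illustrative), a loop annotated with
//   #pragma clang loop vectorize_predicate(enable)
// causes Hints.getPredicate() to return FK_Enabled, which is honoured by
// case 3) below unless case 1) returns first.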
7566 static ScalarEpilogueLowering getScalarEpilogueLowering(
7567     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7568     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7569     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7570     LoopVectorizationLegality &LVL) {
7571   bool OptSize =
7572       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7573                                                      PGSOQueryType::IRPass);
7574   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7575   // don't look at hints or options, and don't request a scalar epilogue.
7576   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7577     return CM_ScalarEpilogueNotAllowedOptSize;
7578 
7579   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7580                               !PreferPredicateOverEpilog;
7581 
7582   // 2) Next, if disabling predication is requested on the command line, honour
7583   // this and request a scalar epilogue.
7584   if (PredicateOptDisabled)
7585     return CM_ScalarEpilogueAllowed;
7586 
  // 3) and 4) Check whether predication is requested on the command line or
  // with a loop hint, or whether the TTI hook indicates it is profitable (and
  // the hint does not disable it); if so, request predication.
7590   if (PreferPredicateOverEpilog ||
7591       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7592       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7593                                         LVL.getLAI()) &&
7594        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7595     return CM_ScalarEpilogueNotNeededUsePredicate;
7596 
7597   return CM_ScalarEpilogueAllowed;
7598 }
7599 
7600 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
7603 // input LLVM IR.
7604 static bool processLoopInVPlanNativePath(
7605     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7606     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7607     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7608     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7609     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7610 
7611   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7612   Function *F = L->getHeader()->getParent();
7613   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7614 
7615   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7616       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7617 
7618   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7619                                 &Hints, IAI);
7620   // Use the planner for outer loop vectorization.
7621   // TODO: CM is not used at this point inside the planner. Turn CM into an
7622   // optional argument if we don't need it in the future.
7623   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7624 
7625   // Get user vectorization factor.
7626   const unsigned UserVF = Hints.getWidth();
7627 
7628   // Plan how to best vectorize, return the best VF and its cost.
7629   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7630 
7631   // If we are stress testing VPlan builds, do not attempt to generate vector
7632   // code. Masked vector code generation support will follow soon.
7633   // Also, do not attempt to vectorize if no vector code will be produced.
7634   if (VPlanBuildStressTest || EnableVPlanPredication ||
7635       VectorizationFactor::Disabled() == VF)
7636     return false;
7637 
7638   LVP.setBestPlan(VF.Width, 1);
7639 
7640   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7641                          &CM);
7642   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7643                     << L->getHeader()->getParent()->getName() << "\"\n");
7644   LVP.executePlan(LB, DT);
7645 
7646   // Mark the loop as already vectorized to avoid vectorizing again.
7647   Hints.setAlreadyVectorized();
7648 
7649   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7650   return true;
7651 }
7652 
7653 bool LoopVectorizePass::processLoop(Loop *L) {
7654   assert((EnableVPlanNativePath || L->empty()) &&
7655          "VPlan-native path is not enabled. Only process inner loops.");
7656 
7657 #ifndef NDEBUG
7658   const std::string DebugLocStr = getDebugLocString(L);
7659 #endif /* NDEBUG */
7660 
7661   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7662                     << L->getHeader()->getParent()->getName() << "\" from "
7663                     << DebugLocStr << "\n");
7664 
7665   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7666 
7667   LLVM_DEBUG(
7668       dbgs() << "LV: Loop hints:"
7669              << " force="
7670              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7671                      ? "disabled"
7672                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7673                             ? "enabled"
7674                             : "?"))
7675              << " width=" << Hints.getWidth()
7676              << " unroll=" << Hints.getInterleave() << "\n");
7677 
7678   // Function containing loop
7679   Function *F = L->getHeader()->getParent();
7680 
7681   // Looking at the diagnostic output is the only way to determine if a loop
7682   // was vectorized (other than looking at the IR or machine code), so it
7683   // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose and report, respectively, vectorized loops and unvectorized loops
  // that may benefit from vectorization.
7688 
7689   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7690     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7691     return false;
7692   }
7693 
7694   PredicatedScalarEvolution PSE(*SE, *L);
7695 
7696   // Check if it is legal to vectorize the loop.
7697   LoopVectorizationRequirements Requirements(*ORE);
7698   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7699                                 &Requirements, &Hints, DB, AC);
7700   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7701     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7702     Hints.emitRemarkWithHints();
7703     return false;
7704   }
7705 
7706   // Check the function attributes and profiles to find out if this function
7707   // should be optimized for size.
7708   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7709       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7710 
7711   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7712   // here. They may require CFG and instruction level transformations before
7713   // even evaluating whether vectorization is profitable. Since we cannot modify
7714   // the incoming IR, we need to build VPlan upfront in the vectorization
7715   // pipeline.
7716   if (!L->empty())
7717     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7718                                         ORE, BFI, PSI, Hints);
7719 
7720   assert(L->empty() && "Inner loop expected.");
7721 
7722   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7723   // count by optimizing for size, to minimize overheads.
7724   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7725   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7726     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7727                       << "This loop is worth vectorizing only if no scalar "
7728                       << "iteration overheads are incurred.");
7729     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7730       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7731     else {
7732       LLVM_DEBUG(dbgs() << "\n");
7733       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7734     }
7735   }
7736 
7737   // Check the function attributes to see if implicit floats are allowed.
7738   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7739   // an integer loop and the vector instructions selected are purely integer
7740   // vector instructions?
7741   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7742     reportVectorizationFailure(
7743         "Can't vectorize when the NoImplicitFloat attribute is used",
7744         "loop not vectorized due to NoImplicitFloat attribute",
7745         "NoImplicitFloat", ORE, L);
7746     Hints.emitRemarkWithHints();
7747     return false;
7748   }
7749 
7750   // Check if the target supports potentially unsafe FP vectorization.
7751   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7752   // for the target we're vectorizing for, to make sure none of the
7753   // additional fp-math flags can help.
7754   if (Hints.isPotentiallyUnsafe() &&
7755       TTI->isFPVectorizationPotentiallyUnsafe()) {
7756     reportVectorizationFailure(
7757         "Potentially unsafe FP op prevents vectorization",
7758         "loop not vectorized due to unsafe FP support.",
7759         "UnsafeFP", ORE, L);
7760     Hints.emitRemarkWithHints();
7761     return false;
7762   }
7763 
7764   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7765   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7766 
7767   // If an override option has been passed in for interleaved accesses, use it.
7768   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7769     UseInterleaved = EnableInterleavedMemAccesses;
7770 
7771   // Analyze interleaved memory accesses.
7772   if (UseInterleaved) {
7773     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7774   }
7775 
7776   // Use the cost model.
7777   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7778                                 F, &Hints, IAI);
7779   CM.collectValuesToIgnore();
7780 
7781   // Use the planner for vectorization.
7782   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7783 
7784   // Get user vectorization factor.
7785   unsigned UserVF = Hints.getWidth();
7786 
7787   // Plan how to best vectorize, return the best VF and its cost.
7788   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7789 
7790   VectorizationFactor VF = VectorizationFactor::Disabled();
7791   unsigned IC = 1;
7792   unsigned UserIC = Hints.getInterleave();
7793 
7794   if (MaybeVF) {
7795     VF = *MaybeVF;
7796     // Select the interleave count.
7797     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7798   }
7799 
7800   // Identify the diagnostic messages that should be produced.
7801   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7802   bool VectorizeLoop = true, InterleaveLoop = true;
7803   if (Requirements.doesNotMeet(F, L, Hints)) {
7804     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7805                          "requirements.\n");
7806     Hints.emitRemarkWithHints();
7807     return false;
7808   }
7809 
7810   if (VF.Width == 1) {
7811     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7812     VecDiagMsg = std::make_pair(
7813         "VectorizationNotBeneficial",
7814         "the cost-model indicates that vectorization is not beneficial");
7815     VectorizeLoop = false;
7816   }
7817 
7818   if (!MaybeVF && UserIC > 1) {
7819     // Tell the user interleaving was avoided up-front, despite being explicitly
7820     // requested.
7821     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7822                          "interleaving should be avoided up front\n");
7823     IntDiagMsg = std::make_pair(
7824         "InterleavingAvoided",
7825         "Ignoring UserIC, because interleaving was avoided up front");
7826     InterleaveLoop = false;
7827   } else if (IC == 1 && UserIC <= 1) {
7828     // Tell the user interleaving is not beneficial.
7829     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7830     IntDiagMsg = std::make_pair(
7831         "InterleavingNotBeneficial",
7832         "the cost-model indicates that interleaving is not beneficial");
7833     InterleaveLoop = false;
7834     if (UserIC == 1) {
7835       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7836       IntDiagMsg.second +=
7837           " and is explicitly disabled or interleave count is set to 1";
7838     }
7839   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but is explicitly disabled.
7841     LLVM_DEBUG(
7842         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7843     IntDiagMsg = std::make_pair(
7844         "InterleavingBeneficialButDisabled",
7845         "the cost-model indicates that interleaving is beneficial "
7846         "but is explicitly disabled or interleave count is set to 1");
7847     InterleaveLoop = false;
7848   }
7849 
7850   // Override IC if user provided an interleave count.
7851   IC = UserIC > 0 ? UserIC : IC;
7852 
7853   // Emit diagnostic messages, if any.
7854   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7855   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7857     ORE->emit([&]() {
7858       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7859                                       L->getStartLoc(), L->getHeader())
7860              << VecDiagMsg.second;
7861     });
7862     ORE->emit([&]() {
7863       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7864                                       L->getStartLoc(), L->getHeader())
7865              << IntDiagMsg.second;
7866     });
7867     return false;
7868   } else if (!VectorizeLoop && InterleaveLoop) {
7869     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7870     ORE->emit([&]() {
7871       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7872                                         L->getStartLoc(), L->getHeader())
7873              << VecDiagMsg.second;
7874     });
7875   } else if (VectorizeLoop && !InterleaveLoop) {
7876     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7877                       << ") in " << DebugLocStr << '\n');
7878     ORE->emit([&]() {
7879       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7880                                         L->getStartLoc(), L->getHeader())
7881              << IntDiagMsg.second;
7882     });
7883   } else if (VectorizeLoop && InterleaveLoop) {
7884     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7885                       << ") in " << DebugLocStr << '\n');
7886     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7887   }
7888 
7889   LVP.setBestPlan(VF.Width, IC);
7890 
7891   using namespace ore;
7892   bool DisableRuntimeUnroll = false;
7893   MDNode *OrigLoopID = L->getLoopID();
7894 
7895   if (!VectorizeLoop) {
7896     assert(IC > 1 && "interleave count should not be 1 or 0");
7897     // If we decided that it is not legal to vectorize the loop, then
7898     // interleave it.
7899     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7900                                &CM);
7901     LVP.executePlan(Unroller, DT);
7902 
7903     ORE->emit([&]() {
7904       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7905                                 L->getHeader())
7906              << "interleaved loop (interleaved count: "
7907              << NV("InterleaveCount", IC) << ")";
7908     });
7909   } else {
7910     // If we decided that it is *legal* to vectorize the loop, then do it.
7911     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7912                            &LVL, &CM);
7913     LVP.executePlan(LB, DT);
7914     ++LoopsVectorized;
7915 
    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks about strides and memory. A scalar loop that is
7918     // rarely used is not worth unrolling.
7919     if (!LB.areSafetyChecksAdded())
7920       DisableRuntimeUnroll = true;
7921 
7922     // Report the vectorization decision.
7923     ORE->emit([&]() {
7924       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7925                                 L->getHeader())
7926              << "vectorized loop (vectorization width: "
7927              << NV("VectorizationFactor", VF.Width)
7928              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7929     });
7930   }
7931 
7932   Optional<MDNode *> RemainderLoopID =
7933       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7934                                       LLVMLoopVectorizeFollowupEpilogue});
7935   if (RemainderLoopID.hasValue()) {
7936     L->setLoopID(RemainderLoopID.getValue());
7937   } else {
7938     if (DisableRuntimeUnroll)
7939       AddRuntimeUnrollDisableMetaData(L);
7940 
7941     // Mark the loop as already vectorized to avoid vectorizing again.
7942     Hints.setAlreadyVectorized();
7943   }
7944 
7945   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7946   return true;
7947 }
7948 
7949 bool LoopVectorizePass::runImpl(
7950     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7951     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7952     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7953     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7954     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7955   SE = &SE_;
7956   LI = &LI_;
7957   TTI = &TTI_;
7958   DT = &DT_;
7959   BFI = &BFI_;
7960   TLI = TLI_;
7961   AA = &AA_;
7962   AC = &AC_;
7963   GetLAA = &GetLAA_;
7964   DB = &DB_;
7965   ORE = &ORE_;
7966   PSI = PSI_;
7967 
7968   // Don't attempt if
7969   // 1. the target claims to have no vector registers, and
7970   // 2. interleaving won't help ILP.
7971   //
7972   // The second condition is necessary because, even if the target has no
7973   // vector registers, loop vectorization may still enable scalar
7974   // interleaving.
7975   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7976       TTI->getMaxInterleaveFactor(1) < 2)
7977     return false;
7978 
7979   bool Changed = false;
7980 
7981   // The vectorizer requires loops to be in simplified form.
7982   // Since simplification may add new inner loops, it has to run before the
7983   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7985   // vectorized.
7986   for (auto &L : *LI)
7987     Changed |=
7988         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7989 
7990   // Build up a worklist of inner-loops to vectorize. This is necessary as
7991   // the act of vectorizing or partially unrolling a loop creates new loops
7992   // and can invalidate iterators across the loops.
7993   SmallVector<Loop *, 8> Worklist;
7994 
7995   for (Loop *L : *LI)
7996     collectSupportedLoops(*L, LI, ORE, Worklist);
7997 
7998   LoopsAnalyzed += Worklist.size();
7999 
8000   // Now walk the identified inner loops.
8001   while (!Worklist.empty()) {
8002     Loop *L = Worklist.pop_back_val();
8003 
8004     // For the inner loops we actually process, form LCSSA to simplify the
8005     // transform.
8006     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8007 
8008     Changed |= processLoop(L);
8009   }
8010 
8011   // Process each loop nest in the function.
8012   return Changed;
8013 }
8014 
8015 PreservedAnalyses LoopVectorizePass::run(Function &F,
8016                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
8058 }
8059