1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
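//
// As an illustrative sketch (C-like pseudocode, not the IR this pass emits),
// a scalar loop such as
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 1;
// is conceptually rewritten for VF = 4 as
//   for (i = 0; i + 3 < n; i += 4)
//     A[i..i+3] = B[i..i+3] + {1, 1, 1, 1};
// with the remaining iterations handled by a scalar epilogue loop (or folded
// into the vector loop via predication).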
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing development effort to migrate the loop vectorizer to
// the VPlan infrastructure and to introduce outer loop vectorization support
// (see docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
// Indicates that an epilogue is undesired and that predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
192              "will be determined by the smallest type in loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
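/// For example (illustrative), if a loop loads only field 'a' of an array of
/// struct { int a, b; }, the interleave group has a gap: a wide load would
/// also touch the 'b' elements, and masking can keep those gap lanes from
/// being accessed.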
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip "
             "count below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 cl::opt<bool> EnableVPlanNativePath(
269     "enable-vplan-native-path", cl::init(false), cl::Hidden,
270     cl::desc("Enable VPlan-native vectorization path with "
271              "support for outer loop vectorization."));
272 
273 // FIXME: Remove this switch once we have divergence analysis. Currently we
274 // assume divergent non-backedge branches when this switch is true.
275 cl::opt<bool> EnableVPlanPredication(
276     "enable-vplan-predication", cl::init(false), cl::Hidden,
277     cl::desc("Enable VPlan-native vectorization path predicator with "
278              "support for outer loop vectorization."));
279 
280 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
282 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
283 // verification of the H-CFGs built.
284 static cl::opt<bool> VPlanBuildStressTest(
285     "vplan-build-stress-test", cl::init(false), cl::Hidden,
286     cl::desc(
287         "Build VPlan for every supported loop nest in the function and bail "
288         "out right after the build (stress test the VPlan H-CFG construction "
289         "in the VPlan-native vectorization path)."));
290 
291 cl::opt<bool> llvm::EnableLoopInterleaving(
292     "interleave-loops", cl::init(true), cl::Hidden,
293     cl::desc("Enable loop interleaving in Loop vectorization passes"));
294 cl::opt<bool> llvm::EnableLoopVectorization(
295     "vectorize-loops", cl::init(true), cl::Hidden,
296     cl::desc("Run the Loop vectorization passes"));
297 
/// A helper function that returns the type of a loaded or stored value.
299 static Type *getMemInstValueType(Value *I) {
300   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
301          "Expected Load or Store instruction");
302   if (auto *LI = dyn_cast<LoadInst>(I))
303     return LI->getType();
304   return cast<StoreInst>(I)->getValueOperand()->getType();
305 }
306 
307 /// A helper function that returns true if the given type is irregular. The
308 /// type is irregular if its allocated size doesn't equal the store size of an
309 /// element of the corresponding vector type at the given vectorization factor.
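/// For example, on a typical x86-64 data layout x86_fp80 has a type size of
/// 80 bits but an allocation size of 128 bits, so it is irregular even at
/// VF = 1.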
310 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
311   // Determine if an array of VF elements of type Ty is "bitcast compatible"
312   // with a <VF x Ty> vector.
313   if (VF > 1) {
314     auto *VectorTy = VectorType::get(Ty, VF);
315     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
316   }
317 
318   // If the vectorization factor is one, we just check if an array of type Ty
319   // requires padding between elements.
320   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
321 }
322 
323 /// A helper function that returns the reciprocal of the block probability of
324 /// predicated blocks. If we return X, we are assuming the predicated block
325 /// will execute once for every X iterations of the loop header.
326 ///
327 /// TODO: We should use actual block probability here, if available. Currently,
328 ///       we always assume predicated blocks have a 50% chance of executing.
329 static unsigned getReciprocalPredBlockProb() { return 2; }
330 
331 /// A helper function that adds a 'fast' flag to floating-point operations.
332 static Value *addFastMathFlag(Value *V) {
333   if (isa<FPMathOperator>(V))
334     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
335   return V;
336 }
337 
338 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FMF);
341   return V;
342 }
343 
344 /// A helper function that returns an integer or floating-point constant with
345 /// value C.
346 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
347   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
348                            : ConstantFP::get(Ty, C);
349 }
350 
351 /// Returns "best known" trip count for the specified loop \p L as defined by
352 /// the following procedure:
353 ///   1) Returns exact trip count if it is known.
354 ///   2) Returns expected trip count according to profile data if any.
355 ///   3) Returns upper bound estimate if it is known.
356 ///   4) Returns None if all of the above failed.
357 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
358   // Check if exact trip count is known.
359   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
360     return ExpectedTC;
361 
362   // Check if there is an expected trip count available from profile data.
363   if (LoopVectorizeWithBlockFrequency)
364     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
365       return EstimatedTC;
366 
367   // Check if upper bound estimate is known.
368   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
369     return ExpectedTC;
370 
371   return None;
372 }
373 
374 namespace llvm {
375 
376 /// InnerLoopVectorizer vectorizes loops which contain only one basic
377 /// block to a specified vectorization factor (VF).
378 /// This class performs the widening of scalars into vectors, or multiple
379 /// scalars. This class also implements the following features:
380 /// * It inserts an epilogue loop for handling loops that don't have iteration
381 ///   counts that are known to be a multiple of the vectorization factor.
382 /// * It handles the code generation for reduction variables.
383 /// * Scalarization (implementation using scalars) of un-vectorizable
384 ///   instructions.
385 /// InnerLoopVectorizer does not perform any vectorization-legality
386 /// checks, and relies on the caller to check for the different legality
387 /// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found in the loop.
390 class InnerLoopVectorizer {
391 public:
392   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
393                       LoopInfo *LI, DominatorTree *DT,
394                       const TargetLibraryInfo *TLI,
395                       const TargetTransformInfo *TTI, AssumptionCache *AC,
396                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
397                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
398                       LoopVectorizationCostModel *CM)
399       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
400         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
401         Builder(PSE.getSE()->getContext()),
402         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
403   virtual ~InnerLoopVectorizer() = default;
404 
405   /// Create a new empty loop. Unlink the old loop and connect the new one.
406   /// Return the pre-header block of the new loop.
407   BasicBlock *createVectorizedLoopSkeleton();
408 
409   /// Widen a single instruction within the innermost loop.
410   void widenInstruction(Instruction &I);
411 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
413   void fixVectorizedLoop();
414 
415   // Return true if any runtime check is added.
416   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
417 
418   /// A type for vectorized values in the new loop. Each value from the
419   /// original loop, when vectorized, is represented by UF vector values in the
420   /// new unrolled loop, where UF is the unroll factor.
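  /// For example, with UF = 2 a single widened add from the original loop is
  /// represented by two vector add instructions, one per unroll part.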
421   using VectorParts = SmallVector<Value *, 2>;
422 
423   /// Vectorize a single GetElementPtrInst based on information gathered and
424   /// decisions taken during planning.
425   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
426                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
427 
428   /// Vectorize a single PHINode in a block. This method handles the induction
429   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
430   /// arbitrary length vectors.
431   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
432 
  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a scalar instance of \p Instr for the unroll part and
  /// vector lane given by \p Instance, predicating it if \p IfPredicateInstr
  /// is true.
437   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
438                             bool IfPredicateInstr);
439 
440   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
441   /// is provided, the integer induction variable will first be truncated to
442   /// the corresponding type.
443   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
444 
445   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
446   /// vector or scalar value on-demand if one is not yet available. When
447   /// vectorizing a loop, we visit the definition of an instruction before its
448   /// uses. When visiting the definition, we either vectorize or scalarize the
449   /// instruction, creating an entry for it in the corresponding map. (In some
450   /// cases, such as induction variables, we will create both vector and scalar
451   /// entries.) Then, as we encounter uses of the definition, we derive values
452   /// for each scalar or vector use unless such a value is already available.
453   /// For example, if we scalarize a definition and one of its uses is vector,
454   /// we build the required vector on-demand with an insertelement sequence
455   /// when visiting the use. Otherwise, if the use is scalar, we can use the
456   /// existing scalar definition.
457   ///
458   /// Return a value in the new loop corresponding to \p V from the original
459   /// loop at unroll index \p Part. If the value has already been vectorized,
460   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
461   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
462   /// a new vector value on-demand by inserting the scalar values into a vector
463   /// with an insertelement sequence. If the value has been neither vectorized
464   /// nor scalarized, it must be loop invariant, so we simply broadcast the
465   /// value into a vector.
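  ///
  /// As an illustrative sketch (VF = 4, a single unroll part): if a
  /// definition %d was scalarized into %d.0 ... %d.3 and a later use needs a
  /// vector, the on-demand packing emits roughly
  ///   %v.0 = insertelement <4 x i32> undef, i32 %d.0, i32 0
  ///   %v.1 = insertelement <4 x i32> %v.0, i32 %d.1, i32 1
  ///   %v.2 = insertelement <4 x i32> %v.1, i32 %d.2, i32 2
  ///   %v.3 = insertelement <4 x i32> %v.2, i32 %d.3, i32 3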
466   Value *getOrCreateVectorValue(Value *V, unsigned Part);
467 
468   /// Return a value in the new loop corresponding to \p V from the original
469   /// loop at unroll and vector indices \p Instance. If the value has been
470   /// vectorized but not scalarized, the necessary extractelement instruction
471   /// will be generated.
472   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
473 
474   /// Construct the vector value of a scalarized value \p V one lane at a time.
475   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
476 
477   /// Try to vectorize the interleaved access group that \p Instr belongs to
478   /// with the base address given in \p Addr, optionally masking the vector
479   /// operations if \p BlockInMask is non-null. Use \p State to translate given
480   /// VPValues to IR values in the vectorized loop.
481   void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
482                                 VPValue *Addr, VPValue *BlockInMask = nullptr);
483 
484   /// Vectorize Load and Store instructions with the base address given in \p
485   /// Addr, optionally masking the vector operations if \p BlockInMask is
486   /// non-null. Use \p State to translate given VPValues to IR values in the
487   /// vectorized loop.
488   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
489                                   VPValue *Addr,
490                                   VPValue *BlockInMask = nullptr);
491 
492   /// Set the debug location in the builder using the debug location in
493   /// the instruction.
494   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
495 
496   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
498 
499 protected:
500   friend class LoopVectorizationPlanner;
501 
502   /// A small list of PHINodes.
503   using PhiVector = SmallVector<PHINode *, 4>;
504 
505   /// A type for scalarized values in the new loop. Each value from the
506   /// original loop, when scalarized, is represented by UF x VF scalar values
507   /// in the new unrolled loop, where UF is the unroll factor and VF is the
508   /// vectorization factor.
509   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
510 
511   /// Set up the values of the IVs correctly when exiting the vector loop.
512   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
513                     Value *CountRoundDown, Value *EndValue,
514                     BasicBlock *MiddleBlock);
515 
516   /// Create a new induction variable inside L.
517   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
518                                    Value *Step, Instruction *DL);
519 
520   /// Handle all cross-iteration phis in the header.
521   void fixCrossIterationPHIs();
522 
523   /// Fix a first-order recurrence. This is the second phase of vectorizing
524   /// this phi node.
525   void fixFirstOrderRecurrence(PHINode *Phi);
526 
527   /// Fix a reduction cross-iteration phi. This is the second phase of
528   /// vectorizing this phi node.
529   void fixReduction(PHINode *Phi);
530 
531   /// Clear NSW/NUW flags from reduction instructions if necessary.
532   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
533 
  /// The loop exit block may have single-value PHI nodes with some incoming
  /// value. While vectorizing we only handle real values that were defined
  /// inside the loop, and we should have one value for each predecessor of
  /// the PHI's parent basic block. See PR14725.
538   void fixLCSSAPHIs();
539 
540   /// Iteratively sink the scalarized operands of a predicated instruction into
541   /// the block that was created for it.
542   void sinkScalarOperands(Instruction *PredInst);
543 
544   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
545   /// represented as.
546   void truncateToMinimalBitwidths();
547 
  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop-invariant values and for the induction
  /// value. If this is the induction variable, we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
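  ///
  /// An illustrative splat of a loop-invariant i32 %x for VF = 4 looks
  /// roughly like:
  ///   %ins   = insertelement <4 x i32> undef, i32 %x, i32 0
  ///   %splat = shufflevector <4 x i32> %ins, <4 x i32> undef,
  ///                          <4 x i32> zeroinitializer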
553   virtual Value *getBroadcastInstrs(Value *V);
554 
  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of \p Val. The sequence starts at \p StartIdx.
  /// \p Opcode is relevant for FP induction variables.
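  /// For example (illustrative), with Val = <5, 5, 5, 5>, StartIdx = 0 and
  /// Step = 1 the result is <5, 6, 7, 8>.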
558   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
559                                Instruction::BinaryOps Opcode =
560                                Instruction::BinaryOpsEnd);
561 
562   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
563   /// variable on which to base the steps, \p Step is the size of the step, and
564   /// \p EntryVal is the value from the original loop that maps to the steps.
565   /// Note that \p EntryVal doesn't have to be an induction variable - it
566   /// can also be a truncate instruction.
567   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
568                         const InductionDescriptor &ID);
569 
570   /// Create a vector induction phi node based on an existing scalar one. \p
571   /// EntryVal is the value from the original loop that maps to the vector phi
572   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
573   /// truncate instruction, instead of widening the original IV, we widen a
574   /// version of the IV truncated to \p EntryVal's type.
575   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
576                                        Value *Step, Instruction *EntryVal);
577 
578   /// Returns true if an instruction \p I should be scalarized instead of
579   /// vectorized for the chosen vectorization factor.
580   bool shouldScalarizeInstruction(Instruction *I) const;
581 
582   /// Returns true if we should generate a scalar version of \p IV.
583   bool needsScalarInduction(Instruction *IV) const;
584 
585   /// If there is a cast involved in the induction variable \p ID, which should
586   /// be ignored in the vectorized loop body, this function records the
587   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
588   /// cast. We had already proved that the casted Phi is equal to the uncasted
589   /// Phi in the vectorized loop (under a runtime guard), and therefore
590   /// there is no need to vectorize the cast - the same value can be used in the
591   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
594   ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish which IV is currently being
  /// processed: the original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned
  /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
  /// In the latter case \p EntryVal is a TruncInst and we must not record
  /// anything for that IV, but it's error-prone to expect callers of this
  /// routine to care about that, hence this explicit parameter.
603   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
604                                              const Instruction *EntryVal,
605                                              Value *VectorLoopValue,
606                                              unsigned Part,
607                                              unsigned Lane = UINT_MAX);
608 
609   /// Generate a shuffle sequence that will reverse the vector Vec.
610   virtual Value *reverseVector(Value *Vec);
611 
612   /// Returns (and creates if needed) the original loop trip count.
613   Value *getOrCreateTripCount(Loop *NewLoop);
614 
615   /// Returns (and creates if needed) the trip count of the widened loop.
616   Value *getOrCreateVectorTripCount(Loop *NewLoop);
617 
618   /// Returns a bitcasted value to the requested vector type.
619   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
620   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
621                                 const DataLayout &DL);
622 
623   /// Emit a bypass check to see if the vector trip count is zero, including if
624   /// it overflows.
625   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
626 
627   /// Emit a bypass check to see if all of the SCEV assumptions we've
628   /// had to make are correct.
629   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
630 
631   /// Emit bypass checks to check any memory assumptions we may have made.
632   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
633 
634   /// Compute the transformed value of Index at offset StartValue using step
635   /// StepValue.
636   /// For integer induction, returns StartValue + Index * StepValue.
637   /// For pointer induction, returns StartValue[Index * StepValue].
638   /// FIXME: The newly created binary instructions should contain nsw/nuw
639   /// flags, which can be found from the original scalar operations.
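  /// For example (illustrative), an integer induction with StartValue = 7
  /// and StepValue = 3 maps Index = 2 to 7 + 2 * 3 = 13, while a pointer
  /// induction produces a GEP equivalent to &StartValue[2 * 3].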
640   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
641                               const DataLayout &DL,
642                               const InductionDescriptor &ID) const;
643 
644   /// Add additional metadata to \p To that was not present on \p Orig.
645   ///
646   /// Currently this is used to add the noalias annotations based on the
647   /// inserted memchecks.  Use this for instructions that are *cloned* into the
648   /// vector loop.
649   void addNewMetadata(Instruction *To, const Instruction *Orig);
650 
651   /// Add metadata from one instruction to another.
652   ///
653   /// This includes both the original MDs from \p From and additional ones (\see
654   /// addNewMetadata).  Use this for *newly created* instructions in the vector
655   /// loop.
656   void addMetadata(Instruction *To, Instruction *From);
657 
658   /// Similar to the previous function but it adds the metadata to a
659   /// vector of instructions.
660   void addMetadata(ArrayRef<Value *> To, Instruction *From);
661 
662   /// The original loop.
663   Loop *OrigLoop;
664 
665   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
666   /// dynamic knowledge to simplify SCEV expressions and converts them to a
667   /// more usable form.
668   PredicatedScalarEvolution &PSE;
669 
670   /// Loop Info.
671   LoopInfo *LI;
672 
673   /// Dominator Tree.
674   DominatorTree *DT;
675 
676   /// Alias Analysis.
677   AliasAnalysis *AA;
678 
679   /// Target Library Info.
680   const TargetLibraryInfo *TLI;
681 
682   /// Target Transform Info.
683   const TargetTransformInfo *TTI;
684 
685   /// Assumption Cache.
686   AssumptionCache *AC;
687 
688   /// Interface to emit optimization remarks.
689   OptimizationRemarkEmitter *ORE;
690 
691   /// LoopVersioning.  It's only set up (non-null) if memchecks were
692   /// used.
693   ///
694   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
696   std::unique_ptr<LoopVersioning> LVer;
697 
698   /// The vectorization SIMD factor to use. Each vector will have this many
699   /// vector elements.
700   unsigned VF;
701 
702   /// The vectorization unroll factor to use. Each scalar is vectorized to this
703   /// many different vector instructions.
704   unsigned UF;
705 
  /// The builder that we use.
707   IRBuilder<> Builder;
708 
709   // --- Vectorization state ---
710 
711   /// The vector-loop preheader.
712   BasicBlock *LoopVectorPreHeader;
713 
714   /// The scalar-loop preheader.
715   BasicBlock *LoopScalarPreHeader;
716 
717   /// Middle Block between the vector and the scalar.
718   BasicBlock *LoopMiddleBlock;
719 
720   /// The ExitBlock of the scalar loop.
721   BasicBlock *LoopExitBlock;
722 
723   /// The vector loop body.
724   BasicBlock *LoopVectorBody;
725 
726   /// The scalar loop body.
727   BasicBlock *LoopScalarBody;
728 
729   /// A list of all bypass blocks. The first block is the entry of the loop.
730   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
731 
732   /// The new Induction variable which was added to the new block.
733   PHINode *Induction = nullptr;
734 
735   /// The induction variable of the old basic block.
736   PHINode *OldInduction = nullptr;
737 
738   /// Maps values from the original loop to their corresponding values in the
739   /// vectorized loop. A key value can map to either vector values, scalar
740   /// values or both kinds of values, depending on whether the key was
741   /// vectorized and scalarized.
742   VectorizerValueMap VectorLoopValueMap;
743 
744   /// Store instructions that were predicated.
745   SmallVector<Instruction *, 4> PredicatedInstructions;
746 
747   /// Trip count of the original loop.
748   Value *TripCount = nullptr;
749 
750   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
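  /// (For example, TripCount = 1003 with VF = 4 and UF = 2 yields
  /// 1003 - 1003 % 8 = 1000.)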
751   Value *VectorTripCount = nullptr;
752 
753   /// The legality analysis.
754   LoopVectorizationLegality *Legal;
755 
  /// The profitability analysis.
757   LoopVectorizationCostModel *Cost;
758 
759   // Record whether runtime checks are added.
760   bool AddedSafetyChecks = false;
761 
762   // Holds the end values for each induction variable. We save the end values
763   // so we can later fix-up the external users of the induction variables.
764   DenseMap<PHINode *, Value *> IVEndValues;
765 
766   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
767   // fixed up at the end of vector code generation.
768   SmallVector<PHINode *, 8> OrigPHIsToFix;
769 };
770 
771 class InnerLoopUnroller : public InnerLoopVectorizer {
772 public:
773   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
774                     LoopInfo *LI, DominatorTree *DT,
775                     const TargetLibraryInfo *TLI,
776                     const TargetTransformInfo *TTI, AssumptionCache *AC,
777                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
778                     LoopVectorizationLegality *LVL,
779                     LoopVectorizationCostModel *CM)
780       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
781                             UnrollFactor, LVL, CM) {}
782 
783 private:
784   Value *getBroadcastInstrs(Value *V) override;
785   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
786                        Instruction::BinaryOps Opcode =
787                        Instruction::BinaryOpsEnd) override;
788   Value *reverseVector(Value *Vec) override;
789 };
790 
791 } // end namespace llvm
792 
/// Look for a meaningful debug location on the instruction or its
/// operands.
795 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
796   if (!I)
797     return I;
798 
799   DebugLoc Empty;
800   if (I->getDebugLoc() != Empty)
801     return I;
802 
803   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
804     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
805       if (OpInst->getDebugLoc() != Empty)
806         return OpInst;
807   }
808 
809   return I;
810 }
811 
812 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
813   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
814     const DILocation *DIL = Inst->getDebugLoc();
815     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
816         !isa<DbgInfoIntrinsic>(Inst)) {
817       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
818       if (NewDIL)
819         B.SetCurrentDebugLocation(NewDIL.getValue());
820       else
821         LLVM_DEBUG(dbgs()
822                    << "Failed to create new discriminator: "
823                    << DIL->getFilename() << " Line: " << DIL->getLine());
824     }
825     else
826       B.SetCurrentDebugLocation(DIL);
827   } else
828     B.SetCurrentDebugLocation(DebugLoc());
829 }
830 
831 /// Write a record \p DebugMsg about vectorization failure to the debug
832 /// output stream. If \p I is passed, it is an instruction that prevents
833 /// vectorization.
834 #ifndef NDEBUG
835 static void debugVectorizationFailure(const StringRef DebugMsg,
836     Instruction *I) {
837   dbgs() << "LV: Not vectorizing: " << DebugMsg;
838   if (I != nullptr)
839     dbgs() << " " << *I;
840   else
841     dbgs() << '.';
842   dbgs() << '\n';
843 }
844 #endif
845 
846 /// Create an analysis remark that explains why vectorization failed
847 ///
848 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
849 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
850 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
851 /// the location of the remark.  \return the remark object that can be
852 /// streamed to.
853 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
854     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
855   Value *CodeRegion = TheLoop->getHeader();
856   DebugLoc DL = TheLoop->getStartLoc();
857 
858   if (I) {
859     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
862     if (I->getDebugLoc())
863       DL = I->getDebugLoc();
864   }
865 
866   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
867   R << "loop not vectorized: ";
868   return R;
869 }
870 
871 namespace llvm {
872 
873 void reportVectorizationFailure(const StringRef DebugMsg,
874     const StringRef OREMsg, const StringRef ORETag,
875     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
876   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
877   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
878   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
879                 ORETag, TheLoop, I) << OREMsg);
880 }
881 
882 } // end namespace llvm
883 
884 #ifndef NDEBUG
885 /// \return string containing a file name and a line # for the given loop.
886 static std::string getDebugLocString(const Loop *L) {
887   std::string Result;
888   if (L) {
889     raw_string_ostream OS(Result);
890     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
891       LoopDbgLoc.print(OS);
892     else
893       // Just print the module name.
894       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
895     OS.flush();
896   }
897   return Result;
898 }
899 #endif
900 
901 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
902                                          const Instruction *Orig) {
903   // If the loop was versioned with memchecks, add the corresponding no-alias
904   // metadata.
905   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
906     LVer->annotateInstWithNoAlias(To, Orig);
907 }
908 
909 void InnerLoopVectorizer::addMetadata(Instruction *To,
910                                       Instruction *From) {
911   propagateMetadata(To, From);
912   addNewMetadata(To, From);
913 }
914 
915 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
916                                       Instruction *From) {
917   for (Value *V : To) {
918     if (Instruction *I = dyn_cast<Instruction>(V))
919       addMetadata(I, From);
920   }
921 }
922 
923 namespace llvm {
924 
// Hints for the loop vectorization cost model on how the scalar epilogue loop
// should be lowered.
927 enum ScalarEpilogueLowering {
928 
929   // The default: allowing scalar epilogues.
930   CM_ScalarEpilogueAllowed,
931 
932   // Vectorization with OptForSize: don't allow epilogues.
933   CM_ScalarEpilogueNotAllowedOptSize,
934 
  // A special case of vectorization with OptForSize: loops with a very small
936   // trip count are considered for vectorization under OptForSize, thereby
937   // making sure the cost of their loop body is dominant, free of runtime
938   // guards and scalar iteration overheads.
939   CM_ScalarEpilogueNotAllowedLowTripLoop,
940 
941   // Loop hint predicate indicating an epilogue is undesired.
942   CM_ScalarEpilogueNotNeededUsePredicate
943 };
944 
945 /// LoopVectorizationCostModel - estimates the expected speedups due to
946 /// vectorization.
947 /// In many cases vectorization is not profitable. This can happen because of
948 /// a number of reasons. In this class we mainly attempt to predict the
949 /// expected speedup/slowdowns due to the supported instruction set. We use the
950 /// TargetTransformInfo to query the different backends for the cost of
951 /// different operations.
952 class LoopVectorizationCostModel {
953 public:
954   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
955                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
956                              LoopVectorizationLegality *Legal,
957                              const TargetTransformInfo &TTI,
958                              const TargetLibraryInfo *TLI, DemandedBits *DB,
959                              AssumptionCache *AC,
960                              OptimizationRemarkEmitter *ORE, const Function *F,
961                              const LoopVectorizeHints *Hints,
962                              InterleavedAccessInfo &IAI)
963       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
964         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
965         Hints(Hints), InterleaveInfo(IAI) {}
966 
967   /// \return An upper bound for the vectorization factor, or None if
968   /// vectorization and interleaving should be avoided up front.
969   Optional<unsigned> computeMaxVF();
970 
971   /// \return True if runtime checks are required for vectorization, and false
972   /// otherwise.
973   bool runtimeChecksRequired();
974 
975   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not
  /// zero, then this vectorization factor will be selected if vectorization
  /// is possible.
979   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
980 
981   /// Setup cost-based decisions for user vectorization factor.
982   void selectUserVectorizationFactor(unsigned UserVF) {
983     collectUniformsAndScalars(UserVF);
984     collectInstsToScalarize(UserVF);
985   }
986 
987   /// \return The size (in bits) of the smallest and widest types in the code
988   /// that needs to be vectorized. We ignore values that remain scalar such as
989   /// 64 bit loop indices.
990   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
991 
992   /// \return The desired interleave count.
993   /// If interleave count has been specified by metadata it will be returned.
994   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
995   /// are the selected vectorization factor and the cost of the selected VF.
996   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
997 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1005   void setCostBasedWideningDecision(unsigned VF);
1006 
1007   /// A struct that represents some properties of the register usage
1008   /// of a loop.
1009   struct RegisterUsage {
1010     /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
1012     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1013     /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
1015     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1016   };
1017 
1018   /// \return Returns information about the register usages of the loop for the
1019   /// given vectorization factors.
1020   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1021 
1022   /// Collect values we want to ignore in the cost model.
1023   void collectValuesToIgnore();
1024 
1025   /// \returns The smallest bitwidth each instruction can be represented with.
1026   /// The vector equivalents of these instructions should be truncated to this
1027   /// type.
1028   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1029     return MinBWs;
1030   }
1031 
1032   /// \returns True if it is more profitable to scalarize instruction \p I for
1033   /// vectorization factor \p VF.
1034   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1035     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1036 
1037     // Cost model is not run in the VPlan-native path - return conservative
1038     // result until this changes.
1039     if (EnableVPlanNativePath)
1040       return false;
1041 
1042     auto Scalars = InstsToScalarize.find(VF);
1043     assert(Scalars != InstsToScalarize.end() &&
1044            "VF not yet analyzed for scalarization profitability");
1045     return Scalars->second.find(I) != Scalars->second.end();
1046   }
1047 
1048   /// Returns true if \p I is known to be uniform after vectorization.
1049   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1050     if (VF == 1)
1051       return true;
1052 
1053     // Cost model is not run in the VPlan-native path - return conservative
1054     // result until this changes.
1055     if (EnableVPlanNativePath)
1056       return false;
1057 
1058     auto UniformsPerVF = Uniforms.find(VF);
1059     assert(UniformsPerVF != Uniforms.end() &&
1060            "VF not yet analyzed for uniformity");
1061     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1062   }
1063 
1064   /// Returns true if \p I is known to be scalar after vectorization.
1065   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1066     if (VF == 1)
1067       return true;
1068 
1069     // Cost model is not run in the VPlan-native path - return conservative
1070     // result until this changes.
1071     if (EnableVPlanNativePath)
1072       return false;
1073 
1074     auto ScalarsPerVF = Scalars.find(VF);
1075     assert(ScalarsPerVF != Scalars.end() &&
1076            "Scalar values are not calculated for VF");
1077     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1078   }
1079 
1080   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1081   /// for vectorization factor \p VF.
1082   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1083     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1084            !isProfitableToScalarize(I, VF) &&
1085            !isScalarAfterVectorization(I, VF);
1086   }
1087 
1088   /// Decision that was taken during cost calculation for memory instruction.
1089   enum InstWidening {
1090     CM_Unknown,
1091     CM_Widen,         // For consecutive accesses with stride +1.
1092     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1093     CM_Interleave,
1094     CM_GatherScatter,
1095     CM_Scalarize
1096   };
1097 
1098   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1099   /// instruction \p I and vector width \p VF.
1100   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1101                            unsigned Cost) {
1102     assert(VF >= 2 && "Expected VF >=2");
1103     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1104   }
1105 
1106   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1107   /// interleaving group \p Grp and vector width \p VF.
1108   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1109                            InstWidening W, unsigned Cost) {
1110     assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1112     /// But the cost will be assigned to one instruction only.
1113     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1114       if (auto *I = Grp->getMember(i)) {
1115         if (Grp->getInsertPos() == I)
1116           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1117         else
1118           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1119       }
1120     }
1121   }
1122 
1123   /// Return the cost model decision for the given instruction \p I and vector
1124   /// width \p VF. Return CM_Unknown if this instruction did not pass
1125   /// through the cost modeling.
1126   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1127     assert(VF >= 2 && "Expected VF >=2");
1128 
1129     // Cost model is not run in the VPlan-native path - return conservative
1130     // result until this changes.
1131     if (EnableVPlanNativePath)
1132       return CM_GatherScatter;
1133 
1134     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1135     auto Itr = WideningDecisions.find(InstOnVF);
1136     if (Itr == WideningDecisions.end())
1137       return CM_Unknown;
1138     return Itr->second.first;
1139   }
1140 
1141   /// Return the vectorization cost for the given instruction \p I and vector
1142   /// width \p VF.
1143   unsigned getWideningCost(Instruction *I, unsigned VF) {
1144     assert(VF >= 2 && "Expected VF >=2");
1145     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1146     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1147            "The cost is not calculated");
1148     return WideningDecisions[InstOnVF].second;
1149   }
1150 
1151   /// Return True if instruction \p I is an optimizable truncate whose operand
1152   /// is an induction variable. Such a truncate will be removed by adding a new
1153   /// induction variable with the destination type.
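  ///
  /// Illustrative example: for a loop whose primary induction is an i64 %iv,
  /// a truncate such as
  ///   %t = trunc i64 %iv to i32
  /// can be replaced by a new i32 induction variable, removing the truncate
  /// from the vectorized body.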
1154   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1155     // If the instruction is not a truncate, return false.
1156     auto *Trunc = dyn_cast<TruncInst>(I);
1157     if (!Trunc)
1158       return false;
1159 
1160     // Get the source and destination types of the truncate.
1161     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1162     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1163 
1164     // If the truncate is free for the given types, return false. Replacing a
1165     // free truncate with an induction variable would add an induction variable
1166     // update instruction to each iteration of the loop. We exclude from this
1167     // check the primary induction variable since it will need an update
1168     // instruction regardless.
1169     Value *Op = Trunc->getOperand(0);
1170     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1171       return false;
1172 
1173     // If the truncated value is not an induction variable, return false.
1174     return Legal->isInductionPhi(Op);
1175   }
1176 
1177   /// Collects the instructions to scalarize for each predicated instruction in
1178   /// the loop.
1179   void collectInstsToScalarize(unsigned VF);
1180 
1181   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decisions for Load/Store instructions
1183   /// that may be vectorized as interleave, gather-scatter or scalarized.
1184   void collectUniformsAndScalars(unsigned VF) {
1185     // Do the analysis once.
1186     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1187       return;
1188     setCostBasedWideningDecision(VF);
1189     collectLoopUniforms(VF);
1190     collectLoopScalars(VF);
1191   }
1192 
1193   /// Returns true if the target machine supports masked store operation
1194   /// for the given \p DataType and kind of access to \p Ptr.
1195   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1196     return Legal->isConsecutivePtr(Ptr) &&
1197            TTI.isLegalMaskedStore(DataType, Alignment);
1198   }
1199 
1200   /// Returns true if the target machine supports masked load operation
1201   /// for the given \p DataType and kind of access to \p Ptr.
1202   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1203     return Legal->isConsecutivePtr(Ptr) &&
1204            TTI.isLegalMaskedLoad(DataType, Alignment);
1205   }
1206 
1207   /// Returns true if the target machine supports masked scatter operation
1208   /// for the given \p DataType.
1209   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1210     return TTI.isLegalMaskedScatter(DataType, Alignment);
1211   }
1212 
1213   /// Returns true if the target machine supports masked gather operation
1214   /// for the given \p DataType.
1215   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1216     return TTI.isLegalMaskedGather(DataType, Alignment);
1217   }
1218 
1219   /// Returns true if the target machine can represent \p V as a masked gather
1220   /// or scatter operation.
1221   bool isLegalGatherOrScatter(Value *V) {
1222     bool LI = isa<LoadInst>(V);
1223     bool SI = isa<StoreInst>(V);
1224     if (!LI && !SI)
1225       return false;
1226     auto *Ty = getMemInstValueType(V);
1227     MaybeAlign Align = getLoadStoreAlignment(V);
1228     return (LI && isLegalMaskedGather(Ty, Align)) ||
1229            (SI && isLegalMaskedScatter(Ty, Align));
1230   }
1231 
1232   /// Returns true if \p I is an instruction that will be scalarized with
1233   /// predication. Such instructions include conditional stores and
1234   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
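  /// For illustration (informal sketch): in
  ///   for (i = 0; i < n; ++i)
  ///     if (C[i])
  ///       A[i] = A[i] / B[i];
  /// the conditional store to A[i] and the division (which must not execute
  /// for masked-off lanes) are candidates for scalarization with predication.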
1237   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1238 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1242   bool isPredicatedInst(Instruction *I) {
1243     if (!blockNeedsPredication(I->getParent()))
1244       return false;
1245     // Loads and stores that need some form of masked operation are predicated
1246     // instructions.
1247     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1248       return Legal->isMaskRequired(I);
1249     return isScalarWithPredication(I);
1250   }
1251 
1252   /// Returns true if \p I is a memory instruction with consecutive memory
1253   /// access that can be widened.
1254   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1255 
1256   /// Returns true if \p I is a memory instruction in an interleaved-group
1257   /// of memory accesses that can be vectorized with wide vector loads/stores
1258   /// and shuffles.
1259   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1260 
1261   /// Check if \p Instr belongs to any interleaved access group.
1262   bool isAccessInterleaved(Instruction *Instr) {
1263     return InterleaveInfo.isInterleaved(Instr);
1264   }
1265 
1266   /// Get the interleaved access group that \p Instr belongs to.
1267   const InterleaveGroup<Instruction> *
1268   getInterleavedAccessGroup(Instruction *Instr) {
1269     return InterleaveInfo.getInterleaveGroup(Instr);
1270   }
1271 
1272   /// Returns true if an interleaved group requires a scalar iteration
1273   /// to handle accesses with gaps, and there is nothing preventing us from
1274   /// creating a scalar epilogue.
1275   bool requiresScalarEpilogue() const {
1276     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1277   }
1278 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1281   bool isScalarEpilogueAllowed() const {
1282     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1283   }
1284 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1286   bool foldTailByMasking() const { return FoldTailByMasking; }
1287 
1288   bool blockNeedsPredication(BasicBlock *BB) {
1289     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1290   }
1291 
1292   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1293   /// with factor VF.  Return the cost of the instruction, including
1294   /// scalarization overhead if it's needed.
1295   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1296 
1297   /// Estimate cost of a call instruction CI if it were vectorized with factor
1298   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1302   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1303 
1304 private:
1305   unsigned NumPredStores = 0;
1306 
1307   /// \return An upper bound for the vectorization factor, larger than zero.
1308   /// One is returned if vectorization should best be avoided due to cost.
1309   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1310 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1318   using VectorizationCostTy = std::pair<unsigned, bool>;
1319 
1320   /// Returns the expected execution cost. The unit of the cost does
1321   /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
1324   VectorizationCostTy expectedCost(unsigned VF);
1325 
1326   /// Returns the execution time cost of an instruction for a given vector
1327   /// width. Vector width of one means scalar.
1328   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1329 
1330   /// The cost-computation logic from getInstructionCost which provides
1331   /// the vector type as an output parameter.
1332   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1333 
1334   /// Calculate vectorization cost of memory instruction \p I.
1335   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1336 
1337   /// The cost computation for scalarized memory instruction.
1338   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1339 
1340   /// The cost computation for interleaving group of memory instructions.
1341   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1342 
1343   /// The cost computation for Gather/Scatter instruction.
1344   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1345 
1346   /// The cost computation for widening instruction \p I with consecutive
1347   /// memory access.
1348   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1349 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
1354   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1355 
1356   /// Estimate the overhead of scalarizing an instruction. This is a
1357   /// convenience wrapper for the type-based getScalarizationOverhead API.
1358   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1359 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1362   bool isConsecutiveLoadOrStore(Instruction *I);
1363 
1364   /// Returns true if an artificially high cost for emulated masked memrefs
1365   /// should be used.
1366   bool useEmulatedMaskMemRefHack(Instruction *I);
1367 
1368   /// Map of scalar integer values to the smallest bitwidth they can be legally
1369   /// represented as. The vector equivalents of these values should be truncated
1370   /// to this type.
1371   MapVector<Instruction *, uint64_t> MinBWs;
1372 
1373   /// A type representing the costs for instructions if they were to be
1374   /// scalarized rather than vectorized. The entries are Instruction-Cost
1375   /// pairs.
1376   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1377 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1380   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1381 
1382   /// Records whether it is allowed to have the original scalar loop execute at
1383   /// least once. This may be needed as a fallback loop in case runtime
1384   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1386   /// or as a peel-loop to handle gaps in interleave-groups.
1387   /// Under optsize and when the trip count is very small we don't allow any
1388   /// iterations to execute in the scalar loop.
1389   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1390 
1391   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1392   bool FoldTailByMasking = false;
1393 
1394   /// A map holding scalar costs for different vectorization factors. The
1395   /// presence of a cost for an instruction in the mapping indicates that the
1396   /// instruction will be scalarized when vectorizing with the associated
1397   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1398   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1399 
1400   /// Holds the instructions known to be uniform after vectorization.
1401   /// The data is collected per VF.
1402   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1403 
1404   /// Holds the instructions known to be scalar after vectorization.
1405   /// The data is collected per VF.
1406   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1407 
1408   /// Holds the instructions (address computations) that are forced to be
1409   /// scalarized.
1410   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1411 
1412   /// Returns the expected difference in cost from scalarizing the expression
1413   /// feeding a predicated instruction \p PredInst. The instructions to
1414   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1415   /// non-negative return value implies the expression will be scalarized.
1416   /// Currently, only single-use chains are considered for scalarization.
1417   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1418                               unsigned VF);
1419 
1420   /// Collect the instructions that are uniform after vectorization. An
1421   /// instruction is uniform if we represent it with a single scalar value in
1422   /// the vectorized loop corresponding to each vector iteration. Examples of
1423   /// uniform instructions include pointer operands of consecutive or
1424   /// interleaved memory accesses. Note that although uniformity implies an
1425   /// instruction will be scalar, the reverse is not true. In general, a
1426   /// scalarized instruction will be represented by VF scalar values in the
1427   /// vectorized loop, each corresponding to an iteration of the original
1428   /// scalar loop.
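  /// For illustration (informal sketch): in
  ///   for (i = 0; i < n; ++i)
  ///     sum += A[i];
  /// the GEP computing &A[i] feeds a consecutive load that is widened into a
  /// single vector load per vector iteration, so the GEP is uniform: only its
  /// lane-zero value is needed, rather than VF scalar copies.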
1429   void collectLoopUniforms(unsigned VF);
1430 
1431   /// Collect the instructions that are scalar after vectorization. An
1432   /// instruction is scalar if it is known to be uniform or will be scalarized
1433   /// during vectorization. Non-uniform scalarized instructions will be
1434   /// represented by VF values in the vectorized loop, each corresponding to an
1435   /// iteration of the original scalar loop.
1436   void collectLoopScalars(unsigned VF);
1437 
1438   /// Keeps cost model vectorization decision and cost for instructions.
1439   /// Right now it is used for memory instructions only.
1440   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1441                                 std::pair<InstWidening, unsigned>>;
1442 
1443   DecisionList WideningDecisions;
1444 
1445   /// Returns true if \p V is expected to be vectorized and it needs to be
1446   /// extracted.
1447   bool needsExtract(Value *V, unsigned VF) const {
1448     Instruction *I = dyn_cast<Instruction>(V);
1449     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1450       return false;
1451 
1452     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1454     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1455     // the scalars are collected. That should be a safe assumption in most
1456     // cases, because we check if the operands have vectorizable types
1457     // beforehand in LoopVectorizationLegality.
1458     return Scalars.find(VF) == Scalars.end() ||
1459            !isScalarAfterVectorization(I, VF);
  }
1461 
1462   /// Returns a range containing only operands needing to be extracted.
1463   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1464                                                    unsigned VF) {
1465     return SmallVector<Value *, 4>(make_filter_range(
1466         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1467   }
1468 
1469 public:
1470   /// The loop that we evaluate.
1471   Loop *TheLoop;
1472 
1473   /// Predicated scalar evolution analysis.
1474   PredicatedScalarEvolution &PSE;
1475 
1476   /// Loop Info analysis.
1477   LoopInfo *LI;
1478 
1479   /// Vectorization legality.
1480   LoopVectorizationLegality *Legal;
1481 
1482   /// Vector target information.
1483   const TargetTransformInfo &TTI;
1484 
1485   /// Target Library Info.
1486   const TargetLibraryInfo *TLI;
1487 
1488   /// Demanded bits analysis.
1489   DemandedBits *DB;
1490 
1491   /// Assumption cache.
1492   AssumptionCache *AC;
1493 
1494   /// Interface to emit optimization remarks.
1495   OptimizationRemarkEmitter *ORE;
1496 
  /// The function containing the loop we are vectorizing.
  const Function *TheFunction;
1498 
1499   /// Loop Vectorize Hint.
1500   const LoopVectorizeHints *Hints;
1501 
  /// The interleaved access information contains groups of interleaved
  /// accesses with the same stride that are close to each other.
1504   InterleavedAccessInfo &InterleaveInfo;
1505 
1506   /// Values to ignore in the cost model.
1507   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1508 
1509   /// Values to ignore in the cost model when VF > 1.
1510   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1511 };
1512 
1513 } // end namespace llvm
1514 
1515 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1516 // vectorization. The loop needs to be annotated with #pragma omp simd
1517 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1518 // vector length information is not provided, vectorization is not considered
1519 // explicit. Interleave hints are not allowed either. These limitations will be
1520 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1522 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1523 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1524 // provides *explicit vectorization hints* (LV can bypass legal checks and
1525 // assume that vectorization is legal). However, both hints are implemented
1526 // using the same metadata (llvm.loop.vectorize, processed by
1527 // LoopVectorizeHints). This will be fixed in the future when the native IR
1528 // representation for pragma 'omp simd' is introduced.
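// For illustration (informal sketch), an outer loop annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (i = 0; i < n; ++i)       // outer loop
//     for (j = 0; j < m; ++j)     // inner loop
//       A[i][j] += B[i][j];
// carries an explicit vector length and is treated as an explicit
// vectorization candidate by this check.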
1529 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1530                                    OptimizationRemarkEmitter *ORE) {
1531   assert(!OuterLp->empty() && "This is not an outer loop");
1532   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1533 
1534   // Only outer loops with an explicit vectorization hint are supported.
1535   // Unannotated outer loops are ignored.
1536   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1537     return false;
1538 
1539   Function *Fn = OuterLp->getHeader()->getParent();
1540   if (!Hints.allowVectorization(Fn, OuterLp,
1541                                 true /*VectorizeOnlyWhenForced*/)) {
1542     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1543     return false;
1544   }
1545 
1546   if (Hints.getInterleave() > 1) {
1547     // TODO: Interleave support is future work.
1548     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1549                          "outer loops.\n");
1550     Hints.emitRemarkWithHints();
1551     return false;
1552   }
1553 
1554   return true;
1555 }
1556 
1557 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1558                                   OptimizationRemarkEmitter *ORE,
1559                                   SmallVectorImpl<Loop *> &V) {
1560   // Collect inner loops and outer loops without irreducible control flow. For
1561   // now, only collect outer loops that have explicit vectorization hints. If we
1562   // are stress testing the VPlan H-CFG construction, we collect the outermost
1563   // loop of every loop nest.
1564   if (L.empty() || VPlanBuildStressTest ||
1565       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1566     LoopBlocksRPO RPOT(&L);
1567     RPOT.perform(LI);
1568     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1569       V.push_back(&L);
1570       // TODO: Collect inner loops inside marked outer loops in case
1571       // vectorization fails for the outer loop. Do not invoke
1572       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1573       // already known to be reducible. We can use an inherited attribute for
1574       // that.
1575       return;
1576     }
1577   }
1578   for (Loop *InnerL : L)
1579     collectSupportedLoops(*InnerL, LI, ORE, V);
1580 }
1581 
1582 namespace {
1583 
1584 /// The LoopVectorize Pass.
1585 struct LoopVectorize : public FunctionPass {
1586   /// Pass identification, replacement for typeid
1587   static char ID;
1588 
1589   LoopVectorizePass Impl;
1590 
1591   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1592                          bool VectorizeOnlyWhenForced = false)
1593       : FunctionPass(ID) {
1594     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1595     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1596     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1597   }
1598 
1599   bool runOnFunction(Function &F) override {
1600     if (skipFunction(F))
1601       return false;
1602 
1603     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1604     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1605     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1606     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1607     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1608     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1609     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1610     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1611     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1612     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1613     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1614     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1615     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1616 
1617     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1618         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1619 
1620     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1621                         GetLAA, *ORE, PSI);
1622   }
1623 
1624   void getAnalysisUsage(AnalysisUsage &AU) const override {
1625     AU.addRequired<AssumptionCacheTracker>();
1626     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1627     AU.addRequired<DominatorTreeWrapperPass>();
1628     AU.addRequired<LoopInfoWrapperPass>();
1629     AU.addRequired<ScalarEvolutionWrapperPass>();
1630     AU.addRequired<TargetTransformInfoWrapperPass>();
1631     AU.addRequired<AAResultsWrapperPass>();
1632     AU.addRequired<LoopAccessLegacyAnalysis>();
1633     AU.addRequired<DemandedBitsWrapperPass>();
1634     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1635     AU.addRequired<InjectTLIMappingsLegacy>();
1636 
1637     // We currently do not preserve loopinfo/dominator analyses with outer loop
1638     // vectorization. Until this is addressed, mark these analyses as preserved
1639     // only for non-VPlan-native path.
1640     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1641     if (!EnableVPlanNativePath) {
1642       AU.addPreserved<LoopInfoWrapperPass>();
1643       AU.addPreserved<DominatorTreeWrapperPass>();
1644     }
1645 
1646     AU.addPreserved<BasicAAWrapperPass>();
1647     AU.addPreserved<GlobalsAAWrapperPass>();
1648     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1649   }
1650 };
1651 
1652 } // end anonymous namespace
1653 
1654 //===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
1657 //===----------------------------------------------------------------------===//
1658 
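// For illustration (informal sketch), broadcasting a scalar %x with VF = 4
// typically materializes as:
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer
// placed in the vector preheader when it is safe to hoist the broadcast there.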
1659 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1663   Instruction *Instr = dyn_cast<Instruction>(V);
1664   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1665                      (!Instr ||
1666                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1667   // Place the code for broadcasting invariant variables in the new preheader.
1668   IRBuilder<>::InsertPointGuard Guard(Builder);
1669   if (SafeToHoist)
1670     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1671 
1672   // Broadcast the scalar into all locations in the vector.
1673   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1674 
1675   return Shuf;
1676 }
1677 
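// For illustration (informal sketch), with VF = 4, UF = 1 and an i64 induction
// starting at %start with step 1, this roughly produces:
//   vector.ph:
//     ; SteppedStart = splat(%start) + <0, 1, 2, 3>
//   vector.body:
//     %vec.ind      = phi <4 x i64> [ SteppedStart, %vector.ph ],
//                                   [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>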
1678 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1679     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1680   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1681          "Expected either an induction phi-node or a truncate of it!");
1682   Value *Start = II.getStartValue();
1683 
1684   // Construct the initial value of the vector IV in the vector loop preheader
1685   auto CurrIP = Builder.saveIP();
1686   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1687   if (isa<TruncInst>(EntryVal)) {
1688     assert(Start->getType()->isIntegerTy() &&
1689            "Truncation requires an integer type");
1690     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1691     Step = Builder.CreateTrunc(Step, TruncType);
1692     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1693   }
1694   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1695   Value *SteppedStart =
1696       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1697 
1698   // We create vector phi nodes for both integer and floating-point induction
1699   // variables. Here, we determine the kind of arithmetic we will perform.
1700   Instruction::BinaryOps AddOp;
1701   Instruction::BinaryOps MulOp;
1702   if (Step->getType()->isIntegerTy()) {
1703     AddOp = Instruction::Add;
1704     MulOp = Instruction::Mul;
1705   } else {
1706     AddOp = II.getInductionOpcode();
1707     MulOp = Instruction::FMul;
1708   }
1709 
1710   // Multiply the vectorization factor by the step using integer or
1711   // floating-point arithmetic as appropriate.
1712   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1713   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1714 
1715   // Create a vector splat to use in the induction update.
1716   //
1717   // FIXME: If the step is non-constant, we create the vector splat with
1718   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1719   //        handle a constant vector splat.
1720   Value *SplatVF = isa<Constant>(Mul)
1721                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1722                        : Builder.CreateVectorSplat(VF, Mul);
1723   Builder.restoreIP(CurrIP);
1724 
1725   // We may need to add the step a number of times, depending on the unroll
1726   // factor. The last of those goes into the PHI.
1727   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1728                                     &*LoopVectorBody->getFirstInsertionPt());
1729   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1730   Instruction *LastInduction = VecInd;
1731   for (unsigned Part = 0; Part < UF; ++Part) {
1732     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1733 
1734     if (isa<TruncInst>(EntryVal))
1735       addMetadata(LastInduction, EntryVal);
1736     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1737 
1738     LastInduction = cast<Instruction>(addFastMathFlag(
1739         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1740     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1741   }
1742 
1743   // Move the last step to the end of the latch block. This ensures consistent
1744   // placement of all induction updates.
1745   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1746   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1747   auto *ICmp = cast<Instruction>(Br->getCondition());
1748   LastInduction->moveBefore(ICmp);
1749   LastInduction->setName("vec.ind.next");
1750 
1751   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1752   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1753 }
1754 
1755 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1756   return Cost->isScalarAfterVectorization(I, VF) ||
1757          Cost->isProfitableToScalarize(I, VF);
1758 }
1759 
1760 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1761   if (shouldScalarizeInstruction(IV))
1762     return true;
1763   auto isScalarInst = [&](User *U) -> bool {
1764     auto *I = cast<Instruction>(U);
1765     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1766   };
1767   return llvm::any_of(IV->users(), isScalarInst);
1768 }
1769 
1770 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1771     const InductionDescriptor &ID, const Instruction *EntryVal,
1772     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1773   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1774          "Expected either an induction phi-node or a truncate of it!");
1775 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1782   if (isa<TruncInst>(EntryVal))
1783     return;
1784 
1785   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1786   if (Casts.empty())
1787     return;
1788   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1790   // induction update chain itself.
1791   Instruction *CastInst = *Casts.begin();
1792   if (Lane < UINT_MAX)
1793     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1794   else
1795     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1796 }
1797 
1798 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1799   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1800          "Primary induction variable must have an integer type");
1801 
1802   auto II = Legal->getInductionVars().find(IV);
1803   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1804 
1805   auto ID = II->second;
1806   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1807 
1808   // The scalar value to broadcast. This will be derived from the canonical
1809   // induction variable.
1810   Value *ScalarIV = nullptr;
1811 
1812   // The value from the original loop to which we are mapping the new induction
1813   // variable.
1814   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1815 
1816   // True if we have vectorized the induction variable.
1817   auto VectorizedIV = false;
1818 
1819   // Determine if we want a scalar version of the induction variable. This is
1820   // true if the induction variable itself is not widened, or if it has at
1821   // least one user in the loop that is not widened.
1822   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1823 
1824   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1826   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1827          "Induction step should be loop invariant");
1828   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1829   Value *Step = nullptr;
1830   if (PSE.getSE()->isSCEVable(IV->getType())) {
1831     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1832     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1833                              LoopVectorPreHeader->getTerminator());
1834   } else {
1835     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1836   }
1837 
1838   // Try to create a new independent vector induction variable. If we can't
1839   // create the phi node, we will splat the scalar induction variable in each
1840   // loop iteration.
1841   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1842     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1843     VectorizedIV = true;
1844   }
1845 
1846   // If we haven't yet vectorized the induction variable, or if we will create
1847   // a scalar one, we need to define the scalar induction variable and step
1848   // values. If we were given a truncation type, truncate the canonical
1849   // induction variable and step. Otherwise, derive these values from the
1850   // induction descriptor.
1851   if (!VectorizedIV || NeedsScalarIV) {
1852     ScalarIV = Induction;
1853     if (IV != OldInduction) {
1854       ScalarIV = IV->getType()->isIntegerTy()
1855                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1856                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1857                                           IV->getType());
1858       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1859       ScalarIV->setName("offset.idx");
1860     }
1861     if (Trunc) {
1862       auto *TruncType = cast<IntegerType>(Trunc->getType());
1863       assert(Step->getType()->isIntegerTy() &&
1864              "Truncation requires an integer step");
1865       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1866       Step = Builder.CreateTrunc(Step, TruncType);
1867     }
1868   }
1869 
1870   // If we haven't yet vectorized the induction variable, splat the scalar
1871   // induction variable, and build the necessary step vectors.
1872   // TODO: Don't do it unless the vectorized IV is really required.
1873   if (!VectorizedIV) {
1874     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1875     for (unsigned Part = 0; Part < UF; ++Part) {
1876       Value *EntryPart =
1877           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1878       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1879       if (Trunc)
1880         addMetadata(EntryPart, Trunc);
1881       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1882     }
1883   }
1884 
1885   // If an induction variable is only used for counting loop iterations or
1886   // calculating addresses, it doesn't need to be widened. Create scalar steps
1887   // that can be used by instructions we will later scalarize. Note that the
1888   // addition of the scalar steps will not increase the number of instructions
1889   // in the loop in the common case prior to InstCombine. We will be trading
1890   // one vector extract for each scalar step.
1891   if (NeedsScalarIV)
1892     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1893 }
1894 
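// For illustration (informal sketch), getStepVector(Val, StartIdx, Step) for a
// 4-element integer induction roughly computes:
//   %cv        = <StartIdx, StartIdx + 1, StartIdx + 2, StartIdx + 3>
//   %mul       = mul %cv, splat(Step)
//   %induction = add Val, %mul
// For floating-point inductions the same is done with fmul and fadd/fsub under
// fast-math flags.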
1895 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1896                                           Instruction::BinaryOps BinOp) {
1897   // Create and check the types.
1898   assert(Val->getType()->isVectorTy() && "Must be a vector");
1899   int VLen = Val->getType()->getVectorNumElements();
1900 
1901   Type *STy = Val->getType()->getScalarType();
1902   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1903          "Induction Step must be an integer or FP");
1904   assert(Step->getType() == STy && "Step has wrong type");
1905 
1906   SmallVector<Constant *, 8> Indices;
1907 
1908   if (STy->isIntegerTy()) {
    // Create a vector of consecutive indices starting at StartIdx.
1910     for (int i = 0; i < VLen; ++i)
1911       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1912 
1913     // Add the consecutive indices to the vector value.
1914     Constant *Cv = ConstantVector::get(Indices);
1915     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1916     Step = Builder.CreateVectorSplat(VLen, Step);
1917     assert(Step->getType() == Val->getType() && "Invalid step vec");
1918     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1919     // which can be found from the original scalar operations.
1920     Step = Builder.CreateMul(Cv, Step);
1921     return Builder.CreateAdd(Val, Step, "induction");
1922   }
1923 
1924   // Floating point induction.
1925   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1926          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive indices starting at StartIdx.
1928   for (int i = 0; i < VLen; ++i)
1929     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1930 
1931   // Add the consecutive indices to the vector value.
1932   Constant *Cv = ConstantVector::get(Indices);
1933 
1934   Step = Builder.CreateVectorSplat(VLen, Step);
1935 
1936   // Floating point operations had to be 'fast' to enable the induction.
1937   FastMathFlags Flags;
1938   Flags.setFast();
1939 
1940   Value *MulOp = Builder.CreateFMul(Cv, Step);
1941   if (isa<Instruction>(MulOp))
1942     // Have to check, MulOp may be a constant
1943     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1944 
1945   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1946   if (isa<Instruction>(BOp))
1947     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1948   return BOp;
1949 }
1950 
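// For illustration (informal sketch), with VF = 4, UF = 2 and a non-uniform
// EntryVal, this emits eight scalar steps:
//   ScalarIV + 0 * Step, ScalarIV + 1 * Step, ..., ScalarIV + 7 * Step
// one per (part, lane); a uniform EntryVal only needs lane 0 of each part.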
1951 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1952                                            Instruction *EntryVal,
1953                                            const InductionDescriptor &ID) {
1954   // We shouldn't have to build scalar steps if we aren't vectorizing.
1955   assert(VF > 1 && "VF should be greater than one");
1956 
  // Get the value type and ensure it and the step have the same type.
1958   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1959   assert(ScalarIVTy == Step->getType() &&
1960          "Val and Step should have the same type");
1961 
1962   // We build scalar steps for both integer and floating-point induction
1963   // variables. Here, we determine the kind of arithmetic we will perform.
1964   Instruction::BinaryOps AddOp;
1965   Instruction::BinaryOps MulOp;
1966   if (ScalarIVTy->isIntegerTy()) {
1967     AddOp = Instruction::Add;
1968     MulOp = Instruction::Mul;
1969   } else {
1970     AddOp = ID.getInductionOpcode();
1971     MulOp = Instruction::FMul;
1972   }
1973 
1974   // Determine the number of scalars we need to generate for each unroll
1975   // iteration. If EntryVal is uniform, we only need to generate the first
1976   // lane. Otherwise, we generate all VF values.
1977   unsigned Lanes =
1978       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1979                                                                          : VF;
1980   // Compute the scalar steps and save the results in VectorLoopValueMap.
1981   for (unsigned Part = 0; Part < UF; ++Part) {
1982     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1983       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1984       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1985       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1986       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1987       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1988     }
1989   }
1990 }
1991 
1992 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1993   assert(V != Induction && "The new induction variable should not be used.");
1994   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1995   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1996 
1997   // If we have a stride that is replaced by one, do it here. Defer this for
1998   // the VPlan-native path until we start running Legal checks in that path.
1999   if (!EnableVPlanNativePath && Legal->hasStride(V))
2000     V = ConstantInt::get(V->getType(), 1);
2001 
2002   // If we have a vector mapped to this value, return it.
2003   if (VectorLoopValueMap.hasVectorValue(V, Part))
2004     return VectorLoopValueMap.getVectorValue(V, Part);
2005 
2006   // If the value has not been vectorized, check if it has been scalarized
2007   // instead. If it has been scalarized, and we actually need the value in
2008   // vector form, we will construct the vector values on demand.
2009   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2010     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2011 
2012     // If we've scalarized a value, that value should be an instruction.
2013     auto *I = cast<Instruction>(V);
2014 
2015     // If we aren't vectorizing, we can just copy the scalar map values over to
2016     // the vector map.
2017     if (VF == 1) {
2018       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2019       return ScalarValue;
2020     }
2021 
2022     // Get the last scalar instruction we generated for V and Part. If the value
2023     // is known to be uniform after vectorization, this corresponds to lane zero
2024     // of the Part unroll iteration. Otherwise, the last instruction is the one
2025     // we created for the last vector lane of the Part unroll iteration.
2026     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2027     auto *LastInst = cast<Instruction>(
2028         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2029 
2030     // Set the insert point after the last scalarized instruction. This ensures
2031     // the insertelement sequence will directly follow the scalar definitions.
2032     auto OldIP = Builder.saveIP();
2033     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2034     Builder.SetInsertPoint(&*NewIP);
2035 
2036     // However, if we are vectorizing, we need to construct the vector values.
2037     // If the value is known to be uniform after vectorization, we can just
2038     // broadcast the scalar value corresponding to lane zero for each unroll
2039     // iteration. Otherwise, we construct the vector values using insertelement
2040     // instructions. Since the resulting vectors are stored in
2041     // VectorLoopValueMap, we will only generate the insertelements once.
2042     Value *VectorValue = nullptr;
2043     if (Cost->isUniformAfterVectorization(I, VF)) {
2044       VectorValue = getBroadcastInstrs(ScalarValue);
2045       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2046     } else {
2047       // Initialize packing with insertelements to start from undef.
2048       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2049       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2050       for (unsigned Lane = 0; Lane < VF; ++Lane)
2051         packScalarIntoVectorValue(V, {Part, Lane});
2052       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2053     }
2054     Builder.restoreIP(OldIP);
2055     return VectorValue;
2056   }
2057 
2058   // If this scalar is unknown, assume that it is a constant or that it is
2059   // loop invariant. Broadcast V and save the value for future uses.
2060   Value *B = getBroadcastInstrs(V);
2061   VectorLoopValueMap.setVectorValue(V, Part, B);
2062   return B;
2063 }
2064 
2065 Value *
2066 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2067                                             const VPIteration &Instance) {
2068   // If the value is not an instruction contained in the loop, it should
2069   // already be scalar.
2070   if (OrigLoop->isLoopInvariant(V))
2071     return V;
2072 
2073   assert(Instance.Lane > 0
2074              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2075              : true && "Uniform values only have lane zero");
2076 
2077   // If the value from the original loop has not been vectorized, it is
2078   // represented by UF x VF scalar values in the new loop. Return the requested
2079   // scalar value.
2080   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2081     return VectorLoopValueMap.getScalarValue(V, Instance);
2082 
2083   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2084   // for the given unroll part. If this entry is not a vector type (i.e., the
2085   // vectorization factor is one), there is no need to generate an
2086   // extractelement instruction.
2087   auto *U = getOrCreateVectorValue(V, Instance.Part);
2088   if (!U->getType()->isVectorTy()) {
2089     assert(VF == 1 && "Value not scalarized has non-vector type");
2090     return U;
2091   }
2092 
2093   // Otherwise, the value from the original loop has been vectorized and is
2094   // represented by UF vector values. Extract and return the requested scalar
2095   // value from the appropriate vector lane.
2096   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2097 }
2098 
2099 void InnerLoopVectorizer::packScalarIntoVectorValue(
2100     Value *V, const VPIteration &Instance) {
2101   assert(V != Induction && "The new induction variable should not be used.");
2102   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2103   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2104 
2105   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2106   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2107   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2108                                             Builder.getInt32(Instance.Lane));
2109   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2110 }
2111 
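// For illustration (informal sketch), with VF = 4 this emits:
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>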
2112 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2113   assert(Vec->getType()->isVectorTy() && "Invalid type");
2114   SmallVector<Constant *, 8> ShuffleMask;
2115   for (unsigned i = 0; i < VF; ++i)
2116     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2117 
2118   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2119                                      ConstantVector::get(ShuffleMask),
2120                                      "reverse");
2121 }
2122 
2123 // Return whether we allow using masked interleave-groups (for dealing with
2124 // strided loads/stores that reside in predicated blocks, or for dealing
2125 // with gaps).
2126 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2127   // If an override option has been passed in for interleaved accesses, use it.
2128   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2129     return EnableMaskedInterleavedMemAccesses;
2130 
2131   return TTI.enableMaskedInterleavedAccessVectorization();
2132 }
2133 
2134 // Try to vectorize the interleave group that \p Instr belongs to.
2135 //
2136 // E.g. Translate following interleaved load group (factor = 3):
2137 //   for (i = 0; i < N; i+=3) {
2138 //     R = Pic[i];             // Member of index 0
2139 //     G = Pic[i+1];           // Member of index 1
2140 //     B = Pic[i+2];           // Member of index 2
2141 //     ... // do something to R, G, B
2142 //   }
2143 // To:
2144 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2145 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2146 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2147 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2148 //
2149 // Or translate following interleaved store group (factor = 3):
2150 //   for (i = 0; i < N; i+=3) {
2151 //     ... do something to R, G, B
2152 //     Pic[i]   = R;           // Member of index 0
2153 //     Pic[i+1] = G;           // Member of index 1
2154 //     Pic[i+2] = B;           // Member of index 2
2155 //   }
2156 // To:
2157 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2158 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2159 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2160 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2161 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2162 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2163                                                    VPTransformState &State,
2164                                                    VPValue *Addr,
2165                                                    VPValue *BlockInMask) {
2166   const InterleaveGroup<Instruction> *Group =
2167       Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Failed to get an interleaved access group.");
2169 
2170   // Skip if current instruction is not the insert position.
2171   if (Instr != Group->getInsertPos())
2172     return;
2173 
2174   const DataLayout &DL = Instr->getModule()->getDataLayout();
2175 
2176   // Prepare for the vector type of the interleaved load/store.
2177   Type *ScalarTy = getMemInstValueType(Instr);
2178   unsigned InterleaveFactor = Group->getFactor();
2179   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2180 
2181   // Prepare for the new pointers.
2182   SmallVector<Value *, 2> AddrParts;
2183   unsigned Index = Group->getIndex(Instr);
2184 
2185   // TODO: extend the masked interleaved-group support to reversed access.
2186   assert((!BlockInMask || !Group->isReverse()) &&
2187          "Reversed masked interleave-group not supported.");
2188 
2189   // If the group is reverse, adjust the index to refer to the last vector lane
2190   // instead of the first. We adjust the index from the first vector lane,
2191   // rather than directly getting the pointer for lane VF - 1, because the
2192   // pointer operand of the interleaved access is supposed to be uniform. For
2193   // uniform instructions, we're only required to generate a value for the
2194   // first vector lane in each unroll iteration.
2195   if (Group->isReverse())
2196     Index += (VF - 1) * Group->getFactor();
2197 
2198   for (unsigned Part = 0; Part < UF; Part++) {
2199     Value *AddrPart = State.get(Addr, {Part, 0});
2200     setDebugLocFromInst(Builder, AddrPart);
2201 
    // Notice that the current instruction could be at any index. We need to
    // adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2213 
2214     bool InBounds = false;
2215     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2216       InBounds = gep->isInBounds();
2217     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2218     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2219 
2220     // Cast to the vector pointer type.
2221     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2222     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2223     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2224   }
2225 
2226   setDebugLocFromInst(Builder, Instr);
2227   Value *UndefVec = UndefValue::get(VecTy);
2228 
2229   Value *MaskForGaps = nullptr;
2230   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2231     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2232     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2233   }
2234 
2235   // Vectorize the interleaved load group.
2236   if (isa<LoadInst>(Instr)) {
2237     // For each unroll part, create a wide load for the group.
2238     SmallVector<Value *, 2> NewLoads;
2239     for (unsigned Part = 0; Part < UF; Part++) {
2240       Instruction *NewLoad;
2241       if (BlockInMask || MaskForGaps) {
2242         assert(useMaskedInterleavedAccesses(*TTI) &&
2243                "masked interleaved groups are not allowed.");
2244         Value *GroupMask = MaskForGaps;
2245         if (BlockInMask) {
2246           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2247           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2248           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2249           Value *ShuffledMask = Builder.CreateShuffleVector(
2250               BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2251           GroupMask = MaskForGaps
2252                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2253                                                 MaskForGaps)
2254                           : ShuffledMask;
2255         }
2256         NewLoad =
2257             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2258                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2261         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2262                                             Group->getAlign(), "wide.vec");
2263       Group->addMetadata(NewLoad);
2264       NewLoads.push_back(NewLoad);
2265     }
2266 
2267     // For each member in the group, shuffle out the appropriate data from the
2268     // wide loads.
2269     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2270       Instruction *Member = Group->getMember(I);
2271 
2272       // Skip the gaps in the group.
2273       if (!Member)
2274         continue;
2275 
2276       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2277       for (unsigned Part = 0; Part < UF; Part++) {
2278         Value *StridedVec = Builder.CreateShuffleVector(
2279             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2280 
        // If this member has a different type, cast the result to that type.
2282         if (Member->getType() != ScalarTy) {
2283           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2284           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2285         }
2286 
2287         if (Group->isReverse())
2288           StridedVec = reverseVector(StridedVec);
2289 
2290         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2291       }
2292     }
2293     return;
2294   }
2295 
  // The subvector type for the current instruction.
2297   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2298 
2299   // Vectorize the interleaved store group.
2300   for (unsigned Part = 0; Part < UF; Part++) {
2301     // Collect the stored vector from each member.
2302     SmallVector<Value *, 4> StoredVecs;
2303     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
2305       Instruction *Member = Group->getMember(i);
      assert(Member &&
             "Failed to get a member from an interleaved store group");
2307 
2308       Value *StoredVec = getOrCreateVectorValue(
2309           cast<StoreInst>(Member)->getValueOperand(), Part);
2310       if (Group->isReverse())
2311         StoredVec = reverseVector(StoredVec);
2312 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2316         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2317 
2318       StoredVecs.push_back(StoredVec);
2319     }
2320 
2321     // Concatenate all vectors into a wide vector.
2322     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2323 
2324     // Interleave the elements in the wide vector.
2325     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2326     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2327                                               "interleaved.vec");
2328 
2329     Instruction *NewStoreInstr;
2330     if (BlockInMask) {
2331       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2332       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2333       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2334       Value *ShuffledMask = Builder.CreateShuffleVector(
2335           BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2336       NewStoreInstr = Builder.CreateMaskedStore(
2337           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2340       NewStoreInstr =
2341           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2342 
2343     Group->addMetadata(NewStoreInstr);
2344   }
2345 }
2346 
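// For illustration (informal sketch), a consecutive load of A[i] with VF = 4
// becomes one wide load per unroll part, roughly:
//   %vec.ptr   = bitcast i32* %gep to <4 x i32>*
//   %wide.load = load <4 x i32>, <4 x i32>* %vec.ptr
// (or a masked load when the block needs predication). A reverse-consecutive
// access starts the wide access at the last element and reverses the loaded
// vector, and a gather/scatter decision uses a vector of pointers instead.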
2347 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2348                                                      VPTransformState &State,
2349                                                      VPValue *Addr,
2350                                                      VPValue *BlockInMask) {
2351   // Attempt to issue a wide load.
2352   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2353   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2354 
2355   assert((LI || SI) && "Invalid Load/Store instruction");
2356 
2357   LoopVectorizationCostModel::InstWidening Decision =
2358       Cost->getWideningDecision(Instr, VF);
2359   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2360          "CM decision should be taken at this point");
2361   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2362     return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2363 
2364   Type *ScalarDataTy = getMemInstValueType(Instr);
2365   Type *DataTy = VectorType::get(ScalarDataTy, VF);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2368   const DataLayout &DL = Instr->getModule()->getDataLayout();
2369   const Align Alignment =
2370       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2371 
2372   // Determine if the pointer operand of the access is either consecutive or
2373   // reverse consecutive.
2374   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2375   bool ConsecutiveStride =
2376       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2377   bool CreateGatherScatter =
2378       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2379 
2380   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2381   // gather/scatter. Otherwise Decision should have been to Scalarize.
2382   assert((ConsecutiveStride || CreateGatherScatter) &&
2383          "The instruction should be scalarized");
2384   (void)ConsecutiveStride;
2385 
2386   VectorParts BlockInMaskParts(UF);
2387   bool isMaskRequired = BlockInMask;
2388   if (isMaskRequired)
2389     for (unsigned Part = 0; Part < UF; ++Part)
2390       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2391 
2392   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2393     // Calculate the pointer for the specific unroll-part.
2394     GetElementPtrInst *PartPtr = nullptr;
2395 
2396     bool InBounds = false;
2397     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2398       InBounds = gep->isInBounds();
2399 
2400     if (Reverse) {
2401       // If the address is consecutive but reversed, then the
2402       // wide store needs to start at the last vector element.
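      // The two GEPs below offset Ptr by -Part * VF and then by 1 - VF, so
      // e.g. with VF = 4 and Part = 1 the wide access covers elements
      // Ptr[-7 .. -4]; the loaded or stored value itself is reversed
      // separately via reverseVector.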
2403       PartPtr = cast<GetElementPtrInst>(
2404           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2405       PartPtr->setIsInBounds(InBounds);
2406       PartPtr = cast<GetElementPtrInst>(
2407           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2408       PartPtr->setIsInBounds(InBounds);
2409       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2410         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2411     } else {
2412       PartPtr = cast<GetElementPtrInst>(
2413           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2414       PartPtr->setIsInBounds(InBounds);
2415     }
2416 
2417     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2418     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2419   };
2420 
2421   // Handle Stores:
2422   if (SI) {
2423     setDebugLocFromInst(Builder, SI);
2424 
2425     for (unsigned Part = 0; Part < UF; ++Part) {
2426       Instruction *NewSI = nullptr;
2427       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2428       if (CreateGatherScatter) {
2429         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2430         Value *VectorGep = State.get(Addr, Part);
2431         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2432                                             MaskPart);
2433       } else {
2434         if (Reverse) {
2435           // If we store to reverse consecutive memory locations, then we need
2436           // to reverse the order of elements in the stored value.
2437           StoredVal = reverseVector(StoredVal);
2438           // We don't want to update the value in the map as it might be used in
2439           // another expression. So don't call resetVectorValue(StoredVal).
2440         }
2441         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2442         if (isMaskRequired)
2443           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2444                                             BlockInMaskParts[Part]);
2445         else
2446           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2447       }
2448       addMetadata(NewSI, SI);
2449     }
2450     return;
2451   }
2452 
2453   // Handle loads.
2454   assert(LI && "Must have a load instruction");
2455   setDebugLocFromInst(Builder, LI);
2456   for (unsigned Part = 0; Part < UF; ++Part) {
2457     Value *NewLI;
2458     if (CreateGatherScatter) {
2459       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2460       Value *VectorGep = State.get(Addr, Part);
2461       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2462                                          nullptr, "wide.masked.gather");
2463       addMetadata(NewLI, LI);
2464     } else {
2465       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2466       if (isMaskRequired)
2467         NewLI = Builder.CreateMaskedLoad(
2468             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2469             "wide.masked.load");
2470       else
2471         NewLI =
2472             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2473 
2474       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2475       addMetadata(NewLI, LI);
2476       if (Reverse)
2477         NewLI = reverseVector(NewLI);
2478     }
2479     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2480   }
2481 }
2482 
2483 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2484                                                const VPIteration &Instance,
2485                                                bool IfPredicateInstr) {
2486   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2487 
2488   setDebugLocFromInst(Builder, Instr);
2489 
  // Does this instruction return a value?
2491   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2492 
2493   Instruction *Cloned = Instr->clone();
2494   if (!IsVoidRetTy)
2495     Cloned->setName(Instr->getName() + ".cloned");
2496 
2497   // Replace the operands of the cloned instructions with their scalar
2498   // equivalents in the new loop.
2499   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2500     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2501     Cloned->setOperand(op, NewOp);
2502   }
2503   addNewMetadata(Cloned, Instr);
2504 
2505   // Place the cloned scalar in the new loop.
2506   Builder.Insert(Cloned);
2507 
2508   // Add the cloned scalar to the scalar map entry.
2509   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2510 
  // If we just cloned a new assumption, add it to the assumption cache.
2512   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2513     if (II->getIntrinsicID() == Intrinsic::assume)
2514       AC->registerAssumption(II);
2515 
2516   // End if-block.
2517   if (IfPredicateInstr)
2518     PredicatedInstructions.push_back(Cloned);
2519 }
2520 
2521 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2522                                                       Value *End, Value *Step,
2523                                                       Instruction *DL) {
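  // This builds the canonical induction for the new loop; roughly:
  //   index      = phi [ Start, preheader ], [ index.next, latch ]
  //   index.next = add index, Step
  //   br (index.next == End), exit-block, header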
2524   BasicBlock *Header = L->getHeader();
2525   BasicBlock *Latch = L->getLoopLatch();
2526   // As we're just creating this loop, it's possible no latch exists
2527   // yet. If so, use the header as this will be a single block loop.
2528   if (!Latch)
2529     Latch = Header;
2530 
2531   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2532   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2533   setDebugLocFromInst(Builder, OldInst);
2534   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2535 
2536   Builder.SetInsertPoint(Latch->getTerminator());
2537   setDebugLocFromInst(Builder, OldInst);
2538 
2539   // Create i+1 and fill the PHINode.
2540   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2541   Induction->addIncoming(Start, L->getLoopPreheader());
2542   Induction->addIncoming(Next, Latch);
2543   // Create the compare.
2544   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2545   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2546 
2547   // Now we have two terminators. Remove the old one from the block.
2548   Latch->getTerminator()->eraseFromParent();
2549 
2550   return Induction;
2551 }
2552 
2553 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2554   if (TripCount)
2555     return TripCount;
2556 
2557   assert(L && "Create Trip Count for null loop.");
2558   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2559   // Find the loop boundaries.
2560   ScalarEvolution *SE = PSE.getSE();
2561   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2562   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2563          "Invalid loop count");
2564 
2565   Type *IdxTy = Legal->getWidestInductionType();
2566   assert(IdxTy && "No type for induction");
2567 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign-extended before the compare.
  // The only way we get a backedge-taken count in that case is if the
  // induction variable was signed and as such will not overflow, so the
  // truncation is legal.
2573   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2574       IdxTy->getPrimitiveSizeInBits())
2575     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2576   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2577 
2578   // Get the total trip count from the count by adding 1.
2579   const SCEV *ExitCount = SE->getAddExpr(
2580       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2581 
2582   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2583 
2584   // Expand the trip count and place the new instructions in the preheader.
2585   // Notice that the pre-header does not change, only the loop body.
2586   SCEVExpander Exp(*SE, DL, "induction");
2587 
2588   // Count holds the overall loop count (N).
2589   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2590                                 L->getLoopPreheader()->getTerminator());
2591 
2592   if (TripCount->getType()->isPointerTy())
2593     TripCount =
2594         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2595                                     L->getLoopPreheader()->getTerminator());
2596 
2597   return TripCount;
2598 }
2599 
2600 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2601   if (VectorTripCount)
2602     return VectorTripCount;
2603 
2604   Value *TC = getOrCreateTripCount(L);
2605   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2606 
2607   Type *Ty = TC->getType();
2608   Constant *Step = ConstantInt::get(Ty, VF * UF);
2609 
2610   // If the tail is to be folded by masking, round the number of iterations N
2611   // up to a multiple of Step instead of rounding down. This is done by first
2612   // adding Step-1 and then rounding down. Note that it's ok if this addition
2613   // overflows: the vector induction variable will eventually wrap to zero given
2614   // that it starts at zero and its Step is a power of two; the loop will then
2615   // exit, with the last early-exit vector comparison also producing all-true.
2616   if (Cost->foldTailByMasking()) {
2617     assert(isPowerOf2_32(VF * UF) &&
2618            "VF*UF must be a power of 2 when folding tail by masking");
2619     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2620   }
2621 
2622   // Now we need to generate the expression for the part of the loop that the
2623   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2624   // iterations are not required for correctness, or N - Step, otherwise. Step
2625   // is equal to the vectorization factor (number of SIMD elements) times the
2626   // unroll factor (number of SIMD instructions).
2627   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
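  // E.g., with a trip count of 10 and Step = 4: without tail folding,
  // n.mod.vf = 2 and the vector trip count computed below is 8; with tail
  // folding, the count was first rounded up to 13, so n.mod.vf = 1 and the
  // vector trip count is 12, covering all iterations under a mask.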
2628 
2629   // If there is a non-reversed interleaved group that may speculatively access
2630   // memory out-of-bounds, we need to ensure that there will be at least one
2631   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2632   // the trip count, we set the remainder to be equal to the step. If the step
2633   // does not evenly divide the trip count, no adjustment is necessary since
2634   // there will already be scalar iterations. Note that the minimum iterations
2635   // check ensures that N >= Step.
2636   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2637     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2638     R = Builder.CreateSelect(IsZero, Step, R);
2639   }
2640 
2641   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2642 
2643   return VectorTripCount;
2644 }
2645 
2646 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2647                                                    const DataLayout &DL) {
2648   // Verify that V is a vector type with same number of elements as DstVTy.
2649   unsigned VF = DstVTy->getNumElements();
2650   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2652   Type *SrcElemTy = SrcVecTy->getElementType();
2653   Type *DstElemTy = DstVTy->getElementType();
2654   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2655          "Vector elements must have same size");
2656 
2657   // Do a direct cast if element types are castable.
2658   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2659     return Builder.CreateBitOrPointerCast(V, DstVTy);
2660   }
  // V cannot be cast directly to the desired vector type. This may happen when
  // V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle it with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
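  // E.g., <2 x double> is cast to <2 x i8*> (with 64-bit pointers) via the
  // intermediate type <2 x i64>.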
2665   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2666          "Only one type should be a pointer type");
2667   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2668          "Only one type should be a floating point type");
2669   Type *IntTy =
2670       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2671   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2672   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2673   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2674 }
2675 
2676 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2677                                                          BasicBlock *Bypass) {
2678   Value *Count = getOrCreateTripCount(L);
2679   // Reuse existing vector loop preheader for TC checks.
2680   // Note that new preheader block is generated for vector loop.
2681   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2682   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2683 
2684   // Generate code to check if the loop's trip count is less than VF * UF, or
2685   // equal to it in case a scalar epilogue is required; this implies that the
2686   // vector trip count is zero. This check also covers the case where adding one
2687   // to the backedge-taken count overflowed leading to an incorrect trip count
2688   // of zero. In this case we will also jump to the scalar loop.
2689   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2690                                           : ICmpInst::ICMP_ULT;
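  // E.g., with VF * UF = 4 and no required scalar epilogue, trip counts 0..3
  // take the bypass to the scalar loop; with a required scalar epilogue, a
  // trip count of exactly 4 takes the bypass as well.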
2691 
2692   // If tail is to be folded, vector loop takes care of all iterations.
2693   Value *CheckMinIters = Builder.getFalse();
2694   if (!Cost->foldTailByMasking())
2695     CheckMinIters = Builder.CreateICmp(
2696         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2697         "min.iters.check");
2698 
2699   // Create new preheader for vector loop.
2700   LoopVectorPreHeader =
2701       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2702                  "vector.ph");
2703 
2704   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2705                                DT->getNode(Bypass)->getIDom()) &&
2706          "TC check is expected to dominate Bypass");
2707 
2708   // Update dominator for Bypass & LoopExit.
2709   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2710   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2711 
2712   ReplaceInstWithInst(
2713       TCCheckBlock->getTerminator(),
2714       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2715   LoopBypassBlocks.push_back(TCCheckBlock);
2716 }
2717 
2718 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2719   // Reuse existing vector loop preheader for SCEV checks.
2720   // Note that new preheader block is generated for vector loop.
2721   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2722 
  // Generate the code to check the SCEV assumptions that we made.
2724   // We want the new basic block to start at the first instruction in a
2725   // sequence of instructions that form a check.
2726   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2727                    "scev.check");
2728   Value *SCEVCheck = Exp.expandCodeForPredicate(
2729       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2730 
2731   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2732     if (C->isZero())
2733       return;
2734 
2735   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2736          "Cannot SCEV check stride or overflow when optimizing for size");
2737 
2738   SCEVCheckBlock->setName("vector.scevcheck");
2739   // Create new preheader for vector loop.
2740   LoopVectorPreHeader =
2741       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2742                  nullptr, "vector.ph");
2743 
2744   // Update dominator only if this is first RT check.
2745   if (LoopBypassBlocks.empty()) {
2746     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2747     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2748   }
2749 
2750   ReplaceInstWithInst(
2751       SCEVCheckBlock->getTerminator(),
2752       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2753   LoopBypassBlocks.push_back(SCEVCheckBlock);
2754   AddedSafetyChecks = true;
2755 }
2756 
2757 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2758   // VPlan-native path does not do any analysis for runtime checks currently.
2759   if (EnableVPlanNativePath)
2760     return;
2761 
2762   // Reuse existing vector loop preheader for runtime memory checks.
2763   // Note that new preheader block is generated for vector loop.
2764   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2765 
2766   // Generate the code that checks in runtime if arrays overlap. We put the
2767   // checks into a separate block to make the more common case of few elements
2768   // faster.
2769   Instruction *FirstCheckInst;
2770   Instruction *MemRuntimeCheck;
2771   std::tie(FirstCheckInst, MemRuntimeCheck) =
2772       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2773   if (!MemRuntimeCheck)
2774     return;
2775 
2776   if (MemCheckBlock->getParent()->hasOptSize()) {
2777     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2778            "Cannot emit memory checks when optimizing for size, unless forced "
2779            "to vectorize.");
2780     ORE->emit([&]() {
2781       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2782                                         L->getStartLoc(), L->getHeader())
2783              << "Code-size may be reduced by not forcing "
2784                 "vectorization, or by source-code modifications "
2785                 "eliminating the need for runtime checks "
2786                 "(e.g., adding 'restrict').";
2787     });
2788   }
2789 
2790   MemCheckBlock->setName("vector.memcheck");
2791   // Create new preheader for vector loop.
2792   LoopVectorPreHeader =
2793       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2794                  "vector.ph");
2795 
2796   // Update dominator only if this is first RT check.
2797   if (LoopBypassBlocks.empty()) {
2798     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2799     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2800   }
2801 
2802   ReplaceInstWithInst(
2803       MemCheckBlock->getTerminator(),
2804       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2805   LoopBypassBlocks.push_back(MemCheckBlock);
2806   AddedSafetyChecks = true;
2807 
2808   // We currently don't use LoopVersioning for the actual loop cloning but we
2809   // still use it to add the noalias metadata.
2810   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2811                                           PSE.getSE());
2812   LVer->prepareNoAliasMetadata();
2813 }
2814 
2815 Value *InnerLoopVectorizer::emitTransformedIndex(
2816     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2817     const InductionDescriptor &ID) const {
2818 
2819   SCEVExpander Exp(*SE, DL, "induction");
2820   auto Step = ID.getStep();
2821   auto StartValue = ID.getStartValue();
2822   assert(Index->getType() == Step->getType() &&
2823          "Index type does not match StepValue type");
2824 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some trivial
  // cases.
2831   auto CreateAdd = [&B](Value *X, Value *Y) {
2832     assert(X->getType() == Y->getType() && "Types don't match!");
2833     if (auto *CX = dyn_cast<ConstantInt>(X))
2834       if (CX->isZero())
2835         return Y;
2836     if (auto *CY = dyn_cast<ConstantInt>(Y))
2837       if (CY->isZero())
2838         return X;
2839     return B.CreateAdd(X, Y);
2840   };
2841 
2842   auto CreateMul = [&B](Value *X, Value *Y) {
2843     assert(X->getType() == Y->getType() && "Types don't match!");
2844     if (auto *CX = dyn_cast<ConstantInt>(X))
2845       if (CX->isOne())
2846         return Y;
2847     if (auto *CY = dyn_cast<ConstantInt>(Y))
2848       if (CY->isOne())
2849         return X;
2850     return B.CreateMul(X, Y);
2851   };
2852 
2853   switch (ID.getKind()) {
2854   case InductionDescriptor::IK_IntInduction: {
2855     assert(Index->getType() == StartValue->getType() &&
2856            "Index type does not match StartValue type");
2857     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2858       return B.CreateSub(StartValue, Index);
2859     auto *Offset = CreateMul(
2860         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2861     return CreateAdd(StartValue, Offset);
2862   }
2863   case InductionDescriptor::IK_PtrInduction: {
2864     assert(isa<SCEVConstant>(Step) &&
2865            "Expected constant step for pointer induction");
2866     return B.CreateGEP(
2867         StartValue->getType()->getPointerElementType(), StartValue,
2868         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2869                                            &*B.GetInsertPoint())));
2870   }
2871   case InductionDescriptor::IK_FpInduction: {
2872     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2873     auto InductionBinOp = ID.getInductionBinOp();
2874     assert(InductionBinOp &&
2875            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2876             InductionBinOp->getOpcode() == Instruction::FSub) &&
2877            "Original bin op should be defined for FP induction");
2878 
2879     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2880 
2881     // Floating point operations had to be 'fast' to enable the induction.
2882     FastMathFlags Flags;
2883     Flags.setFast();
2884 
2885     Value *MulExp = B.CreateFMul(StepValue, Index);
2886     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2888       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2889 
2890     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2891                                "induction");
2892     if (isa<Instruction>(BOp))
2893       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2894 
2895     return BOp;
2896   }
2897   case InductionDescriptor::IK_NoInduction:
2898     return nullptr;
2899   }
2900   llvm_unreachable("invalid enum");
2901 }
2902 
2903 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2904   /*
2905    In this function we generate a new loop. The new loop will contain
2906    the vectorized instructions while the old loop will continue to run the
2907    scalar remainder.
2908 
2909        [ ] <-- loop iteration number check.
2910     /   |
2911    /    v
2912   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2913   |  /  |
2914   | /   v
2915   ||   [ ]     <-- vector pre header.
2916   |/    |
2917   |     v
2918   |    [  ] \
2919   |    [  ]_|   <-- vector loop.
2920   |     |
2921   |     v
2922   |   -[ ]   <--- middle-block.
2923   |  /  |
2924   | /   v
2925   -|- >[ ]     <--- new preheader.
2926    |    |
2927    |    v
2928    |   [ ] \
2929    |   [ ]_|   <-- old scalar loop to handle remainder.
2930     \   |
2931      \  v
2932       >[ ]     <-- exit block.
2933    ...
2934    */
2935 
2936   MDNode *OrigLoopID = OrigLoop->getLoopID();
2937 
2938   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
2940   // induction variables. In the code below we also support a case where we
2941   // don't have a single induction variable.
2942   //
2943   // We try to obtain an induction variable from the original loop as hard
2944   // as possible. However if we don't find one that:
2945   //   - is an integer
2946   //   - counts from zero, stepping by one
2947   //   - is the size of the widest induction variable type
2948   // then we create a new one.
2949   OldInduction = Legal->getPrimaryInduction();
2950   Type *IdxTy = Legal->getWidestInductionType();
2951 
2952   // Split the single block loop into the two loop structure described above.
2953   LoopScalarBody = OrigLoop->getHeader();
2954   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2955   LoopExitBlock = OrigLoop->getExitBlock();
2956   assert(LoopExitBlock && "Must have an exit block");
2957   assert(LoopVectorPreHeader && "Invalid loop structure");
2958 
2959   LoopMiddleBlock =
2960       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2961                  LI, nullptr, "middle.block");
2962   LoopScalarPreHeader =
2963       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2964                  nullptr, "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo, since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
2968   LoopVectorBody =
2969       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2970                  nullptr, nullptr, "vector.body");
2971 
2972   // Update dominator for loop exit.
2973   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2974 
2975   // Create and register the new vector loop.
2976   Loop *Lp = LI->AllocateLoop();
2977   Loop *ParentLoop = OrigLoop->getParentLoop();
2978 
2979   // Insert the new loop into the loop nest and register the new basic blocks
2980   // before calling any utilities such as SCEV that require valid LoopInfo.
2981   if (ParentLoop) {
2982     ParentLoop->addChildLoop(Lp);
2983   } else {
2984     LI->addTopLevelLoop(Lp);
2985   }
2986   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
2987 
2988   // Find the loop boundaries.
2989   Value *Count = getOrCreateTripCount(Lp);
2990 
2991   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2992 
2993   // Now, compare the new count to zero. If it is zero skip the vector loop and
2994   // jump to the scalar loop. This check also covers the case where the
2995   // backedge-taken count is uint##_max: adding one to it will overflow leading
2996   // to an incorrect trip count of zero. In this (rare) case we will also jump
2997   // to the scalar loop.
2998   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
2999 
3000   // Generate the code to check any assumptions that we've made for SCEV
3001   // expressions.
3002   emitSCEVChecks(Lp, LoopScalarPreHeader);
3003 
3004   // Generate the code that checks in runtime if arrays overlap. We put the
3005   // checks into a separate block to make the more common case of few elements
3006   // faster.
3007   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3008 
3009   // Generate the induction variable.
3010   // The loop step is equal to the vectorization factor (num of SIMD elements)
3011   // times the unroll factor (num of SIMD instructions).
3012   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3013   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3014   Induction =
3015       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3016                               getDebugLocFromInstOrOperands(OldInduction));
3017 
3018   // We are going to resume the execution of the scalar loop.
3019   // Go over all of the induction variables that we found and fix the
3020   // PHIs that are left in the scalar version of the loop.
3021   // The starting values of PHI nodes depend on the counter of the last
3022   // iteration in the vectorized loop.
3023   // If we come from a bypass edge then we need to start from the original
3024   // start value.
3025 
  // The resume values created below save the new starting index for the scalar
  // loop, so that the scalar loop resumes from where the vector loop left off
  // once it has completed.
3029   for (auto &InductionEntry : Legal->getInductionVars()) {
3030     PHINode *OrigPhi = InductionEntry.first;
3031     InductionDescriptor II = InductionEntry.second;
3032 
    // Create phi nodes to merge from the backedge-taken check block.
3034     PHINode *BCResumeVal =
3035         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3036                         LoopScalarPreHeader->getTerminator());
3037     // Copy original phi DL over to the new one.
3038     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3039     Value *&EndValue = IVEndValues[OrigPhi];
3040     if (OrigPhi == OldInduction) {
3041       // We know what the end value is.
3042       EndValue = CountRoundDown;
3043     } else {
3044       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3045       Type *StepType = II.getStep()->getType();
3046       Instruction::CastOps CastOp =
3047           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3048       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3049       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3050       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3051       EndValue->setName("ind.end");
3052     }
3053 
3054     // The new PHI merges the original incoming value, in case of a bypass,
3055     // or the value at the end of the vectorized loop.
3056     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3057 
3058     // Fix the scalar body counter (PHI node).
3059     // The old induction's phi node in the scalar body needs the truncated
3060     // value.
3061     for (BasicBlock *BB : LoopBypassBlocks)
3062       BCResumeVal->addIncoming(II.getStartValue(), BB);
3063     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3064   }
3065 
3066   // We need the OrigLoop (scalar loop part) latch terminator to help
3067   // produce correct debug info for the middle block BB instructions.
3068   // The legality check stage guarantees that the loop will have a single
3069   // latch.
3070   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3071          "Scalar loop latch terminator isn't a branch");
3072   BranchInst *ScalarLatchBr =
3073       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3074 
3075   // Add a check in the middle block to see if we have completed
3076   // all of the iterations in the first vector loop.
3077   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3078   // If tail is to be folded, we know we don't need to run the remainder.
3079   Value *CmpN = Builder.getTrue();
3080   if (!Cost->foldTailByMasking()) {
3081     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3082                            CountRoundDown, "cmp.n",
3083                            LoopMiddleBlock->getTerminator());
3084 
3085     // Here we use the same DebugLoc as the scalar loop latch branch instead
3086     // of the corresponding compare because they may have ended up with
3087     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3089     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3090   }
3091 
3092   BranchInst *BrInst =
3093       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3094   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3095   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3096 
3097   // Get ready to start creating new instructions into the vectorized body.
3098   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3099          "Inconsistent vector loop preheader");
3100   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3101 
3102   Optional<MDNode *> VectorizedLoopID =
3103       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3104                                       LLVMLoopVectorizeFollowupVectorized});
3105   if (VectorizedLoopID.hasValue()) {
3106     Lp->setLoopID(VectorizedLoopID.getValue());
3107 
3108     // Do not setAlreadyVectorized if loop attributes have been defined
3109     // explicitly.
3110     return LoopVectorPreHeader;
3111   }
3112 
3113   // Keep all loop hints from the original loop on the vector loop (we'll
3114   // replace the vectorizer-specific hints below).
3115   if (MDNode *LID = OrigLoop->getLoopID())
3116     Lp->setLoopID(LID);
3117 
3118   LoopVectorizeHints Hints(Lp, true, *ORE);
3119   Hints.setAlreadyVectorized();
3120 
3121 #ifdef EXPENSIVE_CHECKS
3122   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3123   LI->verify(*DT);
3124 #endif
3125 
3126   return LoopVectorPreHeader;
3127 }
3128 
3129 // Fix up external users of the induction variable. At this point, we are
3130 // in LCSSA form, with all external PHIs that use the IV having one input value,
3131 // coming from the remainder loop. We need those PHIs to also have a correct
3132 // value for the IV when arriving directly from the middle block.
3133 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3134                                        const InductionDescriptor &II,
3135                                        Value *CountRoundDown, Value *EndValue,
3136                                        BasicBlock *MiddleBlock) {
3137   // There are two kinds of external IV usages - those that use the value
3138   // computed in the last iteration (the PHI) and those that use the penultimate
3139   // value (the value that feeds into the phi from the loop latch).
3140   // We allow both, but they, obviously, have different values.
3141 
3142   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3143 
3144   DenseMap<Value *, Value *> MissingVals;
3145 
3146   // An external user of the last iteration's value should see the value that
3147   // the remainder loop uses to initialize its own IV.
3148   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3149   for (User *U : PostInc->users()) {
3150     Instruction *UI = cast<Instruction>(U);
3151     if (!OrigLoop->contains(UI)) {
3152       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3153       MissingVals[UI] = EndValue;
3154     }
3155   }
3156 
  // An external user of the penultimate value needs to see EndValue - Step.
3158   // The simplest way to get this is to recompute it from the constituent SCEVs,
3159   // that is Start + (Step * (CRD - 1)).
3160   for (User *U : OrigPhi->users()) {
3161     auto *UI = cast<Instruction>(U);
3162     if (!OrigLoop->contains(UI)) {
3163       const DataLayout &DL =
3164           OrigLoop->getHeader()->getModule()->getDataLayout();
3165       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3166 
3167       IRBuilder<> B(MiddleBlock->getTerminator());
3168       Value *CountMinusOne = B.CreateSub(
3169           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3170       Value *CMO =
3171           !II.getStep()->getType()->isIntegerTy()
3172               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3173                              II.getStep()->getType())
3174               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3175       CMO->setName("cast.cmo");
3176       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3177       Escape->setName("ind.escape");
3178       MissingVals[UI] = Escape;
3179     }
3180   }
3181 
3182   for (auto &I : MissingVals) {
3183     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3185     // that is %IV2 = phi [...], [ %IV1, %latch ]
3186     // In this case, if IV1 has an external use, we need to avoid adding both
3187     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3188     // don't already have an incoming value for the middle block.
3189     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3190       PHI->addIncoming(I.second, MiddleBlock);
3191   }
3192 }
3193 
3194 namespace {
3195 
3196 struct CSEDenseMapInfo {
3197   static bool canHandle(const Instruction *I) {
3198     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3199            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3200   }
3201 
3202   static inline Instruction *getEmptyKey() {
3203     return DenseMapInfo<Instruction *>::getEmptyKey();
3204   }
3205 
3206   static inline Instruction *getTombstoneKey() {
3207     return DenseMapInfo<Instruction *>::getTombstoneKey();
3208   }
3209 
3210   static unsigned getHashValue(const Instruction *I) {
3211     assert(canHandle(I) && "Unknown instruction!");
3212     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3213                                                            I->value_op_end()));
3214   }
3215 
3216   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3217     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3218         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3219       return LHS == RHS;
3220     return LHS->isIdenticalTo(RHS);
3221   }
3222 };
3223 
3224 } // end anonymous namespace
3225 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3229   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3230   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3231     Instruction *In = &*I++;
3232 
3233     if (!CSEDenseMapInfo::canHandle(In))
3234       continue;
3235 
3236     // Check if we can replace this instruction with any of the
3237     // visited instructions.
3238     if (Instruction *V = CSEMap.lookup(In)) {
3239       In->replaceAllUsesWith(V);
3240       In->eraseFromParent();
3241       continue;
3242     }
3243 
3244     CSEMap[In] = In;
3245   }
3246 }
3247 
3248 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3249                                                        unsigned VF,
3250                                                        bool &NeedToScalarize) {
3251   Function *F = CI->getCalledFunction();
3252   Type *ScalarRetTy = CI->getType();
3253   SmallVector<Type *, 4> Tys, ScalarTys;
3254   for (auto &ArgOp : CI->arg_operands())
3255     ScalarTys.push_back(ArgOp->getType());
3256 
3257   // Estimate cost of scalarized vector call. The source operands are assumed
3258   // to be vectors, so we need to extract individual elements from there,
3259   // execute VF scalar calls, and then gather the result into the vector return
3260   // value.
3261   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3262   if (VF == 1)
3263     return ScalarCallCost;
3264 
3265   // Compute corresponding vector type for return value and arguments.
3266   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3267   for (Type *ScalarTy : ScalarTys)
3268     Tys.push_back(ToVectorTy(ScalarTy, VF));
3269 
3270   // Compute costs of unpacking argument values for the scalar calls and
3271   // packing the return values to a vector.
3272   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3273 
3274   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
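  // E.g., with VF = 4, ScalarCallCost = 10 and ScalarizationCost = 8, the
  // scalarization cost is 48; a vector library call is chosen below only if it
  // is cheaper than that.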
3275 
3276   // If we can't emit a vector call for this function, then the currently found
3277   // cost is the cost we need to return.
3278   NeedToScalarize = true;
3279   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3280   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3281 
3282   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3283     return Cost;
3284 
3285   // If the corresponding vector cost is cheaper, return its cost.
3286   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3287   if (VectorCallCost < Cost) {
3288     NeedToScalarize = false;
3289     return VectorCallCost;
3290   }
3291   return Cost;
3292 }
3293 
3294 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3295                                                             unsigned VF) {
3296   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3297   assert(ID && "Expected intrinsic call!");
3298 
3299   FastMathFlags FMF;
3300   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3301     FMF = FPMO->getFastMathFlags();
3302 
3303   SmallVector<Value *, 4> Operands(CI->arg_operands());
3304   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3305 }
3306 
3307 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3308   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3309   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3310   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3311 }
3312 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3313   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3314   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3315   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3316 }
3317 
3318 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3319   // For every instruction `I` in MinBWs, truncate the operands, create a
3320   // truncated version of `I` and reextend its result. InstCombine runs
3321   // later and will remove any ext/trunc pairs.
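  // E.g., if an i32 operation is known to need only 8 bits, its operands are
  // truncated to <VF x i8>, the operation is recreated on the narrow type, and
  // the result is extended back to <VF x i32>.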
3322   SmallPtrSet<Value *, 4> Erased;
3323   for (const auto &KV : Cost->getMinimalBitwidths()) {
3324     // If the value wasn't vectorized, we must maintain the original scalar
3325     // type. The absence of the value from VectorLoopValueMap indicates that it
3326     // wasn't vectorized.
3327     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3328       continue;
3329     for (unsigned Part = 0; Part < UF; ++Part) {
3330       Value *I = getOrCreateVectorValue(KV.first, Part);
3331       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3332           !isa<Instruction>(I))
3333         continue;
3334       Type *OriginalTy = I->getType();
3335       Type *ScalarTruncatedTy =
3336           IntegerType::get(OriginalTy->getContext(), KV.second);
3337       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3338                                           OriginalTy->getVectorNumElements());
3339       if (TruncatedTy == OriginalTy)
3340         continue;
3341 
3342       IRBuilder<> B(cast<Instruction>(I));
3343       auto ShrinkOperand = [&](Value *V) -> Value * {
3344         if (auto *ZI = dyn_cast<ZExtInst>(V))
3345           if (ZI->getSrcTy() == TruncatedTy)
3346             return ZI->getOperand(0);
3347         return B.CreateZExtOrTrunc(V, TruncatedTy);
3348       };
3349 
3350       // The actual instruction modification depends on the instruction type,
3351       // unfortunately.
3352       Value *NewI = nullptr;
3353       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3354         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3355                              ShrinkOperand(BO->getOperand(1)));
3356 
3357         // Any wrapping introduced by shrinking this operation shouldn't be
3358         // considered undefined behavior. So, we can't unconditionally copy
3359         // arithmetic wrapping flags to NewI.
3360         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3361       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3362         NewI =
3363             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3364                          ShrinkOperand(CI->getOperand(1)));
3365       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3366         NewI = B.CreateSelect(SI->getCondition(),
3367                               ShrinkOperand(SI->getTrueValue()),
3368                               ShrinkOperand(SI->getFalseValue()));
3369       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3370         switch (CI->getOpcode()) {
3371         default:
3372           llvm_unreachable("Unhandled cast!");
3373         case Instruction::Trunc:
3374           NewI = ShrinkOperand(CI->getOperand(0));
3375           break;
3376         case Instruction::SExt:
3377           NewI = B.CreateSExtOrTrunc(
3378               CI->getOperand(0),
3379               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3380           break;
3381         case Instruction::ZExt:
3382           NewI = B.CreateZExtOrTrunc(
3383               CI->getOperand(0),
3384               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3385           break;
3386         }
3387       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3388         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3389         auto *O0 = B.CreateZExtOrTrunc(
3390             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3391         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3392         auto *O1 = B.CreateZExtOrTrunc(
3393             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3394 
3395         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3396       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3397         // Don't do anything with the operands, just extend the result.
3398         continue;
3399       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3400         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3401         auto *O0 = B.CreateZExtOrTrunc(
3402             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3403         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3404         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3405       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3406         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3407         auto *O0 = B.CreateZExtOrTrunc(
3408             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3409         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3410       } else {
3411         // If we don't know what to do, be conservative and don't do anything.
3412         continue;
3413       }
3414 
3415       // Lastly, extend the result.
3416       NewI->takeName(cast<Instruction>(I));
3417       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3418       I->replaceAllUsesWith(Res);
3419       cast<Instruction>(I)->eraseFromParent();
3420       Erased.insert(I);
3421       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3422     }
3423   }
3424 
  // We'll have created a bunch of ZExts that are now unused. Clean them up.
3426   for (const auto &KV : Cost->getMinimalBitwidths()) {
3427     // If the value wasn't vectorized, we must maintain the original scalar
3428     // type. The absence of the value from VectorLoopValueMap indicates that it
3429     // wasn't vectorized.
3430     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3431       continue;
3432     for (unsigned Part = 0; Part < UF; ++Part) {
3433       Value *I = getOrCreateVectorValue(KV.first, Part);
3434       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3435       if (Inst && Inst->use_empty()) {
3436         Value *NewI = Inst->getOperand(0);
3437         Inst->eraseFromParent();
3438         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3439       }
3440     }
3441   }
3442 }
3443 
3444 void InnerLoopVectorizer::fixVectorizedLoop() {
3445   // Insert truncates and extends for any truncated instructions as hints to
3446   // InstCombine.
3447   if (VF > 1)
3448     truncateToMinimalBitwidths();
3449 
3450   // Fix widened non-induction PHIs by setting up the PHI operands.
3451   if (OrigPHIsToFix.size()) {
3452     assert(EnableVPlanNativePath &&
3453            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3454     fixNonInductionPHIs();
3455   }
3456 
3457   // At this point every instruction in the original loop is widened to a
3458   // vector form. Now we need to fix the recurrences in the loop. These PHI
3459   // nodes are currently empty because we did not want to introduce cycles.
3460   // This is the second stage of vectorizing recurrences.
3461   fixCrossIterationPHIs();
3462 
3463   // Forget the original basic block.
3464   PSE.getSE()->forgetLoop(OrigLoop);
3465 
3466   // Fix-up external users of the induction variables.
3467   for (auto &Entry : Legal->getInductionVars())
3468     fixupIVUsers(Entry.first, Entry.second,
3469                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3470                  IVEndValues[Entry.first], LoopMiddleBlock);
3471 
3472   fixLCSSAPHIs();
3473   for (Instruction *PI : PredicatedInstructions)
3474     sinkScalarOperands(&*PI);
3475 
3476   // Remove redundant induction instructions.
3477   cse(LoopVectorBody);
3478 
3479   // Set/update profile weights for the vector and remainder loops as original
3480   // loop iterations are now distributed among them. Note that original loop
3481   // represented by LoopScalarBody becomes remainder loop after vectorization.
3482   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3488   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3489                                LI->getLoopFor(LoopVectorBody),
3490                                LI->getLoopFor(LoopScalarBody), VF * UF);
3491 }
3492 
3493 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3494   // In order to support recurrences we need to be able to vectorize Phi nodes.
3495   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3496   // stage #2: We now need to fix the recurrences by adding incoming edges to
3497   // the currently empty PHI nodes. At this point every instruction in the
3498   // original loop is widened to a vector form so we can use them to construct
3499   // the incoming edges.
3500   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3501     // Handle first-order recurrences and reductions that need to be fixed.
3502     if (Legal->isFirstOrderRecurrence(&Phi))
3503       fixFirstOrderRecurrence(&Phi);
3504     else if (Legal->isReductionVariable(&Phi))
3505       fixReduction(&Phi);
3506   }
3507 }
3508 
3509 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3510   // This is the second phase of vectorizing first-order recurrences. An
3511   // overview of the transformation is described below. Suppose we have the
3512   // following loop.
3513   //
3514   //   for (int i = 0; i < n; ++i)
3515   //     b[i] = a[i] - a[i - 1];
3516   //
3517   // There is a first-order recurrence on "a". For this loop, the shorthand
3518   // scalar IR looks like:
3519   //
3520   //   scalar.ph:
3521   //     s_init = a[-1]
3522   //     br scalar.body
3523   //
3524   //   scalar.body:
3525   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3526   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3527   //     s2 = a[i]
3528   //     b[i] = s2 - s1
3529   //     br cond, scalar.body, ...
3530   //
  // In this example, s1 is a recurrence because its value depends on the
3532   // previous iteration. In the first phase of vectorization, we created a
3533   // temporary value for s1. We now complete the vectorization and produce the
3534   // shorthand vector IR shown below (for VF = 4, UF = 1).
3535   //
3536   //   vector.ph:
3537   //     v_init = vector(..., ..., ..., a[-1])
3538   //     br vector.body
3539   //
3540   //   vector.body
3541   //     i = phi [0, vector.ph], [i+4, vector.body]
3542   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3543   //     v2 = a[i, i+1, i+2, i+3];
3544   //     v3 = vector(v1(3), v2(0, 1, 2))
3545   //     b[i, i+1, i+2, i+3] = v2 - v3
3546   //     br cond, vector.body, middle.block
3547   //
3548   //   middle.block:
3549   //     x = v2(3)
3550   //     br scalar.ph
3551   //
3552   //   scalar.ph:
3553   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3554   //     br scalar.body
3555   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3558 
3559   // Get the original loop preheader and single loop latch.
3560   auto *Preheader = OrigLoop->getLoopPreheader();
3561   auto *Latch = OrigLoop->getLoopLatch();
3562 
3563   // Get the initial and previous values of the scalar recurrence.
3564   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3565   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3566 
3567   // Create a vector from the initial value.
3568   auto *VectorInit = ScalarInit;
3569   if (VF > 1) {
3570     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3571     VectorInit = Builder.CreateInsertElement(
3572         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3573         Builder.getInt32(VF - 1), "vector.recur.init");
3574   }
3575 
3576   // We constructed a temporary phi node in the first phase of vectorization.
3577   // This phi node will eventually be deleted.
3578   Builder.SetInsertPoint(
3579       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3580 
3581   // Create a phi node for the new recurrence. The current value will either be
3582   // the initial value inserted into a vector or loop-varying vector value.
3583   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3584   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3585 
3586   // Get the vectorized previous value of the last part UF - 1. It appears last
3587   // among all unrolled iterations, due to the order of their construction.
3588   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3589 
3590   // Find and set the insertion point after the previous value if it is an
3591   // instruction.
3592   BasicBlock::iterator InsertPt;
3593   // Note that the previous value may have been constant-folded so it is not
3594   // guaranteed to be an instruction in the vector loop.
3595   // FIXME: Loop invariant values do not form recurrences. We should deal with
3596   //        them earlier.
3597   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3598     InsertPt = LoopVectorBody->getFirstInsertionPt();
3599   else {
3600     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3601     if (isa<PHINode>(PreviousLastPart))
3602       // If the previous value is a phi node, we should insert after all the phi
3603       // nodes in the block containing the PHI to avoid breaking basic block
3604       // verification. Note that the basic block may be different to
3605       // LoopVectorBody, in case we predicate the loop.
3606       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3607     else
3608       InsertPt = ++PreviousInst->getIterator();
3609   }
3610   Builder.SetInsertPoint(&*InsertPt);
3611 
3612   // We will construct a vector for the recurrence by combining the values for
3613   // the current and previous iterations. This is the required shuffle mask.
3614   SmallVector<Constant *, 8> ShuffleMask(VF);
3615   ShuffleMask[0] = Builder.getInt32(VF - 1);
3616   for (unsigned I = 1; I < VF; ++I)
3617     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
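  // E.g., for VF = 4 the mask is <3, 4, 5, 6>: lane 0 takes the last element
  // of the first shuffle operand and lanes 1-3 take the first three elements
  // of the second, producing v3 = vector(v1(3), v2(0, 1, 2)) as in the example
  // above.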
3618 
3619   // The vector from which to take the initial value for the current iteration
3620   // (actual or unrolled). Initially, this is the vector phi node.
3621   Value *Incoming = VecPhi;
3622 
3623   // Shuffle the current and previous vector and update the vector parts.
3624   for (unsigned Part = 0; Part < UF; ++Part) {
3625     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3626     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3627     auto *Shuffle =
3628         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3629                                              ConstantVector::get(ShuffleMask))
3630                : Incoming;
3631     PhiPart->replaceAllUsesWith(Shuffle);
3632     cast<Instruction>(PhiPart)->eraseFromParent();
3633     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3634     Incoming = PreviousPart;
3635   }
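  // For example, with UF = 2, part 0 combines the recurrence phi with the
  // part-0 previous value, and part 1 combines the part-0 previous value with
  // the part-1 previous value, since Incoming is rotated on each iteration.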
3636 
3637   // Fix the latch value of the new recurrence in the vector loop.
3638   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3639 
3640   // Extract the last vector element in the middle block. This will be the
3641   // initial value for the recurrence when jumping to the scalar loop.
3642   auto *ExtractForScalar = Incoming;
3643   if (VF > 1) {
3644     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3645     ExtractForScalar = Builder.CreateExtractElement(
3646         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3647   }
3648   // Extract the second-to-last element in the middle block if the
3649   // Phi is used outside the loop. We need to extract the phi itself
3650   // and not the last element (the phi update in the current iteration). This
3651   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3652   // when the scalar loop is not run at all.
3653   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3654   if (VF > 1)
3655     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3656         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3657   // When the loop is unrolled without vectorizing, initialize
3658   // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3659   // `Incoming`. This is analogous to the vectorized case above: extracting the
3660   // second-to-last element when VF > 1.
3661   else if (UF > 1)
3662     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3663 
3664   // Fix the initial value of the original recurrence in the scalar loop.
3665   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3666   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3667   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3668     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3669     Start->addIncoming(Incoming, BB);
3670   }
3671 
3672   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3673   Phi->setName("scalar.recur");
3674 
3675   // Finally, fix users of the recurrence outside the loop. The users will need
3676   // either the last value of the scalar recurrence or the last value of the
3677   // vector recurrence we extracted in the middle block. Since the loop is in
3678   // LCSSA form, we just need to find all the phi nodes for the original scalar
3679   // recurrence in the exit block, and then add an edge for the middle block.
3680   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3681     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3682       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3683     }
3684   }
3685 }
3686 
3687 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3688   Constant *Zero = Builder.getInt32(0);
3689 
3690   // Get its reduction variable descriptor.
3691   assert(Legal->isReductionVariable(Phi) &&
3692          "Unable to find the reduction variable");
3693   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3694 
3695   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3696   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3697   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3698   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3699     RdxDesc.getMinMaxRecurrenceKind();
3700   setDebugLocFromInst(Builder, ReductionStartValue);
3701 
3702   // We need to generate a reduction vector from the incoming scalar.
3703   // To do so, we generate the 'identity' vector and override one of its
3704   // elements with the incoming scalar reduction value. This must be done
3705   // in the vector-loop preheader.
3706   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3707 
3708   // This is the vector-clone of the value that leaves the loop.
3709   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3710 
3711   // Find the reduction identity value: zero for addition, or, and xor;
3712   // one for multiplication; -1 (all ones) for and.
3713   Value *Identity;
3714   Value *VectorStart;
3715   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3716       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3717     // MinMax reductions have the start value as their identity.
3718     if (VF == 1) {
3719       VectorStart = Identity = ReductionStartValue;
3720     } else {
3721       VectorStart = Identity =
3722         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3723     }
3724   } else {
3725     // Handle other reduction kinds:
3726     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3727         RK, VecTy->getScalarType());
3728     if (VF == 1) {
3729       Identity = Iden;
3730       // This vector is the Identity vector where the first element is the
3731       // incoming scalar reduction.
3732       VectorStart = ReductionStartValue;
3733     } else {
3734       Identity = ConstantVector::getSplat(VF, Iden);
3735 
3736       // This vector is the Identity vector where the first element is the
3737       // incoming scalar reduction.
3738       VectorStart =
3739         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3740     }
3741   }
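  // For example, an integer add reduction with VF = 4 and start value S uses
  // Identity = <0, 0, 0, 0> and VectorStart = <S, 0, 0, 0>, so the identity
  // lanes do not perturb the reduced result.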
3742 
3743   // Wrap flags are in general invalid after vectorization, clear them.
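  // (For example, an 'add nsw' feeding an integer add reduction is
  // reassociated into per-part partial sums that may overflow, so its nsw
  // flag can no longer be assumed to hold.)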
3744   clearReductionWrapFlags(RdxDesc);
3745 
3746   // Fix the vector-loop phi.
3747 
3748   // Reductions do not have to start at zero. They can start with
3749   // any loop-invariant value.
3750   BasicBlock *Latch = OrigLoop->getLoopLatch();
3751   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3752 
3753   for (unsigned Part = 0; Part < UF; ++Part) {
3754     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3755     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3756     // Make sure to add the reduction start value only to the
3757     // first unroll part.
3758     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3759     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3760     cast<PHINode>(VecRdxPhi)
3761       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3762   }
3763 
3764   // Before each round, move the insertion point right between
3765   // the PHIs and the values we are going to write.
3766   // This allows us to write both PHINodes and the extractelement
3767   // instructions.
3768   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3769 
3770   setDebugLocFromInst(Builder, LoopExitInst);
3771 
3772   // If tail is folded by masking, the vector value to leave the loop should be
3773   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3774   // instead of the former.
3775   if (Cost->foldTailByMasking()) {
3776     for (unsigned Part = 0; Part < UF; ++Part) {
3777       Value *VecLoopExitInst =
3778           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3779       Value *Sel = nullptr;
3780       for (User *U : VecLoopExitInst->users()) {
3781         if (isa<SelectInst>(U)) {
3782           assert(!Sel && "Reduction exit feeding two selects");
3783           Sel = U;
3784         } else
3785           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3786       }
3787       assert(Sel && "Reduction exit feeds no select");
3788       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3789     }
3790   }
3791 
3792   // If the vector reduction can be performed in a smaller type, we truncate
3793   // then extend the loop exit value to enable InstCombine to evaluate the
3794   // entire expression in the smaller type.
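  // For example, an i32 reduction whose recurrence type has been narrowed to
  // i8 is truncated to a <VF x i8> vector here and extended back to i32 after
  // the final reduction below.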
3795   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3796     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3797     Builder.SetInsertPoint(
3798         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3799     VectorParts RdxParts(UF);
3800     for (unsigned Part = 0; Part < UF; ++Part) {
3801       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3802       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3803       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3804                                         : Builder.CreateZExt(Trunc, VecTy);
3805       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3806            UI != RdxParts[Part]->user_end();)
3807         if (*UI != Trunc) {
3808           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3809           RdxParts[Part] = Extnd;
3810         } else {
3811           ++UI;
3812         }
3813     }
3814     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3815     for (unsigned Part = 0; Part < UF; ++Part) {
3816       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3817       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3818     }
3819   }
3820 
3821   // Reduce all of the unrolled parts into a single vector.
3822   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3823   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3824 
3825   // The middle block terminator has already been assigned a DebugLoc here (the
3826   // OrigLoop's single latch terminator). We want the whole middle block to
3827   // appear to execute on this line because: (a) it is all compiler generated,
3828   // (b) these instructions are always executed after evaluating the latch
3829   // conditional branch, and (c) other passes may add new predecessors which
3830   // terminate on this line. This is the easiest way to ensure we don't
3831   // accidentally cause an extra step back into the loop while debugging.
3832   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3833   for (unsigned Part = 1; Part < UF; ++Part) {
3834     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3835     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3836       // Floating point operations had to be 'fast' to enable the reduction.
3837       ReducedPartRdx = addFastMathFlag(
3838           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3839                               ReducedPartRdx, "bin.rdx"),
3840           RdxDesc.getFastMathFlags());
3841     else
3842       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3843                                       RdxPart);
3844   }
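  // For example, with UF = 2 and an integer add reduction this emits a single
  // 'bin.rdx' add of the two part vectors; the horizontal reduction below then
  // collapses that vector into a scalar when VF > 1.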
3845 
3846   if (VF > 1) {
3847     bool NoNaN = Legal->hasFunNoNaNAttr();
3848     ReducedPartRdx =
3849         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3850     // If the reduction can be performed in a smaller type, we need to extend
3851     // the reduction to the wider type before we branch to the original loop.
3852     if (Phi->getType() != RdxDesc.getRecurrenceType())
3853       ReducedPartRdx =
3854         RdxDesc.isSigned()
3855         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3856         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3857   }
3858 
3859   // Create a phi node that merges control-flow from the backedge-taken check
3860   // block and the middle block.
3861   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3862                                         LoopScalarPreHeader->getTerminator());
3863   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3864     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3865   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3866 
3867   // Now, we need to fix the users of the reduction variable
3868   // inside and outside of the scalar remainder loop.
3869   // We know that the loop is in LCSSA form. We need to update the
3870   // PHI nodes in the exit blocks.
3871   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3872     // All PHINodes need to have a single entry edge, or two if
3873     // we already fixed them.
3874     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3875 
3876     // We found a reduction value exit-PHI. Update it with the
3877     // incoming bypass edge.
3878     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3879       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3880   } // end of the LCSSA phi scan.
3881 
3882   // Fix the scalar loop reduction variable with the incoming reduction sum
3883   // from the vector body and from the backedge value.
3884   int IncomingEdgeBlockIdx =
3885     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3886   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3887   // Pick the other block.
3888   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3889   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3890   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3891 }
3892 
3893 void InnerLoopVectorizer::clearReductionWrapFlags(
3894     RecurrenceDescriptor &RdxDesc) {
3895   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3896   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3897       RK != RecurrenceDescriptor::RK_IntegerMult)
3898     return;
3899 
3900   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3901   assert(LoopExitInstr && "null loop exit instruction");
3902   SmallVector<Instruction *, 8> Worklist;
3903   SmallPtrSet<Instruction *, 8> Visited;
3904   Worklist.push_back(LoopExitInstr);
3905   Visited.insert(LoopExitInstr);
3906 
3907   while (!Worklist.empty()) {
3908     Instruction *Cur = Worklist.pop_back_val();
3909     if (isa<OverflowingBinaryOperator>(Cur))
3910       for (unsigned Part = 0; Part < UF; ++Part) {
3911         Value *V = getOrCreateVectorValue(Cur, Part);
3912         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3913       }
3914 
3915     for (User *U : Cur->users()) {
3916       Instruction *UI = cast<Instruction>(U);
3917       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3918           Visited.insert(UI).second)
3919         Worklist.push_back(UI);
3920     }
3921   }
3922 }
3923 
3924 void InnerLoopVectorizer::fixLCSSAPHIs() {
3925   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3926     if (LCSSAPhi.getNumIncomingValues() == 1) {
3927       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3928       // Non-instruction incoming values have only one value, so use lane zero.
3929       unsigned LastLane = 0;
3930       if (isa<Instruction>(IncomingValue))
3931           LastLane = Cost->isUniformAfterVectorization(
3932                          cast<Instruction>(IncomingValue), VF)
3933                          ? 0
3934                          : VF - 1;
3935       // Can be a loop invariant incoming value or the last scalar value to be
3936       // extracted from the vectorized loop.
3937       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3938       Value *lastIncomingValue =
3939           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3940       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3941     }
3942   }
3943 }
3944 
3945 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3946   // The basic block and loop containing the predicated instruction.
3947   auto *PredBB = PredInst->getParent();
3948   auto *VectorLoop = LI->getLoopFor(PredBB);
3949 
3950   // Initialize a worklist with the operands of the predicated instruction.
3951   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3952 
3953   // Holds instructions that we need to analyze again. An instruction may be
3954   // reanalyzed if we don't yet know if we can sink it or not.
3955   SmallVector<Instruction *, 8> InstsToReanalyze;
3956 
3957   // Returns true if a given use occurs in the predicated block. Phi nodes use
3958   // their operands in their corresponding predecessor blocks.
3959   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3960     auto *I = cast<Instruction>(U.getUser());
3961     BasicBlock *BB = I->getParent();
3962     if (auto *Phi = dyn_cast<PHINode>(I))
3963       BB = Phi->getIncomingBlock(
3964           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3965     return BB == PredBB;
3966   };
3967 
3968   // Iteratively sink the scalarized operands of the predicated instruction
3969   // into the block we created for it. When an instruction is sunk, its
3970   // operands are then added to the worklist. The algorithm ends when a full
3971   // pass through the worklist sinks no instructions.
3972   bool Changed;
3973   do {
3974     // Add the instructions that need to be reanalyzed to the worklist, and
3975     // reset the changed indicator.
3976     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3977     InstsToReanalyze.clear();
3978     Changed = false;
3979 
3980     while (!Worklist.empty()) {
3981       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3982 
3983       // We can't sink an instruction if it is a phi node, is already in the
3984       // predicated block, is not in the loop, or may have side effects.
3985       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3986           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3987         continue;
3988 
3989       // It's legal to sink the instruction if all its uses occur in the
3990       // predicated block. Otherwise, there's nothing to do yet, and we may
3991       // need to reanalyze the instruction.
3992       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3993         InstsToReanalyze.push_back(I);
3994         continue;
3995       }
3996 
3997       // Move the instruction to the beginning of the predicated block, and add
3998       // its operands to the worklist.
3999       I->moveBefore(&*PredBB->getFirstInsertionPt());
4000       Worklist.insert(I->op_begin(), I->op_end());
4001 
4002       // The sinking may have enabled other instructions to be sunk, so we will
4003       // need to iterate.
4004       Changed = true;
4005     }
4006   } while (Changed);
4007 }
4008 
4009 void InnerLoopVectorizer::fixNonInductionPHIs() {
4010   for (PHINode *OrigPhi : OrigPHIsToFix) {
4011     PHINode *NewPhi =
4012         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4013     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4014 
4015     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4016         predecessors(OrigPhi->getParent()));
4017     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4018         predecessors(NewPhi->getParent()));
4019     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4020            "Scalar and Vector BB should have the same number of predecessors");
4021 
4022     // The insertion point in Builder may be invalidated by the time we get
4023     // here. Force the Builder insertion point to something valid so that we do
4024     // not run into issues during insertion point restore in
4025     // getOrCreateVectorValue calls below.
4026     Builder.SetInsertPoint(NewPhi);
4027 
4028     // The predecessor order is preserved and we can rely on mapping between
4029     // scalar and vector block predecessors.
4030     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4031       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4032 
4033       // When looking up the new scalar/vector values to fix up, use incoming
4034       // values from original phi.
4035       Value *ScIncV =
4036           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4037 
4038       // The scalar incoming value may need to be broadcast.
4039       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4040       NewPhi->addIncoming(NewIncV, NewPredBB);
4041     }
4042   }
4043 }
4044 
4045 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4046                                    unsigned VF, bool IsPtrLoopInvariant,
4047                                    SmallBitVector &IsIndexLoopInvariant) {
4048   // Construct a vector GEP by widening the operands of the scalar GEP as
4049   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4050   // results in a vector of pointers when at least one operand of the GEP
4051   // is vector-typed. Thus, to keep the representation compact, we only use
4052   // vector-typed operands for loop-varying values.
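  // For example, a GEP whose only loop-varying operand is its index keeps the
  // loop-invariant base pointer scalar and widens just the index, producing a
  // vector of VF pointers for each unroll part.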
4053 
4054   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4055     // If we are vectorizing, but the GEP has only loop-invariant operands,
4056     // the GEP we build (by only using vector-typed operands for
4057     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4058     // produce a vector of pointers, we need to either arbitrarily pick an
4059     // operand to broadcast, or broadcast a clone of the original GEP.
4060     // Here, we broadcast a clone of the original.
4061     //
4062     // TODO: If at some point we decide to scalarize instructions having
4063     //       loop-invariant operands, this special case will no longer be
4064     //       required. We would add the scalarization decision to
4065     //       collectLoopScalars() and teach getVectorValue() to broadcast
4066     //       the lane-zero scalar value.
4067     auto *Clone = Builder.Insert(GEP->clone());
4068     for (unsigned Part = 0; Part < UF; ++Part) {
4069       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4070       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4071       addMetadata(EntryPart, GEP);
4072     }
4073   } else {
4074     // If the GEP has at least one loop-varying operand, we are sure to
4075     // produce a vector of pointers. But if we are only unrolling, we want
4076     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4077     // produce with the code below will be scalar (if VF == 1) or vector
4078     // (otherwise). Note that for the unroll-only case, we still maintain
4079     // values in the vector mapping with initVector, as we do for other
4080     // instructions.
4081     for (unsigned Part = 0; Part < UF; ++Part) {
4082       // The pointer operand of the new GEP. If it's loop-invariant, we
4083       // won't broadcast it.
4084       auto *Ptr = IsPtrLoopInvariant
4085                       ? GEP->getPointerOperand()
4086                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4087 
4088       // Collect all the indices for the new GEP. If any index is
4089       // loop-invariant, we won't broadcast it.
4090       SmallVector<Value *, 4> Indices;
4091       for (auto Index : enumerate(GEP->indices())) {
4092         Value *User = Index.value().get();
4093         if (IsIndexLoopInvariant[Index.index()])
4094           Indices.push_back(User);
4095         else
4096           Indices.push_back(getOrCreateVectorValue(User, Part));
4097       }
4098 
4099       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4100       // but it should be a vector, otherwise.
4101       auto *NewGEP =
4102           GEP->isInBounds()
4103               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4104                                           Indices)
4105               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4106       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4107              "NewGEP is not a pointer vector");
4108       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4109       addMetadata(NewGEP, GEP);
4110     }
4111   }
4112 }
4113 
4114 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4115                                               unsigned VF) {
4116   PHINode *P = cast<PHINode>(PN);
4117   if (EnableVPlanNativePath) {
4118     // Currently we enter here in the VPlan-native path for non-induction
4119     // PHIs where all control flow is uniform. We simply widen these PHIs.
4120     // Create a vector phi with no operands - the vector phi operands will be
4121     // set at the end of vector code generation.
4122     Type *VecTy =
4123         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4124     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4125     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4126     OrigPHIsToFix.push_back(P);
4127 
4128     return;
4129   }
4130 
4131   assert(PN->getParent() == OrigLoop->getHeader() &&
4132          "Non-header phis should have been handled elsewhere");
4133 
4134   // In order to support recurrences we need to be able to vectorize Phi nodes.
4135   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4136   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4137   // this value when we vectorize all of the instructions that use the PHI.
4138   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4139     for (unsigned Part = 0; Part < UF; ++Part) {
4140       // This is phase one of vectorizing PHIs.
4141       Type *VecTy =
4142           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4143       Value *EntryPart = PHINode::Create(
4144           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4145       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4146     }
4147     return;
4148   }
4149 
4150   setDebugLocFromInst(Builder, P);
4151 
4152   // This PHINode must be an induction variable.
4153   // Make sure that we know about it.
4154   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4155 
4156   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4157   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4158 
4159   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4160   // which can be found from the original scalar operations.
4161   switch (II.getKind()) {
4162   case InductionDescriptor::IK_NoInduction:
4163     llvm_unreachable("Unknown induction");
4164   case InductionDescriptor::IK_IntInduction:
4165   case InductionDescriptor::IK_FpInduction:
4166     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4167   case InductionDescriptor::IK_PtrInduction: {
4168     // Handle the pointer induction variable case.
4169     assert(P->getType()->isPointerTy() && "Unexpected type.");
4170     // This is the normalized GEP that starts counting at zero.
4171     Value *PtrInd = Induction;
4172     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4173     // Determine the number of scalars we need to generate for each unroll
4174     // iteration. If the instruction is uniform, we only need to generate the
4175     // first lane. Otherwise, we generate all VF values.
4176     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4177     // These are the scalar results. Notice that we don't generate vector GEPs
4178     // because scalar GEPs result in better code.
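    // For example, with VF = 4 and UF = 2, part 0 uses indices PtrInd + {0..3}
    // and part 1 uses PtrInd + {4..7}; if the pointer is uniform after
    // vectorization, only lane 0 of each part is generated.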
4179     for (unsigned Part = 0; Part < UF; ++Part) {
4180       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4181         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4182         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4183         Value *SclrGep =
4184             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4185         SclrGep->setName("next.gep");
4186         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4187       }
4188     }
4189     return;
4190   }
4191   }
4192 }
4193 
4194 /// A helper function for checking whether an integer division-related
4195 /// instruction may divide by zero (in which case it must be predicated if
4196 /// executed conditionally in the scalar code).
4197 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4198 /// Non-zero divisors that are not compile-time constants will not be
4199 /// converted into multiplication, so we will still end up scalarizing
4200 /// the division, but can do so w/o predication.
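/// For example, 'udiv i32 %x, 7' can never divide by zero and needs no
/// predication, whereas 'udiv i32 %x, %n' might.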
4201 static bool mayDivideByZero(Instruction &I) {
4202   assert((I.getOpcode() == Instruction::UDiv ||
4203           I.getOpcode() == Instruction::SDiv ||
4204           I.getOpcode() == Instruction::URem ||
4205           I.getOpcode() == Instruction::SRem) &&
4206          "Unexpected instruction");
4207   Value *Divisor = I.getOperand(1);
4208   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4209   return !CInt || CInt->isZero();
4210 }
4211 
4212 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4213   switch (I.getOpcode()) {
4214   case Instruction::Br:
4215   case Instruction::PHI:
4216   case Instruction::GetElementPtr:
4217     llvm_unreachable("This instruction is handled by a different recipe.");
4218   case Instruction::UDiv:
4219   case Instruction::SDiv:
4220   case Instruction::SRem:
4221   case Instruction::URem:
4222   case Instruction::Add:
4223   case Instruction::FAdd:
4224   case Instruction::Sub:
4225   case Instruction::FSub:
4226   case Instruction::FNeg:
4227   case Instruction::Mul:
4228   case Instruction::FMul:
4229   case Instruction::FDiv:
4230   case Instruction::FRem:
4231   case Instruction::Shl:
4232   case Instruction::LShr:
4233   case Instruction::AShr:
4234   case Instruction::And:
4235   case Instruction::Or:
4236   case Instruction::Xor: {
4237     // Just widen unops and binops.
4238     setDebugLocFromInst(Builder, &I);
4239 
4240     for (unsigned Part = 0; Part < UF; ++Part) {
4241       SmallVector<Value *, 2> Ops;
4242       for (Value *Op : I.operands())
4243         Ops.push_back(getOrCreateVectorValue(Op, Part));
4244 
4245       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4246 
4247       if (auto *VecOp = dyn_cast<Instruction>(V))
4248         VecOp->copyIRFlags(&I);
4249 
4250       // Use this vector value for all users of the original instruction.
4251       VectorLoopValueMap.setVectorValue(&I, Part, V);
4252       addMetadata(V, &I);
4253     }
4254 
4255     break;
4256   }
4257   case Instruction::Select: {
4258     // Widen selects.
4259     // If the selector is loop invariant we can create a select
4260     // instruction with a scalar condition. Otherwise, use vector-select.
4261     auto *SE = PSE.getSE();
4262     bool InvariantCond =
4263         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4264     setDebugLocFromInst(Builder, &I);
4265 
4266     // The condition can be loop invariant but still defined inside the
4267     // loop. This means that we can't just use the original 'cond' value.
4268     // We have to take the 'vectorized' value and pick the first lane.
4269     // Instcombine will make this a no-op.
4270 
4271     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4272 
4273     for (unsigned Part = 0; Part < UF; ++Part) {
4274       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4275       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4276       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4277       Value *Sel =
4278           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4279       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4280       addMetadata(Sel, &I);
4281     }
4282 
4283     break;
4284   }
4285 
4286   case Instruction::ICmp:
4287   case Instruction::FCmp: {
4288     // Widen compares. Generate vector compares.
4289     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4290     auto *Cmp = cast<CmpInst>(&I);
4291     setDebugLocFromInst(Builder, Cmp);
4292     for (unsigned Part = 0; Part < UF; ++Part) {
4293       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4294       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4295       Value *C = nullptr;
4296       if (FCmp) {
4297         // Propagate fast math flags.
4298         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4299         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4300         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4301       } else {
4302         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4303       }
4304       VectorLoopValueMap.setVectorValue(&I, Part, C);
4305       addMetadata(C, &I);
4306     }
4307 
4308     break;
4309   }
4310 
4311   case Instruction::ZExt:
4312   case Instruction::SExt:
4313   case Instruction::FPToUI:
4314   case Instruction::FPToSI:
4315   case Instruction::FPExt:
4316   case Instruction::PtrToInt:
4317   case Instruction::IntToPtr:
4318   case Instruction::SIToFP:
4319   case Instruction::UIToFP:
4320   case Instruction::Trunc:
4321   case Instruction::FPTrunc:
4322   case Instruction::BitCast: {
4323     auto *CI = cast<CastInst>(&I);
4324     setDebugLocFromInst(Builder, CI);
4325 
4326     /// Vectorize casts.
4327     Type *DestTy =
4328         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4329 
4330     for (unsigned Part = 0; Part < UF; ++Part) {
4331       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4332       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4333       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4334       addMetadata(Cast, &I);
4335     }
4336     break;
4337   }
4338 
4339   case Instruction::Call: {
4340     // Ignore dbg intrinsics.
4341     if (isa<DbgInfoIntrinsic>(I))
4342       break;
4343     setDebugLocFromInst(Builder, &I);
4344 
4345     Module *M = I.getParent()->getParent()->getParent();
4346     auto *CI = cast<CallInst>(&I);
4347 
4348     SmallVector<Type *, 4> Tys;
4349     for (Value *ArgOperand : CI->arg_operands())
4350       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4351 
4352     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4353 
4354     // This flag indicates whether we use an intrinsic or a plain call for the
4355     // vectorized version of the instruction, i.e., whether the intrinsic call
4356     // is cheaper than the library call.
4357     bool NeedToScalarize = false;
4358     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4359     bool UseVectorIntrinsic =
4360         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4361     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4362            "Instruction should be scalarized elsewhere.");
4363 
4364     for (unsigned Part = 0; Part < UF; ++Part) {
4365       SmallVector<Value *, 4> Args;
4366       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4367         Value *Arg = CI->getArgOperand(i);
4368         // Some intrinsics have a scalar argument - don't replace it with a
4369         // vector.
4370         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4371           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4372         Args.push_back(Arg);
4373       }
4374 
4375       Function *VectorF;
4376       if (UseVectorIntrinsic) {
4377         // Use vector version of the intrinsic.
4378         Type *TysForDecl[] = {CI->getType()};
4379         if (VF > 1)
4380           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4381         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4382       } else {
4383         // Use vector version of the function call.
4384         const VFShape Shape =
4385             VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4386 #ifndef NDEBUG
4387         const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
4388         assert(std::find_if(Infos.begin(), Infos.end(),
4389                             [&Shape](const VFInfo &Info) {
4390                               return Info.Shape == Shape;
4391                             }) != Infos.end() &&
4392                "Vector function shape is missing from the database.");
4393 #endif
4394         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4395       }
4396       assert(VectorF && "Can't create vector function.");
4397 
4398       SmallVector<OperandBundleDef, 1> OpBundles;
4399       CI->getOperandBundlesAsDefs(OpBundles);
4400       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4401 
4402       if (isa<FPMathOperator>(V))
4403         V->copyFastMathFlags(CI);
4404 
4405       VectorLoopValueMap.setVectorValue(&I, Part, V);
4406       addMetadata(V, &I);
4407     }
4408 
4409     break;
4410   }
4411 
4412   default:
4413     // This instruction is not vectorized by simple widening.
4414     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4415     llvm_unreachable("Unhandled instruction!");
4416   } // end of switch.
4417 }
4418 
4419 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4420   // We should not collect Scalars more than once per VF. Right now, this
4421   // function is called from collectUniformsAndScalars(), which already does
4422   // this check. Collecting Scalars for VF=1 does not make any sense.
4423   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4424          "This function should not be visited twice for the same VF");
4425 
4426   SmallSetVector<Instruction *, 8> Worklist;
4427 
4428   // These sets are used to seed the analysis with pointers used by memory
4429   // accesses that will remain scalar.
4430   SmallSetVector<Instruction *, 8> ScalarPtrs;
4431   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4432 
4433   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4434   // The pointer operands of loads and stores will be scalar as long as the
4435   // memory access is not a gather or scatter operation. The value operand of a
4436   // store will remain scalar if the store is scalarized.
4437   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4438     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4439     assert(WideningDecision != CM_Unknown &&
4440            "Widening decision should be ready at this moment");
4441     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4442       if (Ptr == Store->getValueOperand())
4443         return WideningDecision == CM_Scalarize;
4444     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4445            "Ptr is neither a value nor a pointer operand");
4446     return WideningDecision != CM_GatherScatter;
4447   };
4448 
4449   // A helper that returns true if the given value is a bitcast or
4450   // getelementptr instruction contained in the loop.
4451   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4452     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4453             isa<GetElementPtrInst>(V)) &&
4454            !TheLoop->isLoopInvariant(V);
4455   };
4456 
4457   // A helper that evaluates a memory access's use of a pointer. If the use
4458   // will be a scalar use, and the pointer is only used by memory accesses, we
4459   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4460   // PossibleNonScalarPtrs.
4461   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4462     // We only care about bitcast and getelementptr instructions contained in
4463     // the loop.
4464     if (!isLoopVaryingBitCastOrGEP(Ptr))
4465       return;
4466 
4467     // If the pointer has already been identified as scalar (e.g., if it was
4468     // also identified as uniform), there's nothing to do.
4469     auto *I = cast<Instruction>(Ptr);
4470     if (Worklist.count(I))
4471       return;
4472 
4473     // If the use of the pointer will be a scalar use, and all users of the
4474     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4475     // place the pointer in PossibleNonScalarPtrs.
4476     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4477           return isa<LoadInst>(U) || isa<StoreInst>(U);
4478         }))
4479       ScalarPtrs.insert(I);
4480     else
4481       PossibleNonScalarPtrs.insert(I);
4482   };
4483 
4484   // We seed the scalars analysis with three classes of instructions: (1)
4485   // instructions marked uniform-after-vectorization, (2) bitcast and
4486   // getelementptr instructions used by memory accesses requiring a scalar use,
4487   // and (3) pointer induction variables and their update instructions (we
4488   // currently only scalarize these).
4489   //
4490   // (1) Add to the worklist all instructions that have been identified as
4491   // uniform-after-vectorization.
4492   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4493 
4494   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4495   // memory accesses requiring a scalar use. The pointer operands of loads and
4496   // stores will be scalar as long as the memory access is not a gather or
4497   // scatter operation. The value operand of a store will remain scalar if the
4498   // store is scalarized.
4499   for (auto *BB : TheLoop->blocks())
4500     for (auto &I : *BB) {
4501       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4502         evaluatePtrUse(Load, Load->getPointerOperand());
4503       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4504         evaluatePtrUse(Store, Store->getPointerOperand());
4505         evaluatePtrUse(Store, Store->getValueOperand());
4506       }
4507     }
4508   for (auto *I : ScalarPtrs)
4509     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4510       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4511       Worklist.insert(I);
4512     }
4513 
4514   // (3) Add to the worklist all pointer induction variables and their update
4515   // instructions.
4516   //
4517   // TODO: Once we are able to vectorize pointer induction variables we should
4518   //       no longer insert them into the worklist here.
4519   auto *Latch = TheLoop->getLoopLatch();
4520   for (auto &Induction : Legal->getInductionVars()) {
4521     auto *Ind = Induction.first;
4522     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4523     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4524       continue;
4525     Worklist.insert(Ind);
4526     Worklist.insert(IndUpdate);
4527     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4528     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4529                       << "\n");
4530   }
4531 
4532   // Insert the forced scalars.
4533   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4534   // induction variable when the PHI user is scalarized.
4535   auto ForcedScalar = ForcedScalars.find(VF);
4536   if (ForcedScalar != ForcedScalars.end())
4537     for (auto *I : ForcedScalar->second)
4538       Worklist.insert(I);
4539 
4540   // Expand the worklist by looking through any bitcasts and getelementptr
4541   // instructions we've already identified as scalar. This is similar to the
4542   // expansion step in collectLoopUniforms(); however, here we're only
4543   // expanding to include additional bitcasts and getelementptr instructions.
4544   unsigned Idx = 0;
4545   while (Idx != Worklist.size()) {
4546     Instruction *Dst = Worklist[Idx++];
4547     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4548       continue;
4549     auto *Src = cast<Instruction>(Dst->getOperand(0));
4550     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4551           auto *J = cast<Instruction>(U);
4552           return !TheLoop->contains(J) || Worklist.count(J) ||
4553                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4554                   isScalarUse(J, Src));
4555         })) {
4556       Worklist.insert(Src);
4557       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4558     }
4559   }
4560 
4561   // An induction variable will remain scalar if all users of the induction
4562   // variable and induction variable update remain scalar.
4563   for (auto &Induction : Legal->getInductionVars()) {
4564     auto *Ind = Induction.first;
4565     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4566 
4567     // We already considered pointer induction variables, so there's no reason
4568     // to look at their users again.
4569     //
4570     // TODO: Once we are able to vectorize pointer induction variables we
4571     //       should no longer skip over them here.
4572     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4573       continue;
4574 
4575     // Determine if all users of the induction variable are scalar after
4576     // vectorization.
4577     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4578       auto *I = cast<Instruction>(U);
4579       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4580     });
4581     if (!ScalarInd)
4582       continue;
4583 
4584     // Determine if all users of the induction variable update instruction are
4585     // scalar after vectorization.
4586     auto ScalarIndUpdate =
4587         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4588           auto *I = cast<Instruction>(U);
4589           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4590         });
4591     if (!ScalarIndUpdate)
4592       continue;
4593 
4594     // The induction variable and its update instruction will remain scalar.
4595     Worklist.insert(Ind);
4596     Worklist.insert(IndUpdate);
4597     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4598     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4599                       << "\n");
4600   }
4601 
4602   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4603 }
4604 
4605 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4606   if (!blockNeedsPredication(I->getParent()))
4607     return false;
4608   switch(I->getOpcode()) {
4609   default:
4610     break;
4611   case Instruction::Load:
4612   case Instruction::Store: {
4613     if (!Legal->isMaskRequired(I))
4614       return false;
4615     auto *Ptr = getLoadStorePointerOperand(I);
4616     auto *Ty = getMemInstValueType(I);
4617     // We have already decided how to vectorize this instruction, get that
4618     // result.
4619     if (VF > 1) {
4620       InstWidening WideningDecision = getWideningDecision(I, VF);
4621       assert(WideningDecision != CM_Unknown &&
4622              "Widening decision should be ready at this moment");
4623       return WideningDecision == CM_Scalarize;
4624     }
4625     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4626     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4627                                 isLegalMaskedGather(Ty, Alignment))
4628                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4629                                 isLegalMaskedScatter(Ty, Alignment));
4630   }
4631   case Instruction::UDiv:
4632   case Instruction::SDiv:
4633   case Instruction::SRem:
4634   case Instruction::URem:
4635     return mayDivideByZero(*I);
4636   }
4637   return false;
4638 }
4639 
4640 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4641                                                                unsigned VF) {
4642   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4643   assert(getWideningDecision(I, VF) == CM_Unknown &&
4644          "Decision should not be set yet.");
4645   auto *Group = getInterleavedAccessGroup(I);
4646   assert(Group && "Must have a group.");
4647 
4648   // If the instruction's allocated size doesn't equal its type size, it
4649   // requires padding and will be scalarized.
4650   auto &DL = I->getModule()->getDataLayout();
4651   auto *ScalarTy = getMemInstValueType(I);
4652   if (hasIrregularType(ScalarTy, DL, VF))
4653     return false;
4654 
4655   // Check if masking is required.
4656   // A Group may need masking for one of two reasons: it resides in a block that
4657   // needs predication, or it was decided to use masking to deal with gaps.
4658   bool PredicatedAccessRequiresMasking =
4659       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4660   bool AccessWithGapsRequiresMasking =
4661       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4662   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4663     return true;
4664 
4665   // If masked interleaving is required, we expect that the user/target had
4666   // enabled it, because otherwise it either wouldn't have been created or
4667   // it should have been invalidated by the CostModel.
4668   assert(useMaskedInterleavedAccesses(TTI) &&
4669          "Masked interleave-groups for predicated accesses are not enabled.");
4670 
4671   auto *Ty = getMemInstValueType(I);
4672   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4673   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4674                           : TTI.isLegalMaskedStore(Ty, Alignment);
4675 }
4676 
4677 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4678                                                                unsigned VF) {
4679   // Get and ensure we have a valid memory instruction.
4680   LoadInst *LI = dyn_cast<LoadInst>(I);
4681   StoreInst *SI = dyn_cast<StoreInst>(I);
4682   assert((LI || SI) && "Invalid memory instruction");
4683 
4684   auto *Ptr = getLoadStorePointerOperand(I);
4685 
4686   // In order to be widened, the pointer should be consecutive, first of all.
4687   if (!Legal->isConsecutivePtr(Ptr))
4688     return false;
4689 
4690   // If the instruction is a store located in a predicated block, it will be
4691   // scalarized.
4692   if (isScalarWithPredication(I))
4693     return false;
4694 
4695   // If the instruction's allocated size doesn't equal its type size, it
4696   // requires padding and will be scalarized.
4697   auto &DL = I->getModule()->getDataLayout();
4698   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4699   if (hasIrregularType(ScalarTy, DL, VF))
4700     return false;
4701 
4702   return true;
4703 }
4704 
4705 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4706   // We should not collect Uniforms more than once per VF. Right now,
4707   // this function is called from collectUniformsAndScalars(), which
4708   // already does this check. Collecting Uniforms for VF=1 does not make any
4709   // sense.
4710 
4711   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4712          "This function should not be visited twice for the same VF");
4713 
4714   // Visit the list of Uniforms. If we find no uniform value, we won't
4715   // analyze it again; Uniforms.count(VF) will still return 1.
4716   Uniforms[VF].clear();
4717 
4718   // We now know that the loop is vectorizable!
4719   // Collect instructions inside the loop that will remain uniform after
4720   // vectorization.
4721 
4722   // Global values, params, and instructions outside of the current loop are
4723   // out of scope.
4724   auto isOutOfScope = [&](Value *V) -> bool {
4725     Instruction *I = dyn_cast<Instruction>(V);
4726     return (!I || !TheLoop->contains(I));
4727   };
4728 
4729   SetVector<Instruction *> Worklist;
4730   BasicBlock *Latch = TheLoop->getLoopLatch();
4731 
4732   // Instructions that are scalar with predication must not be considered
4733   // uniform after vectorization, because that would create an erroneous
4734   // replicating region where only a single instance out of VF should be formed.
4735   // TODO: optimize such seldom cases if found important, see PR40816.
4736   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4737     if (isScalarWithPredication(I, VF)) {
4738       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4739                         << *I << "\n");
4740       return;
4741     }
4742     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4743     Worklist.insert(I);
4744   };
4745 
4746   // Start with the conditional branch. If the branch condition is an
4747   // instruction contained in the loop that is only used by the branch, it is
4748   // uniform.
4749   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4750   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4751     addToWorklistIfAllowed(Cmp);
4752 
4753   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4754   // are pointers that are treated like consecutive pointers during
4755   // vectorization. The pointer operands of interleaved accesses are an
4756   // example.
4757   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4758 
4759   // Holds pointer operands of instructions that are possibly non-uniform.
4760   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4761 
4762   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4763     InstWidening WideningDecision = getWideningDecision(I, VF);
4764     assert(WideningDecision != CM_Unknown &&
4765            "Widening decision should be ready at this moment");
4766 
4767     return (WideningDecision == CM_Widen ||
4768             WideningDecision == CM_Widen_Reverse ||
4769             WideningDecision == CM_Interleave);
4770   };
4771   // Iterate over the instructions in the loop, and collect all
4772   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4773   // that a consecutive-like pointer operand will be scalarized, we collect it
4774   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4775   // getelementptr instruction can be used by both vectorized and scalarized
4776   // memory instructions. For example, if a loop loads and stores from the same
4777   // location, but the store is conditional, the store will be scalarized, and
4778   // the getelementptr won't remain uniform.
4779   for (auto *BB : TheLoop->blocks())
4780     for (auto &I : *BB) {
4781       // If there's no pointer operand, there's nothing to do.
4782       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4783       if (!Ptr)
4784         continue;
4785 
4786       // True if all users of Ptr are memory accesses that have Ptr as their
4787       // pointer operand.
4788       auto UsersAreMemAccesses =
4789           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4790             return getLoadStorePointerOperand(U) == Ptr;
4791           });
4792 
4793       // Ensure the memory instruction will not be scalarized or used by
4794       // gather/scatter, making its pointer operand non-uniform. If the pointer
4795       // operand is used by any instruction other than a memory access, we
4796       // conservatively assume the pointer operand may be non-uniform.
4797       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4798         PossibleNonUniformPtrs.insert(Ptr);
4799 
4800       // If the memory instruction will be vectorized and its pointer operand
4801       // is consecutive-like, or interleaving - the pointer operand should
4802       // remain uniform.
4803       else
4804         ConsecutiveLikePtrs.insert(Ptr);
4805     }
4806 
4807   // Add to the Worklist all consecutive and consecutive-like pointers that
4808   // aren't also identified as possibly non-uniform.
4809   for (auto *V : ConsecutiveLikePtrs)
4810     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4811       addToWorklistIfAllowed(V);
4812 
4813   // Expand Worklist in topological order: whenever a new instruction
4814   // is added, its users should already be inside Worklist. This ensures
4815   // that a uniform instruction will only be used by uniform instructions.
4816   unsigned idx = 0;
4817   while (idx != Worklist.size()) {
4818     Instruction *I = Worklist[idx++];
4819 
4820     for (auto OV : I->operand_values()) {
4821       // isOutOfScope operands cannot be uniform instructions.
4822       if (isOutOfScope(OV))
4823         continue;
4824       // First order recurrence Phi's should typically be considered
4825       // non-uniform.
4826       auto *OP = dyn_cast<PHINode>(OV);
4827       if (OP && Legal->isFirstOrderRecurrence(OP))
4828         continue;
4829       // If all the users of the operand are uniform, then add the
4830       // operand into the uniform worklist.
4831       auto *OI = cast<Instruction>(OV);
4832       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4833             auto *J = cast<Instruction>(U);
4834             return Worklist.count(J) ||
4835                    (OI == getLoadStorePointerOperand(J) &&
4836                     isUniformDecision(J, VF));
4837           }))
4838         addToWorklistIfAllowed(OI);
4839     }
4840   }
4841 
4842   // Returns true if Ptr is the pointer operand of a memory access instruction
4843   // I, and I is known to not require scalarization.
4844   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4845     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4846   };
4847 
4848   // For an instruction to be added into Worklist above, all its users inside
4849   // the loop should also be in Worklist. However, this condition cannot be
4850   // true for phi nodes that form a cyclic dependence. We must process phi
4851   // nodes separately. An induction variable will remain uniform if all users
4852   // of the induction variable and induction variable update remain uniform.
4853   // The code below handles both pointer and non-pointer induction variables.
4854   for (auto &Induction : Legal->getInductionVars()) {
4855     auto *Ind = Induction.first;
4856     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4857 
4858     // Determine if all users of the induction variable are uniform after
4859     // vectorization.
4860     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4861       auto *I = cast<Instruction>(U);
4862       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4863              isVectorizedMemAccessUse(I, Ind);
4864     });
4865     if (!UniformInd)
4866       continue;
4867 
4868     // Determine if all users of the induction variable update instruction are
4869     // uniform after vectorization.
4870     auto UniformIndUpdate =
4871         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4872           auto *I = cast<Instruction>(U);
4873           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4874                  isVectorizedMemAccessUse(I, IndUpdate);
4875         });
4876     if (!UniformIndUpdate)
4877       continue;
4878 
4879     // The induction variable and its update instruction will remain uniform.
4880     addToWorklistIfAllowed(Ind);
4881     addToWorklistIfAllowed(IndUpdate);
4882   }
4883 
4884   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4885 }
4886 
4887 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4888   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4889 
4890   if (Legal->getRuntimePointerChecking()->Need) {
4891     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4892         "runtime pointer checks needed. Enable vectorization of this "
4893         "loop with '#pragma clang loop vectorize(enable)' when "
4894         "compiling with -Os/-Oz",
4895         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4896     return true;
4897   }
4898 
4899   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4900     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4901         "runtime SCEV checks needed. Enable vectorization of this "
4902         "loop with '#pragma clang loop vectorize(enable)' when "
4903         "compiling with -Os/-Oz",
4904         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4905     return true;
4906   }
4907 
4908   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4909   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4910     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4911         "runtime stride == 1 checks needed. Enable vectorization of "
4912         "this loop with '#pragma clang loop vectorize(enable)' when "
4913         "compiling with -Os/-Oz",
4914         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4915     return true;
4916   }
4917 
4918   return false;
4919 }
4920 
4921 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4922   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
4925     reportVectorizationFailure(
4926         "Not inserting runtime ptr check for divergent target",
4927         "runtime pointer checks needed. Not enabled for divergent target",
4928         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4929     return None;
4930   }
4931 
4932   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4933   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4934   if (TC == 1) {
4935     reportVectorizationFailure("Single iteration (non) loop",
4936         "loop trip count is one, irrelevant for vectorization",
4937         "SingleIterationLoop", ORE, TheLoop);
4938     return None;
4939   }
4940 
4941   switch (ScalarEpilogueStatus) {
4942   case CM_ScalarEpilogueAllowed:
4943     return computeFeasibleMaxVF(TC);
4944   case CM_ScalarEpilogueNotNeededUsePredicate:
4945     LLVM_DEBUG(
4946         dbgs() << "LV: vector predicate hint/switch found.\n"
4947                << "LV: Not allowing scalar epilogue, creating predicated "
4948                << "vector loop.\n");
4949     break;
4950   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4951     // fallthrough as a special case of OptForSize
4952   case CM_ScalarEpilogueNotAllowedOptSize:
4953     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4954       LLVM_DEBUG(
4955           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4956     else
4957       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4958                         << "count.\n");
4959 
    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
4962     if (runtimeChecksRequired())
4963       return None;
4964     break;
4965   }
4966 
  // Now try tail folding.
4968 
4969   // Invalidate interleave groups that require an epilogue if we can't mask
4970   // the interleave-group.
4971   if (!useMaskedInterleavedAccesses(TTI))
4972     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4973 
4974   unsigned MaxVF = computeFeasibleMaxVF(TC);
4975   if (TC > 0 && TC % MaxVF == 0) {
4976     // Accept MaxVF if we do not have a tail.
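    // For example, a trip count of 64 is evenly divided by MaxVF = 8, whereas
    // a trip count of 60 would leave a 4-iteration tail.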
4977     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4978     return MaxVF;
4979   }
4980 
4981   // If we don't know the precise trip count, or if the trip count that we
4982   // found modulo the vectorization factor is not zero, try to fold the tail
4983   // by masking.
4984   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4985   if (Legal->prepareToFoldTailByMasking()) {
4986     FoldTailByMasking = true;
4987     return MaxVF;
4988   }
4989 
4990   if (TC == 0) {
4991     reportVectorizationFailure(
4992         "Unable to calculate the loop count due to complex control flow",
4993         "unable to calculate the loop count due to complex control flow",
4994         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4995     return None;
4996   }
4997 
4998   reportVectorizationFailure(
4999       "Cannot optimize for size and vectorize at the same time.",
5000       "cannot optimize for size and vectorize at the same time. "
5001       "Enable vectorization of this loop with '#pragma clang loop "
5002       "vectorize(enable)' when compiling with -Os/-Oz",
5003       "NoTailLoopWithOptForSize", ORE, TheLoop);
5004   return None;
5005 }
5006 
5007 unsigned
5008 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5009   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5010   unsigned SmallestType, WidestType;
5011   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5012   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5013 
  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
5018   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5019 
5020   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5021 
5022   unsigned MaxVectorSize = WidestRegister / WidestType;
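  // For example, a 256-bit widest register and a 32-bit widest type give
  // MaxVectorSize = 256 / 32 = 8 lanes.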
5023 
5024   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5025                     << " / " << WidestType << " bits.\n");
5026   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5027                     << WidestRegister << " bits.\n");
5028 
5029   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5030                                  " into one vector!");
5031   if (MaxVectorSize == 0) {
5032     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5033     MaxVectorSize = 1;
5034     return MaxVectorSize;
5035   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5036              isPowerOf2_32(ConstTripCount)) {
5037     // We need to clamp the VF to be the ConstTripCount. There is no point in
5038     // choosing a higher viable VF as done in the loop below.
5039     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5040                       << ConstTripCount << "\n");
5041     MaxVectorSize = ConstTripCount;
5042     return MaxVectorSize;
5043   }
5044 
5045   unsigned MaxVF = MaxVectorSize;
5046   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5047       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5048     // Collect all viable vectorization factors larger than the default MaxVF
5049     // (i.e. MaxVectorSize).
5050     SmallVector<unsigned, 8> VFs;
5051     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5052     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5053       VFs.push_back(VS);
5054 
5055     // For each VF calculate its register usage.
5056     auto RUs = calculateRegisterUsage(VFs);
5057 
5058     // Select the largest VF which doesn't require more registers than existing
5059     // ones.
5060     for (int i = RUs.size() - 1; i >= 0; --i) {
5061       bool Selected = true;
5062       for (auto& pair : RUs[i].MaxLocalUsers) {
5063         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5064         if (pair.second > TargetNumRegisters)
5065           Selected = false;
5066       }
5067       if (Selected) {
5068         MaxVF = VFs[i];
5069         break;
5070       }
5071     }
5072     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5073       if (MaxVF < MinVF) {
5074         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5075                           << ") with target's minimum: " << MinVF << '\n');
5076         MaxVF = MinVF;
5077       }
5078     }
5079   }
5080   return MaxVF;
5081 }
5082 
5083 VectorizationFactor
5084 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5085   float Cost = expectedCost(1).first;
5086   const float ScalarCost = Cost;
5087   unsigned Width = 1;
5088   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5089 
5090   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5091   if (ForceVectorization && MaxVF > 1) {
5092     // Ignore scalar width, because the user explicitly wants vectorization.
5093     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5094     // evaluation.
5095     Cost = std::numeric_limits<float>::max();
5096   }
5097 
5098   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
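    // For example, a vector body costing 8 at VF = 4 is compared against the
    // scalar loop as 8 / 4 = 2 per original iteration.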
5102     VectorizationCostTy C = expectedCost(i);
5103     float VectorCost = C.first / (float)i;
5104     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5105                       << " costs: " << (int)VectorCost << ".\n");
5106     if (!C.second && !ForceVectorization) {
5107       LLVM_DEBUG(
5108           dbgs() << "LV: Not considering vector loop of width " << i
5109                  << " because it will not generate any vector instructions.\n");
5110       continue;
5111     }
5112     if (VectorCost < Cost) {
5113       Cost = VectorCost;
5114       Width = i;
5115     }
5116   }
5117 
5118   if (!EnableCondStoresVectorization && NumPredStores) {
5119     reportVectorizationFailure("There are conditional stores.",
5120         "store that is conditionally executed prevents vectorization",
5121         "ConditionalStore", ORE, TheLoop);
5122     Width = 1;
5123     Cost = ScalarCost;
5124   }
5125 
5126   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5127              << "LV: Vectorization seems to be not beneficial, "
5128              << "but was forced by a user.\n");
5129   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5130   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5131   return Factor;
5132 }
5133 
5134 std::pair<unsigned, unsigned>
5135 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5136   unsigned MinWidth = -1U;
5137   unsigned MaxWidth = 8;
5138   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
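  // For example, a loop that loads i8 values and accumulates them into an
  // i32 reduction would produce {8, 32}.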
5139 
5140   // For each block.
5141   for (BasicBlock *BB : TheLoop->blocks()) {
5142     // For each instruction in the loop.
5143     for (Instruction &I : BB->instructionsWithoutDebug()) {
5144       Type *T = I.getType();
5145 
5146       // Skip ignored values.
5147       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5148         continue;
5149 
5150       // Only examine Loads, Stores and PHINodes.
5151       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5152         continue;
5153 
5154       // Examine PHI nodes that are reduction variables. Update the type to
5155       // account for the recurrence type.
5156       if (auto *PN = dyn_cast<PHINode>(&I)) {
5157         if (!Legal->isReductionVariable(PN))
5158           continue;
5159         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5160         T = RdxDesc.getRecurrenceType();
5161       }
5162 
5163       // Examine the stored values.
5164       if (auto *ST = dyn_cast<StoreInst>(&I))
5165         T = ST->getValueOperand()->getType();
5166 
5167       // Ignore loaded pointer types and stored pointer types that are not
5168       // vectorizable.
5169       //
5170       // FIXME: The check here attempts to predict whether a load or store will
5171       //        be vectorized. We only know this for certain after a VF has
5172       //        been selected. Here, we assume that if an access can be
5173       //        vectorized, it will be. We should also look at extending this
5174       //        optimization to non-pointer types.
5175       //
5176       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5177           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5178         continue;
5179 
5180       MinWidth = std::min(MinWidth,
5181                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5182       MaxWidth = std::max(MaxWidth,
5183                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5184     }
5185   }
5186 
5187   return {MinWidth, MaxWidth};
5188 }
5189 
5190 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5191                                                            unsigned LoopCost) {
5192   // -- The interleave heuristics --
5193   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5194   // There are many micro-architectural considerations that we can't predict
5195   // at this level. For example, frontend pressure (on decode or fetch) due to
5196   // code size, or the number and capabilities of the execution ports.
5197   //
5198   // We use the following heuristics to select the interleave count:
5199   // 1. If the code has reductions, then we interleave to break the cross
5200   // iteration dependency.
5201   // 2. If the loop is really small, then we interleave to reduce the loop
5202   // overhead.
5203   // 3. We don't interleave if we think that we will spill registers to memory
5204   // due to the increased register pressure.
5205 
5206   if (!isScalarEpilogueAllowed())
5207     return 1;
5208 
  // The maximum safe dependence distance was already used to constrain the
  // vectorization factor, so be conservative and do not interleave.
5210   if (Legal->getMaxSafeDepDistBytes() != -1U)
5211     return 1;
5212 
5213   // Do not interleave loops with a relatively small known or estimated trip
5214   // count.
5215   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5216   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5217     return 1;
5218 
5219   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values later, so make sure each is at least one to
  // avoid dividing by zero.
5222   for (auto& pair : R.MaxLocalUsers) {
5223     pair.second = std::max(pair.second, 1U);
5224   }
5225 
5226   // We calculate the interleave count using the following formula.
5227   // Subtract the number of loop invariants from the number of available
5228   // registers. These registers are used by all of the interleaved instances.
5229   // Next, divide the remaining registers by the number of registers that is
5230   // required by the loop, in order to estimate how many parallel instances
5231   // fit without causing spills. All of this is rounded down if necessary to be
5232   // a power of two. We want power of two interleave count to simplify any
5233   // addressing operations or alignment considerations.
5234   // We also want power of two interleave counts to ensure that the induction
5235   // variable of the vector loop wraps to zero, when tail is folded by masking;
5236   // this currently happens when OptForSize, in which case IC is set to 1 above.
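  // Illustrative example: with 16 available registers, 2 of them holding
  // loop-invariant values, and a maximum local usage of 6 registers, the
  // estimate is PowerOf2Floor((16 - 2) / 6) = 2 interleaved instances.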
5237   unsigned IC = UINT_MAX;
5238 
5239   for (auto& pair : R.MaxLocalUsers) {
5240     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5241     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5242                       << " registers of "
5243                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5244     if (VF == 1) {
5245       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5246         TargetNumRegisters = ForceTargetNumScalarRegs;
5247     } else {
5248       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5249         TargetNumRegisters = ForceTargetNumVectorRegs;
5250     }
5251     unsigned MaxLocalUsers = pair.second;
5252     unsigned LoopInvariantRegs = 0;
5253     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5254       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5255 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5257     // Don't count the induction variable as interleaved.
5258     if (EnableIndVarRegisterHeur) {
5259       TmpIC =
5260           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5261                         std::max(1U, (MaxLocalUsers - 1)));
5262     }
5263 
5264     IC = std::min(IC, TmpIC);
5265   }
5266 
5267   // Clamp the interleave ranges to reasonable counts.
5268   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5269 
5270   // Check if the user has overridden the max.
5271   if (VF == 1) {
5272     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5273       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5274   } else {
5275     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5276       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5277   }
5278 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count so it does not exceed the trip count divided by VF.
5281   if (BestKnownTC) {
5282     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5283   }
5284 
5285   // If we did not calculate the cost for VF (because the user selected the VF)
5286   // then we calculate the cost of VF here.
5287   if (LoopCost == 0)
5288     LoopCost = expectedCost(VF).first;
5289 
5290   assert(LoopCost && "Non-zero loop cost expected");
5291 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5294   if (IC > MaxInterleaveCount)
5295     IC = MaxInterleaveCount;
5296   else if (IC < 1)
5297     IC = 1;
5298 
5299   // Interleave if we vectorized this loop and there is a reduction that could
5300   // benefit from interleaving.
5301   if (VF > 1 && !Legal->getReductionVars().empty()) {
5302     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5303     return IC;
5304   }
5305 
5306   // Note that if we've already vectorized the loop we will have done the
5307   // runtime check and so interleaving won't require further checks.
5308   bool InterleavingRequiresRuntimePointerCheck =
5309       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5310 
5311   // We want to interleave small loops in order to reduce the loop overhead and
5312   // potentially expose ILP opportunities.
5313   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5314   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop-overhead cost is 1, use the cost model to
    // estimate the cost of the loop body, and interleave until the cost of
    // the loop overhead is about 5% of the cost of the loop.
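    // For illustration, assuming SmallLoopCost is 20 and LoopCost is 3, the
    // interleave count is limited to PowerOf2Floor(20 / 3) = 4.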
5318     unsigned SmallIC =
5319         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5320 
5321     // Interleave until store/load ports (estimated by max interleave count) are
5322     // saturated.
5323     unsigned NumStores = Legal->getNumStores();
5324     unsigned NumLoads = Legal->getNumLoads();
5325     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5326     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
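    // For example, with IC = 8, two stores and one load, StoresIC is
    // 8 / 2 = 4 and LoadsIC is 8 / 1 = 8.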
5327 
5328     // If we have a scalar reduction (vector reductions are already dealt with
5329     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2, so
    // the critical path only gets increased by one reduction operation.
5332     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5333       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5334       SmallIC = std::min(SmallIC, F);
5335       StoresIC = std::min(StoresIC, F);
5336       LoadsIC = std::min(LoadsIC, F);
5337     }
5338 
5339     if (EnableLoadStoreRuntimeInterleave &&
5340         std::max(StoresIC, LoadsIC) > SmallIC) {
5341       LLVM_DEBUG(
5342           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5343       return std::max(StoresIC, LoadsIC);
5344     }
5345 
5346     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5347     return SmallIC;
5348   }
5349 
5350   // Interleave if this is a large loop (small loops are already dealt with by
5351   // this point) that could benefit from interleaving.
5352   bool HasReductions = !Legal->getReductionVars().empty();
5353   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5354     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5355     return IC;
5356   }
5357 
5358   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5359   return 1;
5360 }
5361 
5362 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5363 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5364   // This function calculates the register usage by measuring the highest number
5365   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5367   // assign a number to each instruction. We use RPO to ensure that defs are
5368   // met before their users. We assume that each instruction that has in-loop
5369   // users starts an interval. We record every time that an in-loop value is
5370   // used, so we have a list of the first and last occurrences of each
5371   // instruction. Next, we transpose this data structure into a multi map that
5372   // holds the list of intervals that *end* at a specific location. This multi
5373   // map allows us to perform a linear search. We scan the instructions linearly
5374   // and record each time that a new interval starts, by placing it in a set.
5375   // If we find this value in the multi-map then we remove it from the set.
5376   // The max register usage is the maximum size of the set.
5377   // We also search for instructions that are defined outside the loop, but are
5378   // used inside the loop. We need this number separately from the max-interval
5379   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
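  // For example, two loads whose results feed a single add are both still
  // open when the add is reached, so they count as two simultaneously live
  // intervals at that point.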
5381   LoopBlocksDFS DFS(TheLoop);
5382   DFS.perform(LI);
5383 
5384   RegisterUsage RU;
5385 
5386   // Each 'key' in the map opens a new interval. The values
5387   // of the map are the index of the 'last seen' usage of the
5388   // instruction that is the key.
5389   using IntervalMap = DenseMap<Instruction *, unsigned>;
5390 
5391   // Maps instruction to its index.
5392   SmallVector<Instruction *, 64> IdxToInstr;
5393   // Marks the end of each interval.
5394   IntervalMap EndPoint;
  // Saves the set of instructions that are used within the loop.
5396   SmallPtrSet<Instruction *, 8> Ends;
5397   // Saves the list of values that are used in the loop but are
5398   // defined outside the loop, such as arguments and constants.
5399   SmallPtrSet<Value *, 8> LoopInvariants;
5400 
5401   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5402     for (Instruction &I : BB->instructionsWithoutDebug()) {
5403       IdxToInstr.push_back(&I);
5404 
5405       // Save the end location of each USE.
5406       for (Value *U : I.operands()) {
5407         auto *Instr = dyn_cast<Instruction>(U);
5408 
5409         // Ignore non-instruction values such as arguments, constants, etc.
5410         if (!Instr)
5411           continue;
5412 
5413         // If this instruction is outside the loop then record it and continue.
5414         if (!TheLoop->contains(Instr)) {
5415           LoopInvariants.insert(Instr);
5416           continue;
5417         }
5418 
5419         // Overwrite previous end points.
5420         EndPoint[Instr] = IdxToInstr.size();
5421         Ends.insert(Instr);
5422       }
5423     }
5424   }
5425 
5426   // Saves the list of intervals that end with the index in 'key'.
5427   using InstrList = SmallVector<Instruction *, 2>;
5428   DenseMap<unsigned, InstrList> TransposeEnds;
5429 
5430   // Transpose the EndPoints to a list of values that end at each index.
5431   for (auto &Interval : EndPoint)
5432     TransposeEnds[Interval.second].push_back(Interval.first);
5433 
5434   SmallPtrSet<Instruction *, 8> OpenIntervals;
5435 
5436   // Get the size of the widest register.
5437   unsigned MaxSafeDepDist = -1U;
5438   if (Legal->getMaxSafeDepDistBytes() != -1U)
5439     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5440   unsigned WidestRegister =
5441       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5442   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5443 
5444   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5445   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5446 
5447   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5448 
5449   // A lambda that gets the register usage for the given type and VF.
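  // For example, a 32-bit element type at VF = 8 with a 128-bit widest
  // register needs max(1, 8 * 32 / 128) = 2 registers.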
5450   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5451     if (Ty->isTokenTy())
5452       return 0U;
5453     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5454     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5455   };
5456 
5457   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5458     Instruction *I = IdxToInstr[i];
5459 
5460     // Remove all of the instructions that end at this location.
5461     InstrList &List = TransposeEnds[i];
5462     for (Instruction *ToRemove : List)
5463       OpenIntervals.erase(ToRemove);
5464 
5465     // Ignore instructions that are never used within the loop.
5466     if (Ends.find(I) == Ends.end())
5467       continue;
5468 
5469     // Skip ignored values.
5470     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5471       continue;
5472 
5473     // For each VF find the maximum usage of registers.
5474     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5475       // Count the number of live intervals.
5476       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5477 
5478       if (VFs[j] == 1) {
5479         for (auto Inst : OpenIntervals) {
5480           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5481           if (RegUsage.find(ClassID) == RegUsage.end())
5482             RegUsage[ClassID] = 1;
5483           else
5484             RegUsage[ClassID] += 1;
5485         }
5486       } else {
5487         collectUniformsAndScalars(VFs[j]);
5488         for (auto Inst : OpenIntervals) {
5489           // Skip ignored values for VF > 1.
5490           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5491             continue;
5492           if (isScalarAfterVectorization(Inst, VFs[j])) {
5493             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5494             if (RegUsage.find(ClassID) == RegUsage.end())
5495               RegUsage[ClassID] = 1;
5496             else
5497               RegUsage[ClassID] += 1;
5498           } else {
5499             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5500             if (RegUsage.find(ClassID) == RegUsage.end())
5501               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5502             else
5503               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5504           }
5505         }
5506       }
5507 
5508       for (auto& pair : RegUsage) {
5509         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5510           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5511         else
5512           MaxUsages[j][pair.first] = pair.second;
5513       }
5514     }
5515 
5516     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5517                       << OpenIntervals.size() << '\n');
5518 
5519     // Add the current instruction to the list of open intervals.
5520     OpenIntervals.insert(I);
5521   }
5522 
5523   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5524     SmallMapVector<unsigned, unsigned, 4> Invariant;
5525 
5526     for (auto Inst : LoopInvariants) {
5527       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5528       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5529       if (Invariant.find(ClassID) == Invariant.end())
5530         Invariant[ClassID] = Usage;
5531       else
5532         Invariant[ClassID] += Usage;
5533     }
5534 
5535     LLVM_DEBUG({
5536       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5537       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5538              << " item\n";
5539       for (const auto &pair : MaxUsages[i]) {
5540         dbgs() << "LV(REG): RegisterClass: "
5541                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5542                << " registers\n";
5543       }
5544       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5545              << " item\n";
5546       for (const auto &pair : Invariant) {
5547         dbgs() << "LV(REG): RegisterClass: "
5548                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5549                << " registers\n";
5550       }
5551     });
5552 
5553     RU.LoopInvariantRegs = Invariant;
5554     RU.MaxLocalUsers = MaxUsages[i];
5555     RUs[i] = RU;
5556   }
5557 
5558   return RUs;
5559 }
5560 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5562   // TODO: Cost model for emulated masked load/store is completely
5563   // broken. This hack guides the cost model to use an artificially
5564   // high enough value to practically disable vectorization with such
5565   // operations, except where previously deployed legality hack allowed
5566   // using very low cost values. This is to avoid regressions coming simply
5567   // from moving "masked load/store" check from legality to cost model.
  // Emulation of masked loads/gathers was previously never allowed. Emulation
  // of masked stores/scatters was allowed only in a limited number of cases.
5570   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5571   return isa<LoadInst>(I) ||
5572          (isa<StoreInst>(I) &&
5573           NumPredStores > NumberOfStoresToPredicate);
5574 }
5575 
5576 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5577   // If we aren't vectorizing the loop, or if we've already collected the
5578   // instructions to scalarize, there's nothing to do. Collection may already
5579   // have occurred if we have a user-selected VF and are now computing the
5580   // expected cost for interleaving.
5581   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5582     return;
5583 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5585   // not profitable to scalarize any instructions, the presence of VF in the
5586   // map will indicate that we've analyzed it already.
5587   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5588 
5589   // Find all the instructions that are scalar with predication in the loop and
5590   // determine if it would be better to not if-convert the blocks they are in.
5591   // If so, we also record the instructions to scalarize.
5592   for (BasicBlock *BB : TheLoop->blocks()) {
5593     if (!blockNeedsPredication(BB))
5594       continue;
5595     for (Instruction &I : *BB)
5596       if (isScalarWithPredication(&I)) {
5597         ScalarCostsTy ScalarCosts;
5598         // Do not apply discount logic if hacked cost is needed
5599         // for emulated masked memrefs.
5600         if (!useEmulatedMaskMemRefHack(&I) &&
5601             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5602           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5603         // Remember that BB will remain after vectorization.
5604         PredicatedBBsAfterVectorization.insert(BB);
5605       }
5606   }
5607 }
5608 
5609 int LoopVectorizationCostModel::computePredInstDiscount(
5610     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5611     unsigned VF) {
5612   assert(!isUniformAfterVectorization(PredInst, VF) &&
5613          "Instruction marked uniform-after-vectorization will be predicated");
5614 
5615   // Initialize the discount to zero, meaning that the scalar version and the
5616   // vector version cost the same.
5617   int Discount = 0;
5618 
5619   // Holds instructions to analyze. The instructions we visit are mapped in
5620   // ScalarCosts. Those instructions are the ones that would be scalarized if
5621   // we find that the scalar version costs less.
5622   SmallVector<Instruction *, 8> Worklist;
5623 
5624   // Returns true if the given instruction can be scalarized.
5625   auto canBeScalarized = [&](Instruction *I) -> bool {
5626     // We only attempt to scalarize instructions forming a single-use chain
5627     // from the original predicated block that would otherwise be vectorized.
5628     // Although not strictly necessary, we give up on instructions we know will
5629     // already be scalar to avoid traversing chains that are unlikely to be
5630     // beneficial.
5631     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5632         isScalarAfterVectorization(I, VF))
5633       return false;
5634 
5635     // If the instruction is scalar with predication, it will be analyzed
5636     // separately. We ignore it within the context of PredInst.
5637     if (isScalarWithPredication(I))
5638       return false;
5639 
5640     // If any of the instruction's operands are uniform after vectorization,
5641     // the instruction cannot be scalarized. This prevents, for example, a
5642     // masked load from being scalarized.
5643     //
5644     // We assume we will only emit a value for lane zero of an instruction
5645     // marked uniform after vectorization, rather than VF identical values.
5646     // Thus, if we scalarize an instruction that uses a uniform, we would
5647     // create uses of values corresponding to the lanes we aren't emitting code
5648     // for. This behavior can be changed by allowing getScalarValue to clone
5649     // the lane zero values for uniforms rather than asserting.
5650     for (Use &U : I->operands())
5651       if (auto *J = dyn_cast<Instruction>(U.get()))
5652         if (isUniformAfterVectorization(J, VF))
5653           return false;
5654 
5655     // Otherwise, we can scalarize the instruction.
5656     return true;
5657   };
5658 
5659   // Compute the expected cost discount from scalarizing the entire expression
5660   // feeding the predicated instruction. We currently only consider expressions
5661   // that are single-use instruction chains.
5662   Worklist.push_back(PredInst);
5663   while (!Worklist.empty()) {
5664     Instruction *I = Worklist.pop_back_val();
5665 
5666     // If we've already analyzed the instruction, there's nothing to do.
5667     if (ScalarCosts.find(I) != ScalarCosts.end())
5668       continue;
5669 
5670     // Compute the cost of the vector instruction. Note that this cost already
5671     // includes the scalarization overhead of the predicated instruction.
5672     unsigned VectorCost = getInstructionCost(I, VF).first;
5673 
5674     // Compute the cost of the scalarized instruction. This cost is the cost of
5675     // the instruction as if it wasn't if-converted and instead remained in the
5676     // predicated block. We will scale this cost by block probability after
5677     // computing the scalarization overhead.
5678     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5679 
5680     // Compute the scalarization overhead of needed insertelement instructions
5681     // and phi nodes.
5682     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5683       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5684                                                  true, false);
5685       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5686     }
5687 
5688     // Compute the scalarization overhead of needed extractelement
5689     // instructions. For each of the instruction's operands, if the operand can
5690     // be scalarized, add it to the worklist; otherwise, account for the
5691     // overhead.
5692     for (Use &U : I->operands())
5693       if (auto *J = dyn_cast<Instruction>(U.get())) {
5694         assert(VectorType::isValidElementType(J->getType()) &&
5695                "Instruction has non-scalar type");
5696         if (canBeScalarized(J))
5697           Worklist.push_back(J);
5698         else if (needsExtract(J, VF))
5699           ScalarCost += TTI.getScalarizationOverhead(
5700                               ToVectorTy(J->getType(),VF), false, true);
5701       }
5702 
5703     // Scale the total scalar cost by block probability.
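    // For example, with a reciprocal block probability of 2 (i.e. the block
    // is assumed to execute half the time), a scalar cost of 8 becomes 4.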
5704     ScalarCost /= getReciprocalPredBlockProb();
5705 
5706     // Compute the discount. A non-negative discount means the vector version
5707     // of the instruction costs more, and scalarizing would be beneficial.
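    // For example, if the vector form costs 10 and the scaled scalar form
    // costs 7, the discount grows by 3 in favor of scalarization.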
5708     Discount += VectorCost - ScalarCost;
5709     ScalarCosts[I] = ScalarCost;
5710   }
5711 
5712   return Discount;
5713 }
5714 
5715 LoopVectorizationCostModel::VectorizationCostTy
5716 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5717   VectorizationCostTy Cost;
5718 
5719   // For each block.
5720   for (BasicBlock *BB : TheLoop->blocks()) {
5721     VectorizationCostTy BlockCost;
5722 
5723     // For each instruction in the old loop.
5724     for (Instruction &I : BB->instructionsWithoutDebug()) {
5725       // Skip ignored values.
5726       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5727           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5728         continue;
5729 
5730       VectorizationCostTy C = getInstructionCost(&I, VF);
5731 
5732       // Check if we should override the cost.
5733       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5734         C.first = ForceTargetInstructionCost;
5735 
5736       BlockCost.first += C.first;
5737       BlockCost.second |= C.second;
5738       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5739                         << " for VF " << VF << " For instruction: " << I
5740                         << '\n');
5741     }
5742 
5743     // If we are vectorizing a predicated block, it will have been
5744     // if-converted. This means that the block's instructions (aside from
5745     // stores and instructions that may divide by zero) will now be
5746     // unconditionally executed. For the scalar case, we may not always execute
5747     // the predicated block. Thus, scale the block's cost by the probability of
5748     // executing it.
5749     if (VF == 1 && blockNeedsPredication(BB))
5750       BlockCost.first /= getReciprocalPredBlockProb();
5751 
5752     Cost.first += BlockCost.first;
5753     Cost.second |= BlockCost.second;
5754   }
5755 
5756   return Cost;
5757 }
5758 
/// Gets the address access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
5761 ///
5762 /// This SCEV can be sent to the Target in order to estimate the address
5763 /// calculation cost.
5764 static const SCEV *getAddressAccessSCEV(
5765               Value *Ptr,
5766               LoopVectorizationLegality *Legal,
5767               PredicatedScalarEvolution &PSE,
5768               const Loop *TheLoop) {
5769 
5770   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5771   if (!Gep)
5772     return nullptr;
5773 
5774   // We are looking for a gep with all loop invariant indices except for one
5775   // which should be an induction variable.
5776   auto SE = PSE.getSE();
5777   unsigned NumOperands = Gep->getNumOperands();
5778   for (unsigned i = 1; i < NumOperands; ++i) {
5779     Value *Opd = Gep->getOperand(i);
5780     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5781         !Legal->isInductionVariable(Opd))
5782       return nullptr;
5783   }
5784 
  // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the
  // SCEV of Ptr.
5786   return PSE.getSCEV(Ptr);
5787 }
5788 
5789 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5790   return Legal->hasStride(I->getOperand(0)) ||
5791          Legal->hasStride(I->getOperand(1));
5792 }
5793 
5794 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5795                                                                  unsigned VF) {
5796   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5797   Type *ValTy = getMemInstValueType(I);
5798   auto SE = PSE.getSE();
5799 
5800   unsigned AS = getLoadStoreAddressSpace(I);
5801   Value *Ptr = getLoadStorePointerOperand(I);
5802   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5803 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5806   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5807 
5808   // Get the cost of the scalar memory instruction and address computation.
5809   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5810 
5811   // Don't pass *I here, since it is scalar but will actually be part of a
5812   // vectorized loop where the user of it is a vectorized instruction.
5813   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5814   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5815                                    Alignment, AS);
5816 
5817   // Get the overhead of the extractelement and insertelement instructions
5818   // we might create due to scalarization.
5819   Cost += getScalarizationOverhead(I, VF);
5820 
5821   // If we have a predicated store, it may not be executed for each vector
5822   // lane. Scale the cost by the probability of executing the predicated
5823   // block.
5824   if (isPredicatedInst(I)) {
5825     Cost /= getReciprocalPredBlockProb();
5826 
5827     if (useEmulatedMaskMemRefHack(I))
5828       // Artificially setting to a high enough value to practically disable
5829       // vectorization with such operations.
5830       Cost = 3000000;
5831   }
5832 
5833   return Cost;
5834 }
5835 
5836 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5837                                                              unsigned VF) {
5838   Type *ValTy = getMemInstValueType(I);
5839   Type *VectorTy = ToVectorTy(ValTy, VF);
5840   Value *Ptr = getLoadStorePointerOperand(I);
5841   unsigned AS = getLoadStoreAddressSpace(I);
5842   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5843 
5844   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5845          "Stride should be 1 or -1 for consecutive memory access");
5846   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5847   unsigned Cost = 0;
5848   if (Legal->isMaskRequired(I))
5849     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5850                                       Alignment ? Alignment->value() : 0, AS);
5851   else
5852     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5853 
5854   bool Reverse = ConsecutiveStride < 0;
5855   if (Reverse)
5856     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5857   return Cost;
5858 }
5859 
5860 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5861                                                          unsigned VF) {
5862   Type *ValTy = getMemInstValueType(I);
5863   Type *VectorTy = ToVectorTy(ValTy, VF);
5864   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5865   unsigned AS = getLoadStoreAddressSpace(I);
5866   if (isa<LoadInst>(I)) {
5867     return TTI.getAddressComputationCost(ValTy) +
5868            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5869            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5870   }
5871   StoreInst *SI = cast<StoreInst>(I);
5872 
5873   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5874   return TTI.getAddressComputationCost(ValTy) +
5875          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5876          (isLoopInvariantStoreValue
5877               ? 0
5878               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5879                                        VF - 1));
5880 }
5881 
5882 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5883                                                           unsigned VF) {
5884   Type *ValTy = getMemInstValueType(I);
5885   Type *VectorTy = ToVectorTy(ValTy, VF);
5886   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5887   Value *Ptr = getLoadStorePointerOperand(I);
5888 
5889   return TTI.getAddressComputationCost(VectorTy) +
5890          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5891                                     Legal->isMaskRequired(I),
5892                                     Alignment ? Alignment->value() : 0);
5893 }
5894 
5895 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5896                                                             unsigned VF) {
5897   Type *ValTy = getMemInstValueType(I);
5898   Type *VectorTy = ToVectorTy(ValTy, VF);
5899   unsigned AS = getLoadStoreAddressSpace(I);
5900 
5901   auto Group = getInterleavedAccessGroup(I);
5902   assert(Group && "Fail to get an interleaved access group.");
5903 
5904   unsigned InterleaveFactor = Group->getFactor();
5905   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5906 
5907   // Holds the indices of existing members in an interleaved load group.
5908   // An interleaved store group doesn't need this as it doesn't allow gaps.
5909   SmallVector<unsigned, 4> Indices;
5910   if (isa<LoadInst>(I)) {
5911     for (unsigned i = 0; i < InterleaveFactor; i++)
5912       if (Group->getMember(i))
5913         Indices.push_back(i);
5914   }
5915 
5916   // Calculate the cost of the whole interleaved group.
5917   bool UseMaskForGaps =
5918       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5919   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5920       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5921       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5922 
5923   if (Group->isReverse()) {
5924     // TODO: Add support for reversed masked interleaved access.
5925     assert(!Legal->isMaskRequired(I) &&
5926            "Reverse masked interleaved access not supported.");
5927     Cost += Group->getNumMembers() *
5928             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5929   }
5930   return Cost;
5931 }
5932 
5933 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5934                                                               unsigned VF) {
5935   // Calculate scalar cost only. Vectorization cost should be ready at this
5936   // moment.
5937   if (VF == 1) {
5938     Type *ValTy = getMemInstValueType(I);
5939     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5940     unsigned AS = getLoadStoreAddressSpace(I);
5941 
5942     return TTI.getAddressComputationCost(ValTy) +
5943            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5944   }
5945   return getWideningCost(I, VF);
5946 }
5947 
5948 LoopVectorizationCostModel::VectorizationCostTy
5949 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5950   // If we know that this instruction will remain uniform, check the cost of
5951   // the scalar version.
5952   if (isUniformAfterVectorization(I, VF))
5953     VF = 1;
5954 
5955   if (VF > 1 && isProfitableToScalarize(I, VF))
5956     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5957 
5958   // Forced scalars do not have any scalarization overhead.
5959   auto ForcedScalar = ForcedScalars.find(VF);
5960   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5961     auto InstSet = ForcedScalar->second;
5962     if (InstSet.find(I) != InstSet.end())
5963       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5964   }
5965 
5966   Type *VectorTy;
5967   unsigned C = getInstructionCost(I, VF, VectorTy);
5968 
5969   bool TypeNotScalarized =
5970       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5971   return VectorizationCostTy(C, TypeNotScalarized);
5972 }
5973 
5974 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5975                                                               unsigned VF) {
5976 
5977   if (VF == 1)
5978     return 0;
5979 
5980   unsigned Cost = 0;
5981   Type *RetTy = ToVectorTy(I->getType(), VF);
5982   if (!RetTy->isVoidTy() &&
5983       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5984     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5985 
5986   // Some targets keep addresses scalar.
5987   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5988     return Cost;
5989 
5990   // Some targets support efficient element stores.
5991   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5992     return Cost;
5993 
5994   // Collect operands to consider.
5995   CallInst *CI = dyn_cast<CallInst>(I);
5996   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5997 
5998   // Skip operands that do not require extraction/scalarization and do not incur
5999   // any overhead.
6000   return Cost + TTI.getOperandsScalarizationOverhead(
6001                     filterExtractingOperands(Ops, VF), VF);
6002 }
6003 
6004 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6005   if (VF == 1)
6006     return;
6007   NumPredStores = 0;
6008   for (BasicBlock *BB : TheLoop->blocks()) {
6009     // For each instruction in the old loop.
6010     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6012       if (!Ptr)
6013         continue;
6014 
6015       // TODO: We should generate better code and update the cost model for
6016       // predicated uniform stores. Today they are treated as any other
6017       // predicated store (see added test cases in
6018       // invariant-store-vectorization.ll).
6019       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6020         NumPredStores++;
6021 
6022       if (Legal->isUniform(Ptr) &&
6023           // Conditional loads and stores should be scalarized and predicated.
6024           // isScalarWithPredication cannot be used here since masked
6025           // gather/scatters are not considered scalar with predication.
6026           !Legal->blockNeedsPredication(I.getParent())) {
6027         // TODO: Avoid replicating loads and stores instead of
6028         // relying on instcombine to remove them.
6029         // Load: Scalar load + broadcast
6030         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6031         unsigned Cost = getUniformMemOpCost(&I, VF);
6032         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6033         continue;
6034       }
6035 
6036       // We assume that widening is the best solution when possible.
6037       if (memoryInstructionCanBeWidened(&I, VF)) {
6038         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6039         int ConsecutiveStride =
6040                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6041         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6042                "Expected consecutive stride.");
6043         InstWidening Decision =
6044             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6045         setWideningDecision(&I, VF, Decision, Cost);
6046         continue;
6047       }
6048 
6049       // Choose between Interleaving, Gather/Scatter or Scalarization.
6050       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6051       unsigned NumAccesses = 1;
6052       if (isAccessInterleaved(&I)) {
6053         auto Group = getInterleavedAccessGroup(&I);
6054         assert(Group && "Fail to get an interleaved access group.");
6055 
6056         // Make one decision for the whole group.
6057         if (getWideningDecision(&I, VF) != CM_Unknown)
6058           continue;
6059 
6060         NumAccesses = Group->getNumMembers();
6061         if (interleavedAccessCanBeWidened(&I, VF))
6062           InterleaveCost = getInterleaveGroupCost(&I, VF);
6063       }
6064 
6065       unsigned GatherScatterCost =
6066           isLegalGatherOrScatter(&I)
6067               ? getGatherScatterCost(&I, VF) * NumAccesses
6068               : std::numeric_limits<unsigned>::max();
6069 
6070       unsigned ScalarizationCost =
6071           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6072 
      // Choose the better solution for the current VF, record the decision,
      // and use it during vectorization.
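      // For example, InterleaveCost = 6, GatherScatterCost = 10, and
      // ScalarizationCost = 8 would select CM_Interleave with a cost of 6.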
6075       unsigned Cost;
6076       InstWidening Decision;
6077       if (InterleaveCost <= GatherScatterCost &&
6078           InterleaveCost < ScalarizationCost) {
6079         Decision = CM_Interleave;
6080         Cost = InterleaveCost;
6081       } else if (GatherScatterCost < ScalarizationCost) {
6082         Decision = CM_GatherScatter;
6083         Cost = GatherScatterCost;
6084       } else {
6085         Decision = CM_Scalarize;
6086         Cost = ScalarizationCost;
6087       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
6091       if (auto Group = getInterleavedAccessGroup(&I))
6092         setWideningDecision(Group, VF, Decision, Cost);
6093       else
6094         setWideningDecision(&I, VF, Decision, Cost);
6095     }
6096   }
6097 
  // Make sure that any load of an address and any other address computation
  // remain scalar unless there is gather/scatter support. This avoids
6100   // inevitable extracts into address registers, and also has the benefit of
6101   // activating LSR more, since that pass can't optimize vectorized
6102   // addresses.
6103   if (TTI.prefersVectorizedAddressing())
6104     return;
6105 
6106   // Start with all scalar pointer uses.
6107   SmallPtrSet<Instruction *, 8> AddrDefs;
6108   for (BasicBlock *BB : TheLoop->blocks())
6109     for (Instruction &I : *BB) {
6110       Instruction *PtrDef =
6111         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6112       if (PtrDef && TheLoop->contains(PtrDef) &&
6113           getWideningDecision(&I, VF) != CM_GatherScatter)
6114         AddrDefs.insert(PtrDef);
6115     }
6116 
6117   // Add all instructions used to generate the addresses.
6118   SmallVector<Instruction *, 4> Worklist;
6119   for (auto *I : AddrDefs)
6120     Worklist.push_back(I);
6121   while (!Worklist.empty()) {
6122     Instruction *I = Worklist.pop_back_val();
6123     for (auto &Op : I->operands())
6124       if (auto *InstOp = dyn_cast<Instruction>(Op))
6125         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6126             AddrDefs.insert(InstOp).second)
6127           Worklist.push_back(InstOp);
6128   }
6129 
6130   for (auto *I : AddrDefs) {
6131     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since that involves finding out whether the
      // loaded register is involved in an address computation, the decision
      // is instead changed here once we know this is the case.
6136       InstWidening Decision = getWideningDecision(I, VF);
6137       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6138         // Scalarize a widened load of address.
6139         setWideningDecision(I, VF, CM_Scalarize,
6140                             (VF * getMemoryInstructionCost(I, 1)));
6141       else if (auto Group = getInterleavedAccessGroup(I)) {
6142         // Scalarize an interleave group of address loads.
6143         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6144           if (Instruction *Member = Group->getMember(I))
6145             setWideningDecision(Member, VF, CM_Scalarize,
6146                                 (VF * getMemoryInstructionCost(Member, 1)));
6147         }
6148       }
6149     } else
6150       // Make sure I gets scalarized and a cost estimate without
6151       // scalarization overhead.
6152       ForcedScalars[VF].insert(I);
6153   }
6154 }
6155 
6156 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6157                                                         unsigned VF,
6158                                                         Type *&VectorTy) {
6159   Type *RetTy = I->getType();
6160   if (canTruncateToMinimalBitwidth(I, VF))
6161     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6162   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6163   auto SE = PSE.getSE();
6164 
6165   // TODO: We need to estimate the cost of intrinsic calls.
6166   switch (I->getOpcode()) {
6167   case Instruction::GetElementPtr:
6168     // We mark this instruction as zero-cost because the cost of GEPs in
6169     // vectorized code depends on whether the corresponding memory instruction
6170     // is scalarized or not. Therefore, we handle GEPs with the memory
6171     // instruction cost.
6172     return 0;
6173   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
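    // For example, with VF = 4 such a branch costs the extraction of four i1
    // lanes from the vector compare plus four scalar branches.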
6177     bool ScalarPredicatedBB = false;
6178     BranchInst *BI = cast<BranchInst>(I);
6179     if (VF > 1 && BI->isConditional() &&
6180         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6181              PredicatedBBsAfterVectorization.end() ||
6182          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6183              PredicatedBBsAfterVectorization.end()))
6184       ScalarPredicatedBB = true;
6185 
6186     if (ScalarPredicatedBB) {
6187       // Return cost for branches around scalarized and predicated blocks.
6188       Type *Vec_i1Ty =
6189           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6190       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6191               (TTI.getCFInstrCost(Instruction::Br) * VF));
6192     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6193       // The back-edge branch will remain, as will all scalar branches.
6194       return TTI.getCFInstrCost(Instruction::Br);
6195     else
6196       // This branch will be eliminated by if-conversion.
6197       return 0;
6198     // Note: We currently assume zero cost for an unconditional branch inside
6199     // a predicated block since it will become a fall-through, although we
6200     // may decide in the future to call TTI for all branches.
6201   }
6202   case Instruction::PHI: {
6203     auto *Phi = cast<PHINode>(I);
6204 
6205     // First-order recurrences are replaced by vector shuffles inside the loop.
6206     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6207     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6208       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6209                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6210 
6211     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6212     // converted into select instructions. We require N - 1 selects per phi
6213     // node, where N is the number of incoming values.
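    // For example, a phi with three incoming values is lowered to a chain of
    // two selects.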
6214     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6215       return (Phi->getNumIncomingValues() - 1) *
6216              TTI.getCmpSelInstrCost(
6217                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6218                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6219 
6220     return TTI.getCFInstrCost(Instruction::PHI);
6221   }
6222   case Instruction::UDiv:
6223   case Instruction::SDiv:
6224   case Instruction::URem:
6225   case Instruction::SRem:
6226     // If we have a predicated instruction, it may not be executed for each
6227     // vector lane. Get the scalarization cost and scale this amount by the
6228     // probability of executing the predicated block. If the instruction is not
6229     // predicated, we fall through to the next case.
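    // For example, with VF = 4 the cost below adds up 4 scalar phis, 4 scalar
    // divisions/remainders and the insert/extract overhead, and then divides
    // the sum by getReciprocalPredBlockProb(), which models an assumed block
    // execution probability of one half.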
6230     if (VF > 1 && isScalarWithPredication(I)) {
6231       unsigned Cost = 0;
6232 
6233       // These instructions have a non-void type, so account for the phi nodes
6234       // that we will create. This cost is likely to be zero. The phi node
6235       // cost, if any, should be scaled by the block probability because it
6236       // models a copy at the end of each predicated block.
6237       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6238 
6239       // The cost of the non-predicated instruction.
6240       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6241 
6242       // The cost of insertelement and extractelement instructions needed for
6243       // scalarization.
6244       Cost += getScalarizationOverhead(I, VF);
6245 
6246       // Scale the cost by the probability of executing the predicated blocks.
6247       // This assumes the predicated block for each vector lane is equally
6248       // likely.
6249       return Cost / getReciprocalPredBlockProb();
6250     }
6251     LLVM_FALLTHROUGH;
6252   case Instruction::Add:
6253   case Instruction::FAdd:
6254   case Instruction::Sub:
6255   case Instruction::FSub:
6256   case Instruction::Mul:
6257   case Instruction::FMul:
6258   case Instruction::FDiv:
6259   case Instruction::FRem:
6260   case Instruction::Shl:
6261   case Instruction::LShr:
6262   case Instruction::AShr:
6263   case Instruction::And:
6264   case Instruction::Or:
6265   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
6267     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6268       return 0;
6269     // Certain instructions can be cheaper to vectorize if they have a constant
6270     // second vector operand. One example of this are shifts on x86.
6271     Value *Op2 = I->getOperand(1);
6272     TargetTransformInfo::OperandValueProperties Op2VP;
6273     TargetTransformInfo::OperandValueKind Op2VK =
6274         TTI.getOperandInfo(Op2, Op2VP);
6275     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6276       Op2VK = TargetTransformInfo::OK_UniformValue;
6277 
6278     SmallVector<const Value *, 4> Operands(I->operand_values());
6279     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6280     return N * TTI.getArithmeticInstrCost(
6281                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6282                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6283   }
6284   case Instruction::FNeg: {
6285     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6286     return N * TTI.getArithmeticInstrCost(
6287                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6288                    TargetTransformInfo::OK_AnyValue,
6289                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6290                    I->getOperand(0), I);
6291   }
6292   case Instruction::Select: {
6293     SelectInst *SI = cast<SelectInst>(I);
6294     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6295     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6296     Type *CondTy = SI->getCondition()->getType();
6297     if (!ScalarCond)
6298       CondTy = VectorType::get(CondTy, VF);
6299 
6300     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6301   }
6302   case Instruction::ICmp:
6303   case Instruction::FCmp: {
6304     Type *ValTy = I->getOperand(0)->getType();
6305     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6306     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6307       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6308     VectorTy = ToVectorTy(ValTy, VF);
6309     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6310   }
6311   case Instruction::Store:
6312   case Instruction::Load: {
6313     unsigned Width = VF;
6314     if (Width > 1) {
6315       InstWidening Decision = getWideningDecision(I, Width);
6316       assert(Decision != CM_Unknown &&
6317              "CM decision should be taken at this point");
6318       if (Decision == CM_Scalarize)
6319         Width = 1;
6320     }
6321     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6322     return getMemoryInstructionCost(I, VF);
6323   }
6324   case Instruction::ZExt:
6325   case Instruction::SExt:
6326   case Instruction::FPToUI:
6327   case Instruction::FPToSI:
6328   case Instruction::FPExt:
6329   case Instruction::PtrToInt:
6330   case Instruction::IntToPtr:
6331   case Instruction::SIToFP:
6332   case Instruction::UIToFP:
6333   case Instruction::Trunc:
6334   case Instruction::FPTrunc:
6335   case Instruction::BitCast: {
6336     // We optimize the truncation of induction variables having constant
6337     // integer steps. The cost of these truncations is the same as the scalar
6338     // operation.
6339     if (isOptimizableIVTruncate(I, VF)) {
6340       auto *Trunc = cast<TruncInst>(I);
6341       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6342                                   Trunc->getSrcTy(), Trunc);
6343     }
6344 
6345     Type *SrcScalarTy = I->getOperand(0)->getType();
6346     Type *SrcVecTy =
6347         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6348     if (canTruncateToMinimalBitwidth(I, VF)) {
6349       // This cast is going to be shrunk. This may remove the cast or it might
6350       // turn it into slightly different cast. For example, if MinBW == 16,
6351       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6352       //
6353       // Calculate the modified src and dest types.
6354       Type *MinVecTy = VectorTy;
6355       if (I->getOpcode() == Instruction::Trunc) {
6356         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6357         VectorTy =
6358             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6359       } else if (I->getOpcode() == Instruction::ZExt ||
6360                  I->getOpcode() == Instruction::SExt) {
6361         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6362         VectorTy =
6363             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6364       }
6365     }
6366 
6367     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6368     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6369   }
6370   case Instruction::Call: {
6371     bool NeedToScalarize;
6372     CallInst *CI = cast<CallInst>(I);
6373     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6374     if (getVectorIntrinsicIDForCall(CI, TLI))
6375       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6376     return CallCost;
6377   }
6378   default:
6379     // The cost of executing VF copies of the scalar instruction. This opcode
6380     // is unknown. Assume that it is the same as 'mul'.
6381     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6382            getScalarizationOverhead(I, VF);
6383   } // end of switch.
6384 }
6385 
6386 char LoopVectorize::ID = 0;
6387 
6388 static const char lv_name[] = "Loop Vectorization";
6389 
6390 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6391 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6392 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6393 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6394 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6395 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6396 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6397 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6398 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6399 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6400 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6401 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6402 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6403 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6404 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6405 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6406 
6407 namespace llvm {
6408 
6409 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6410 
6411 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6412                               bool VectorizeOnlyWhenForced) {
6413   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6414 }
6415 
6416 } // end namespace llvm
6417 
6418 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6419   // Check if the pointer operand of a load or store instruction is
6420   // consecutive.
6421   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6422     return Legal->isConsecutivePtr(Ptr);
6423   return false;
6424 }
6425 
6426 void LoopVectorizationCostModel::collectValuesToIgnore() {
6427   // Ignore ephemeral values.
6428   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6429 
6430   // Ignore type-promoting instructions we identified during reduction
6431   // detection.
6432   for (auto &Reduction : Legal->getReductionVars()) {
6433     RecurrenceDescriptor &RedDes = Reduction.second;
6434     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6435     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6436   }
6437   // Ignore type-casting instructions we identified during induction
6438   // detection.
6439   for (auto &Induction : Legal->getInductionVars()) {
6440     InductionDescriptor &IndDes = Induction.second;
6441     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6442     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6443   }
6444 }
6445 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do that yet because VPlan does not
// have a cost model that can choose which plan to execute when
// more than one is generated.
6451 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6452                                  LoopVectorizationCostModel &CM) {
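  // Derive a VF that fills the widest vector register with the widest scalar
  // type used in the loop; e.g. 256-bit registers and a widest type of 32 bits
  // yield a VF of 8.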
6453   unsigned WidestType;
6454   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6455   return WidestVectorRegBits / WidestType;
6456 }
6457 
6458 VectorizationFactor
6459 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6460   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
6462   // transformations before even evaluating whether vectorization is profitable.
6463   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6464   // the vectorization pipeline.
6465   if (!OrigLoop->empty()) {
6466     // If the user doesn't provide a vectorization factor, determine a
6467     // reasonable one.
6468     if (!UserVF) {
6469       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6470       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6471 
6472       // Make sure we have a VF > 1 for stress testing.
6473       if (VPlanBuildStressTest && VF < 2) {
6474         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6475                           << "overriding computed VF.\n");
6476         VF = 4;
6477       }
6478     }
6479     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6480     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6481     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6482                       << " to build VPlans.\n");
6483     buildVPlans(VF, VF);
6484 
6485     // For VPlan build stress testing, we bail out after VPlan construction.
6486     if (VPlanBuildStressTest)
6487       return VectorizationFactor::Disabled();
6488 
6489     return {VF, 0};
6490   }
6491 
6492   LLVM_DEBUG(
6493       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6494                 "VPlan-native path.\n");
6495   return VectorizationFactor::Disabled();
6496 }
6497 
6498 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6499   assert(OrigLoop->empty() && "Inner loop expected.");
6500   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6502     return None;
6503 
6504   // Invalidate interleave groups if all blocks of loop will be predicated.
6505   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6506       !useMaskedInterleavedAccesses(*TTI)) {
6507     LLVM_DEBUG(
6508         dbgs()
6509         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6510            "which requires masked-interleaved support.\n");
6511     CM.InterleaveInfo.reset();
6512   }
6513 
6514   if (UserVF) {
6515     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6516     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6517     // Collect the instructions (and their associated costs) that will be more
6518     // profitable to scalarize.
6519     CM.selectUserVectorizationFactor(UserVF);
6520     buildVPlansWithVPRecipes(UserVF, UserVF);
6521     LLVM_DEBUG(printPlans(dbgs()));
6522     return {{UserVF, 0}};
6523   }
6524 
6525   unsigned MaxVF = MaybeMaxVF.getValue();
6526   assert(MaxVF != 0 && "MaxVF is zero.");
6527 
6528   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6529     // Collect Uniform and Scalar instructions after vectorization with VF.
6530     CM.collectUniformsAndScalars(VF);
6531 
6532     // Collect the instructions (and their associated costs) that will be more
6533     // profitable to scalarize.
6534     if (VF > 1)
6535       CM.collectInstsToScalarize(VF);
6536   }
6537 
6538   buildVPlansWithVPRecipes(1, MaxVF);
6539   LLVM_DEBUG(printPlans(dbgs()));
6540   if (MaxVF == 1)
6541     return VectorizationFactor::Disabled();
6542 
6543   // Select the optimal vectorization factor.
6544   return CM.selectVectorizationFactor(MaxVF);
6545 }
6546 
6547 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6548   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6549                     << '\n');
6550   BestVF = VF;
6551   BestUF = UF;
6552 
6553   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6554     return !Plan->hasVF(VF);
6555   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6557 }
6558 
6559 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6560                                            DominatorTree *DT) {
6561   // Perform the actual loop transformation.
6562 
6563   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6564   VPCallbackILV CallbackILV(ILV);
6565 
6566   VPTransformState State{BestVF, BestUF,      LI,
6567                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6568                          &ILV,   CallbackILV};
6569   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6570   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6571 
6572   //===------------------------------------------------===//
6573   //
  // Notice: any optimization or new instruction that goes
6575   // into the code below should also be implemented in
6576   // the cost-model.
6577   //
6578   //===------------------------------------------------===//
6579 
6580   // 2. Copy and widen instructions from the old loop into the new loop.
6581   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6582   VPlans.front()->execute(&State);
6583 
  // 3. Fix the vectorized code: take care of header phis, live-outs,
  //    predication, and updating analyses.
6586   ILV.fixVectorizedLoop();
6587 }
6588 
6589 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6590     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6591   BasicBlock *Latch = OrigLoop->getLoopLatch();
6592 
6593   // We create new control-flow for the vectorized loop, so the original
6594   // condition will be dead after vectorization if it's only used by the
6595   // branch.
6596   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6597   if (Cmp && Cmp->hasOneUse())
6598     DeadInstructions.insert(Cmp);
6599 
6600   // We create new "steps" for induction variable updates to which the original
6601   // induction variables map. An original update instruction will be dead if
6602   // all its users except the induction variable are dead.
6603   for (auto &Induction : Legal->getInductionVars()) {
6604     PHINode *Ind = Induction.first;
6605     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6606     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6607           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6608                                  DeadInstructions.end();
6609         }))
6610       DeadInstructions.insert(IndUpdate);
6611 
    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi and the casted
    // value of the phi are the same. The last instruction in this casting chain
6617     // will get its scalar/vector/widened def from the scalar/vector/widened def
6618     // of the respective phi node. Any other casts in the induction def-use chain
6619     // have no other uses outside the phi update chain, and will be ignored.
6620     InductionDescriptor &IndDes = Induction.second;
6621     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6622     DeadInstructions.insert(Casts.begin(), Casts.end());
6623   }
6624 }
6625 
6626 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6627 
6628 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6629 
6630 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6631                                         Instruction::BinaryOps BinOp) {
6632   // When unrolling and the VF is 1, we only need to add a simple scalar.
6633   Type *Ty = Val->getType();
6634   assert(!Ty->isVectorTy() && "Val must be a scalar");
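  // With VF == 1 this boils down to Val + StartIdx * Step for integer
  // inductions, and the analogous BinOp for floating-point inductions below.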
6635 
6636   if (Ty->isFloatingPointTy()) {
6637     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6638 
6639     // Floating point operations had to be 'fast' to enable the unrolling.
6640     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6641     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6642   }
6643   Constant *C = ConstantInt::get(Ty, StartIdx);
6644   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6645 }
6646 
6647 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6648   SmallVector<Metadata *, 4> MDs;
6649   // Reserve first location for self reference to the LoopID metadata node.
6650   MDs.push_back(nullptr);
6651   bool IsUnrollMetadata = false;
6652   MDNode *LoopID = L->getLoopID();
6653   if (LoopID) {
6654     // First find existing loop unrolling disable metadata.
6655     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6656       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6657       if (MD) {
6658         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6659         IsUnrollMetadata =
6660             S && S->getString().startswith("llvm.loop.unroll.disable");
6661       }
6662       MDs.push_back(LoopID->getOperand(i));
6663     }
6664   }
6665 
6666   if (!IsUnrollMetadata) {
6667     // Add runtime unroll disable metadata.
6668     LLVMContext &Context = L->getHeader()->getContext();
6669     SmallVector<Metadata *, 1> DisableOperands;
6670     DisableOperands.push_back(
6671         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6672     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6673     MDs.push_back(DisableNode);
6674     MDNode *NewLoopID = MDNode::get(Context, MDs);
6675     // Set operand 0 to refer to the loop id itself.
6676     NewLoopID->replaceOperandWith(0, NewLoopID);
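    // The resulting self-referential loop ID looks, for example, like:
    //   !0 = distinct !{!0, ..., !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}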
6677     L->setLoopID(NewLoopID);
6678   }
6679 }
6680 
6681 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6682     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6683   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
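  // Clamp Range.End to the largest VF (exclusive) for which the predicate
  // keeps the value it has at Range.Start. For example, for Range = {2, 16}
  // with the predicate holding at VF 2 and 4 but not at 8, Range becomes
  // {2, 8} and true is returned.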
6684   bool PredicateAtRangeStart = Predicate(Range.Start);
6685 
6686   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6687     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6688       Range.End = TmpVF;
6689       break;
6690     }
6691 
6692   return PredicateAtRangeStart;
6693 }
6694 
6695 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6696 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6697 /// of VF's starting at a given VF and extending it as much as possible. Each
6698 /// vectorization decision can potentially shorten this sub-range during
6699 /// buildVPlan().
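/// For example, with MinVF = 1 and MaxVF = 8, the first call to buildVPlan may
/// clamp its sub-range to {1, 4}, producing one VPlan covering VF 1 and 2 and
/// a second VPlan for the remaining sub-range {4, 9}, i.e. VF 4 and 8.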
6700 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6701   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6702     VFRange SubRange = {VF, MaxVF + 1};
6703     VPlans.push_back(buildVPlan(SubRange));
6704     VF = SubRange.End;
6705   }
6706 }
6707 
6708 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6709                                          VPlanPtr &Plan) {
6710   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
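  // The mask of edge (Src, Dst) is the mask of Src ANDed with Src's branch
  // condition, negated when Dst is the false successor. A null mask stands
  // for an all-one mask, in which case the AND is omitted.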
6711 
6712   // Look for cached value.
6713   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6714   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6715   if (ECEntryIt != EdgeMaskCache.end())
6716     return ECEntryIt->second;
6717 
6718   VPValue *SrcMask = createBlockInMask(Src, Plan);
6719 
6720   // The terminator has to be a branch inst!
6721   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6722   assert(BI && "Unexpected terminator found");
6723 
6724   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6725     return EdgeMaskCache[Edge] = SrcMask;
6726 
6727   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6728   assert(EdgeMask && "No Edge Mask found for condition");
6729 
6730   if (BI->getSuccessor(0) != Dst)
6731     EdgeMask = Builder.createNot(EdgeMask);
6732 
6733   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6734     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6735 
6736   return EdgeMaskCache[Edge] = EdgeMask;
6737 }
6738 
6739 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6740   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6741 
6742   // Look for cached value.
6743   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6744   if (BCEntryIt != BlockMaskCache.end())
6745     return BCEntryIt->second;
6746 
6747   // All-one mask is modelled as no-mask following the convention for masked
6748   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6749   VPValue *BlockMask = nullptr;
6750 
6751   if (OrigLoop->getHeader() == BB) {
6752     if (!CM.blockNeedsPredication(BB))
6753       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6754 
6755     // Introduce the early-exit compare IV <= BTC to form header block mask.
6756     // This is used instead of IV < TC because TC may wrap, unlike BTC.
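    // For example, for an i8 induction running 256 iterations the trip count
    // wraps to 0 while the backedge-taken count is 255, so comparing against
    // BTC is the safe form of the early-exit check.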
6757     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6758     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6759     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6760     return BlockMaskCache[BB] = BlockMask;
6761   }
6762 
6763   // This is the block mask. We OR all incoming edges.
6764   for (auto *Predecessor : predecessors(BB)) {
6765     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6766     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6767       return BlockMaskCache[BB] = EdgeMask;
6768 
6769     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6770       BlockMask = EdgeMask;
6771       continue;
6772     }
6773 
6774     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6775   }
6776 
6777   return BlockMaskCache[BB] = BlockMask;
6778 }
6779 
6780 VPWidenMemoryInstructionRecipe *
6781 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6782                                   VPlanPtr &Plan) {
6783   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6784     return nullptr;
6785 
6786   auto willWiden = [&](unsigned VF) -> bool {
6787     if (VF == 1)
6788       return false;
6789     LoopVectorizationCostModel::InstWidening Decision =
6790         CM.getWideningDecision(I, VF);
6791     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6792            "CM decision should be taken at this point.");
6793     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6794       return true;
6795     if (CM.isScalarAfterVectorization(I, VF) ||
6796         CM.isProfitableToScalarize(I, VF))
6797       return false;
6798     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6799   };
6800 
6801   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6802     return nullptr;
6803 
6804   VPValue *Mask = nullptr;
6805   if (Legal->isMaskRequired(I))
6806     Mask = createBlockInMask(I->getParent(), Plan);
6807 
6808   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6809   return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
6810 }
6811 
6812 VPWidenIntOrFpInductionRecipe *
6813 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6814   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6815     // Check if this is an integer or fp induction. If so, build the recipe that
6816     // produces its scalar and vector values.
6817     InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
6818     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6819         II.getKind() == InductionDescriptor::IK_FpInduction)
6820       return new VPWidenIntOrFpInductionRecipe(Phi);
6821 
6822     return nullptr;
6823   }
6824 
6825   // Optimize the special case where the source is a constant integer
6826   // induction variable. Notice that we can only optimize the 'trunc' case
6827   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6828   // (c) other casts depend on pointer size.
6829 
6830   // Determine whether \p K is a truncation based on an induction variable that
6831   // can be optimized.
6832   auto isOptimizableIVTruncate =
6833       [&](Instruction *K) -> std::function<bool(unsigned)> {
6834     return
6835         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6836   };
6837 
6838   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6839                                isOptimizableIVTruncate(I), Range))
6840     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6841                                              cast<TruncInst>(I));
6842   return nullptr;
6843 }
6844 
6845 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6846   PHINode *Phi = dyn_cast<PHINode>(I);
6847   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6848     return nullptr;
6849 
6850   // We know that all PHIs in non-header blocks are converted into selects, so
6851   // we don't have to worry about the insertion order and we can just use the
6852   // builder. At this point we generate the predication tree. There may be
6853   // duplications since this is a simple recursive scan, but future
6854   // optimizations will clean it up.
6855 
6856   SmallVector<VPValue *, 2> Masks;
6857   unsigned NumIncoming = Phi->getNumIncomingValues();
6858   for (unsigned In = 0; In < NumIncoming; In++) {
6859     VPValue *EdgeMask =
6860       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6861     assert((EdgeMask || NumIncoming == 1) &&
6862            "Multiple predecessors with one having a full mask");
6863     if (EdgeMask)
6864       Masks.push_back(EdgeMask);
6865   }
6866   return new VPBlendRecipe(Phi, Masks);
6867 }
6868 
6869 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6870                                  VFRange &Range) {
6871 
6872   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6873       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6874 
6875   if (IsPredicated)
6876     return false;
6877 
6878   auto IsVectorizableOpcode = [](unsigned Opcode) {
6879     switch (Opcode) {
6880     case Instruction::Add:
6881     case Instruction::And:
6882     case Instruction::AShr:
6883     case Instruction::BitCast:
6884     case Instruction::Br:
6885     case Instruction::Call:
6886     case Instruction::FAdd:
6887     case Instruction::FCmp:
6888     case Instruction::FDiv:
6889     case Instruction::FMul:
6890     case Instruction::FNeg:
6891     case Instruction::FPExt:
6892     case Instruction::FPToSI:
6893     case Instruction::FPToUI:
6894     case Instruction::FPTrunc:
6895     case Instruction::FRem:
6896     case Instruction::FSub:
6897     case Instruction::ICmp:
6898     case Instruction::IntToPtr:
6899     case Instruction::Load:
6900     case Instruction::LShr:
6901     case Instruction::Mul:
6902     case Instruction::Or:
6903     case Instruction::PHI:
6904     case Instruction::PtrToInt:
6905     case Instruction::SDiv:
6906     case Instruction::Select:
6907     case Instruction::SExt:
6908     case Instruction::Shl:
6909     case Instruction::SIToFP:
6910     case Instruction::SRem:
6911     case Instruction::Store:
6912     case Instruction::Sub:
6913     case Instruction::Trunc:
6914     case Instruction::UDiv:
6915     case Instruction::UIToFP:
6916     case Instruction::URem:
6917     case Instruction::Xor:
6918     case Instruction::ZExt:
6919       return true;
6920     }
6921     return false;
6922   };
6923 
6924   if (!IsVectorizableOpcode(I->getOpcode()))
6925     return false;
6926 
6927   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6928     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6929     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6930                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6931       return false;
6932   }
6933 
6934   auto willWiden = [&](unsigned VF) -> bool {
6935     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6936                              CM.isProfitableToScalarize(I, VF)))
6937       return false;
6938     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6939       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag shows whether we use an intrinsic or a regular call for the
      // vectorized version of the instruction.
      // Is it beneficial to perform the intrinsic call rather than the lib
      // call?
6944       bool NeedToScalarize;
6945       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6946       bool UseVectorIntrinsic =
6947           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6948       return UseVectorIntrinsic || !NeedToScalarize;
6949     }
6950     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6951       assert(CM.getWideningDecision(I, VF) ==
6952                  LoopVectorizationCostModel::CM_Scalarize &&
6953              "Memory widening decisions should have been taken care by now");
6954       return false;
6955     }
6956     return true;
6957   };
6958 
6959   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6960     return false;
6961   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6962   // to avoid having to split recipes later.
6963   bool IsSingleton = Ingredient2Recipe.count(I);
6964 
6965   // Success: widen this instruction.
6966 
6967   // Use the default widening recipe. We optimize the common case where
6968   // consecutive instructions can be represented by a single recipe.
6969   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6970       LastExtensibleRecipe->appendInstruction(I))
6971     return true;
6972 
6973   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6974   if (!IsSingleton)
6975     LastExtensibleRecipe = WidenRecipe;
6976   setRecipe(I, WidenRecipe);
6977   VPBB->appendRecipe(WidenRecipe);
6978   return true;
6979 }
6980 
6981 VPBasicBlock *VPRecipeBuilder::handleReplication(
6982     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6983     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6984     VPlanPtr &Plan) {
6985   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6986       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6987       Range);
6988 
6989   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6990       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6991 
6992   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6993   setRecipe(I, Recipe);
6994 
6995   // Find if I uses a predicated instruction. If so, it will use its scalar
6996   // value. Avoid hoisting the insert-element which packs the scalar value into
6997   // a vector value, as that happens iff all users use the vector value.
6998   for (auto &Op : I->operands())
6999     if (auto *PredInst = dyn_cast<Instruction>(Op))
7000       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7001         PredInst2Recipe[PredInst]->setAlsoPack(false);
7002 
7003   // Finalize the recipe for Instr, first if it is not predicated.
7004   if (!IsPredicated) {
7005     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7006     VPBB->appendRecipe(Recipe);
7007     return VPBB;
7008   }
7009   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7010   assert(VPBB->getSuccessors().empty() &&
7011          "VPBB has successors when handling predicated replication.");
7012   // Record predicated instructions for above packing optimizations.
7013   PredInst2Recipe[I] = Recipe;
7014   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7015   VPBlockUtils::insertBlockAfter(Region, VPBB);
7016   auto *RegSucc = new VPBasicBlock();
7017   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7018   return RegSucc;
7019 }
7020 
7021 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7022                                                       VPRecipeBase *PredRecipe,
7023                                                       VPlanPtr &Plan) {
7024   // Instructions marked for predication are replicated and placed under an
7025   // if-then construct to prevent side-effects.
7026 
7027   // Generate recipes to compute the block mask for this region.
7028   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7029 
7030   // Build the triangular if-then region.
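  // The region entry block (pred.<opcode>.entry) branches on the mask either
  // to the predicated block (pred.<opcode>.if) holding the replicated
  // instruction, or directly to the continue block (pred.<opcode>.continue),
  // where a phi merges the predicated value when the instruction produces one.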
7031   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7032   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7033   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7034   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7035   auto *PHIRecipe =
7036       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7037   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7038   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7039   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7040 
7041   // Note: first set Entry as region entry and then connect successors starting
7042   // from it in order, to propagate the "parent" of each VPBasicBlock.
7043   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7044   VPBlockUtils::connectBlocks(Pred, Exit);
7045 
7046   return Region;
7047 }
7048 
7049 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7050                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7051   VPRecipeBase *Recipe = nullptr;
7052 
7053   // First, check for specific widening recipes that deal with memory
7054   // operations, inductions and Phi nodes.
7055   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7056       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7057       (Recipe = tryToBlend(Instr, Plan)) ||
7058       (isa<PHINode>(Instr) &&
7059        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7060     setRecipe(Instr, Recipe);
7061     VPBB->appendRecipe(Recipe);
7062     return true;
7063   }
7064 
7065   // Handle GEP widening.
7066   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7067     auto Scalarize = [&](unsigned VF) {
7068       return CM.isScalarWithPredication(Instr, VF) ||
7069              CM.isScalarAfterVectorization(Instr, VF) ||
7070              CM.isProfitableToScalarize(Instr, VF);
7071     };
7072     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7073       return false;
7074     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7075     setRecipe(Instr, Recipe);
7076     VPBB->appendRecipe(Recipe);
7077     return true;
7078   }
7079 
7080   // Check if Instr is to be widened by a general VPWidenRecipe, after
7081   // having first checked for specific widening recipes.
7082   if (tryToWiden(Instr, VPBB, Range))
7083     return true;
7084 
7085   return false;
7086 }
7087 
7088 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7089                                                         unsigned MaxVF) {
7090   assert(OrigLoop->empty() && "Inner loop expected.");
7091 
7092   // Collect conditions feeding internal conditional branches; they need to be
7093   // represented in VPlan for it to model masking.
7094   SmallPtrSet<Value *, 1> NeedDef;
7095 
7096   auto *Latch = OrigLoop->getLoopLatch();
7097   for (BasicBlock *BB : OrigLoop->blocks()) {
7098     if (BB == Latch)
7099       continue;
7100     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7101     if (Branch && Branch->isConditional())
7102       NeedDef.insert(Branch->getCondition());
7103   }
7104 
7105   // If the tail is to be folded by masking, the primary induction variable
7106   // needs to be represented in VPlan for it to model early-exit masking.
7107   // Also, both the Phi and the live-out instruction of each reduction are
7108   // required in order to introduce a select between them in VPlan.
7109   if (CM.foldTailByMasking()) {
7110     NeedDef.insert(Legal->getPrimaryInduction());
7111     for (auto &Reduction : Legal->getReductionVars()) {
7112       NeedDef.insert(Reduction.first);
7113       NeedDef.insert(Reduction.second.getLoopExitInstr());
7114     }
7115   }
7116 
7117   // Collect instructions from the original loop that will become trivially dead
7118   // in the vectorized loop. We don't need to vectorize these instructions. For
7119   // example, original induction update instructions can become dead because we
7120   // separately emit induction "steps" when generating code for the new loop.
7121   // Similarly, we create a new latch condition when setting up the structure
7122   // of the new loop, so the old one can become dead.
7123   SmallPtrSet<Instruction *, 4> DeadInstructions;
7124   collectTriviallyDeadInstructions(DeadInstructions);
7125 
7126   // Add assume instructions we need to drop to DeadInstructions, to prevent
7127   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7129   // control flow is preserved, we should keep them.
7130   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7131   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7132 
7133   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7134   // Dead instructions do not need sinking. Remove them from SinkAfter.
7135   for (Instruction *I : DeadInstructions)
7136     SinkAfter.erase(I);
7137 
7138   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7139     VFRange SubRange = {VF, MaxVF + 1};
7140     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7141                                              DeadInstructions, SinkAfter));
7142     VF = SubRange.End;
7143   }
7144 }
7145 
7146 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7147     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7148     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7149     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7150 
7151   // Hold a mapping from predicated instructions to their recipes, in order to
7152   // fix their AlsoPack behavior if a user is determined to replicate and use a
7153   // scalar instead of vector value.
7154   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7155 
7156   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7157 
7158   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7159 
7160   // ---------------------------------------------------------------------------
7161   // Pre-construction: record ingredients whose recipes we'll need to further
7162   // process after constructing the initial VPlan.
7163   // ---------------------------------------------------------------------------
7164 
7165   // Mark instructions we'll need to sink later and their targets as
7166   // ingredients whose recipe we'll need to record.
7167   for (auto &Entry : SinkAfter) {
7168     RecipeBuilder.recordRecipeOf(Entry.first);
7169     RecipeBuilder.recordRecipeOf(Entry.second);
7170   }
7171 
7172   // For each interleave group which is relevant for this (possibly trimmed)
7173   // Range, add it to the set of groups to be later applied to the VPlan and add
7174   // placeholders for its members' Recipes which we'll be replacing with a
7175   // single VPInterleaveRecipe.
7176   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7177     auto applyIG = [IG, this](unsigned VF) -> bool {
7178       return (VF >= 2 && // Query is illegal for VF == 1
7179               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7180                   LoopVectorizationCostModel::CM_Interleave);
7181     };
7182     if (!getDecisionAndClampRange(applyIG, Range))
7183       continue;
7184     InterleaveGroups.insert(IG);
7185     for (unsigned i = 0; i < IG->getFactor(); i++)
7186       if (Instruction *Member = IG->getMember(i))
7187         RecipeBuilder.recordRecipeOf(Member);
7188   };
7189 
7190   // ---------------------------------------------------------------------------
7191   // Build initial VPlan: Scan the body of the loop in a topological order to
7192   // visit each basic block after having visited its predecessor basic blocks.
7193   // ---------------------------------------------------------------------------
7194 
7195   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7196   auto Plan = std::make_unique<VPlan>();
7197   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7198   Plan->setEntry(VPBB);
7199 
7200   // Represent values that will have defs inside VPlan.
7201   for (Value *V : NeedDef)
7202     Plan->addVPValue(V);
7203 
7204   // Scan the body of the loop in a topological order to visit each basic block
7205   // after having visited its predecessor basic blocks.
7206   LoopBlocksDFS DFS(OrigLoop);
7207   DFS.perform(LI);
7208 
7209   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7210     // Relevant instructions from basic block BB will be grouped into VPRecipe
7211     // ingredients and fill a new VPBasicBlock.
7212     unsigned VPBBsForBB = 0;
7213     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7214     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7215     VPBB = FirstVPBBForBB;
7216     Builder.setInsertPoint(VPBB);
7217 
7218     // Introduce each ingredient into VPlan.
7219     for (Instruction &I : BB->instructionsWithoutDebug()) {
7220       Instruction *Instr = &I;
7221 
7222       // First filter out irrelevant instructions, to ensure no recipes are
7223       // built for them.
7224       if (isa<BranchInst>(Instr) ||
7225           DeadInstructions.find(Instr) != DeadInstructions.end())
7226         continue;
7227 
7228       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7229         continue;
7230 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7233       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7234           Instr, Range, VPBB, PredInst2Recipe, Plan);
7235       if (NextVPBB != VPBB) {
7236         VPBB = NextVPBB;
7237         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7238                                     : "");
7239       }
7240     }
7241   }
7242 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7246   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7247   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7248   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7249   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7250   delete PreEntry;
7251 
7252   // ---------------------------------------------------------------------------
7253   // Transform initial VPlan: Apply previously taken decisions, in order, to
7254   // bring the VPlan to its final state.
7255   // ---------------------------------------------------------------------------
7256 
7257   // Apply Sink-After legal constraints.
7258   for (auto &Entry : SinkAfter) {
7259     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7260     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7261     Sink->moveAfter(Target);
7262   }
7263 
7264   // Interleave memory: for each Interleave Group we marked earlier as relevant
7265   // for this VPlan, replace the Recipes widening its memory instructions with a
7266   // single VPInterleaveRecipe at its insertion point.
7267   for (auto IG : InterleaveGroups) {
7268     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7269         RecipeBuilder.getRecipe(IG->getInsertPos()));
7270     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7271         ->insertBefore(Recipe);
7272 
7273     for (unsigned i = 0; i < IG->getFactor(); ++i)
7274       if (Instruction *Member = IG->getMember(i)) {
7275         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7276       }
7277   }
7278 
7279   // Finally, if tail is folded by masking, introduce selects between the phi
7280   // and the live-out instruction of each reduction, at the end of the latch.
7281   if (CM.foldTailByMasking()) {
7282     Builder.setInsertPoint(VPBB);
7283     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7284     for (auto &Reduction : Legal->getReductionVars()) {
7285       VPValue *Phi = Plan->getVPValue(Reduction.first);
7286       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7287       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7288     }
7289   }
7290 
7291   std::string PlanName;
7292   raw_string_ostream RSO(PlanName);
7293   unsigned VF = Range.Start;
7294   Plan->addVF(VF);
7295   RSO << "Initial VPlan for VF={" << VF;
7296   for (VF *= 2; VF < Range.End; VF *= 2) {
7297     Plan->addVF(VF);
7298     RSO << "," << VF;
7299   }
7300   RSO << "},UF>=1";
7301   RSO.flush();
7302   Plan->setName(PlanName);
7303 
7304   return Plan;
7305 }
7306 
7307 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
7309   // transformations before even evaluating whether vectorization is profitable.
7310   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7311   // the vectorization pipeline.
7312   assert(!OrigLoop->empty());
7313   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7314 
7315   // Create new empty VPlan
7316   auto Plan = std::make_unique<VPlan>();
7317 
7318   // Build hierarchical CFG
7319   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7320   HCFGBuilder.buildHierarchicalCFG();
7321 
7322   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7323     Plan->addVF(VF);
7324 
7325   if (EnableVPlanPredication) {
7326     VPlanPredicator VPP(*Plan);
7327     VPP.predicate();
7328 
7329     // Avoid running transformation to recipes until masked code generation in
7330     // VPlan-native path is in place.
7331     return Plan;
7332   }
7333 
7334   SmallPtrSet<Instruction *, 1> DeadInstructions;
7335   VPlanTransforms::VPInstructionsToVPRecipes(
7336       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7337   return Plan;
7338 }
7339 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7344 
7345 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7346     Value *V, const VPIteration &Instance) {
7347   return ILV.getOrCreateScalarValue(V, Instance);
7348 }
7349 
7350 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7351                                VPSlotTracker &SlotTracker) const {
7352   O << " +\n"
7353     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7354   IG->getInsertPos()->printAsOperand(O, false);
7355   O << ", ";
7356   getAddr()->printAsOperand(O, SlotTracker);
7357   VPValue *Mask = getMask();
7358   if (Mask) {
7359     O << ", ";
7360     Mask->printAsOperand(O, SlotTracker);
7361   }
7362   O << "\\l\"";
7363   for (unsigned i = 0; i < IG->getFactor(); ++i)
7364     if (Instruction *I = IG->getMember(i))
7365       O << " +\n"
7366         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7367 }
7368 
7369 void VPWidenRecipe::execute(VPTransformState &State) {
7370   for (auto &Instr : make_range(Begin, End))
7371     State.ILV->widenInstruction(Instr);
7372 }
7373 
7374 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7375   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7376                       IsIndexLoopInvariant);
7377 }
7378 
7379 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7380   assert(!State.Instance && "Int or FP induction being replicated.");
7381   State.ILV->widenIntOrFpInduction(IV, Trunc);
7382 }
7383 
7384 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7385   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7386 }
7387 
7388 void VPBlendRecipe::execute(VPTransformState &State) {
7389   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7390   // We know that all PHIs in non-header blocks are converted into
7391   // selects, so we don't have to worry about the insertion order and we
7392   // can just use the builder.
7393   // At this point we generate the predication tree. There may be
7394   // duplications since this is a simple recursive scan, but future
7395   // optimizations will clean it up.
7396 
7397   unsigned NumIncoming = Phi->getNumIncomingValues();
7398 
7399   assert((User || NumIncoming == 1) &&
7400          "Multiple predecessors with predecessors having a full mask");
7401   // Generate a sequence of selects of the form:
7402   // SELECT(Mask3, In3,
7403   //      SELECT(Mask2, In2,
7404   //                   ( ...)))
7405   InnerLoopVectorizer::VectorParts Entry(State.UF);
7406   for (unsigned In = 0; In < NumIncoming; ++In) {
7407     for (unsigned Part = 0; Part < State.UF; ++Part) {
7408       // We might have single edge PHIs (blocks) - use an identity
7409       // 'select' for the first PHI operand.
7410       Value *In0 =
7411           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7412       if (In == 0)
7413         Entry[Part] = In0; // Initialize with the first incoming value.
7414       else {
7415         // Select between the current value and the previous incoming edge
7416         // based on the incoming mask.
7417         Value *Cond = State.get(User->getOperand(In), Part);
7418         Entry[Part] =
7419             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7420       }
7421     }
7422   }
7423   for (unsigned Part = 0; Part < State.UF; ++Part)
7424     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7425 }
7426 
7427 void VPInterleaveRecipe::execute(VPTransformState &State) {
7428   assert(!State.Instance && "Interleave group being replicated.");
7429   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7430                                       getMask());
7431 }
7432 
7433 void VPReplicateRecipe::execute(VPTransformState &State) {
7434   if (State.Instance) { // Generate a single instance.
7435     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7436     // Insert scalar instance packing it into a vector.
7437     if (AlsoPack && State.VF > 1) {
7438       // If we're constructing lane 0, initialize to start from undef.
7439       if (State.Instance->Lane == 0) {
7440         Value *Undef =
7441             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7442         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7443       }
7444       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7445     }
7446     return;
7447   }
7448 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for each
  // of the UF parts.
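  // For example, with UF = 2 and VF = 4 a non-uniform instruction gets eight
  // scalar copies, while a uniform one gets only two (lane 0 of each part).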
7452   unsigned EndLane = IsUniform ? 1 : State.VF;
7453   for (unsigned Part = 0; Part < State.UF; ++Part)
7454     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7455       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7456 }
7457 
7458 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7459   assert(State.Instance && "Branch on Mask works only on single instance.");
7460 
7461   unsigned Part = State.Instance->Part;
7462   unsigned Lane = State.Instance->Lane;
7463 
7464   Value *ConditionBit = nullptr;
7465   if (!User) // Block in mask is all-one.
7466     ConditionBit = State.Builder.getTrue();
7467   else {
7468     VPValue *BlockInMask = User->getOperand(0);
7469     ConditionBit = State.get(BlockInMask, Part);
7470     if (ConditionBit->getType()->isVectorTy())
7471       ConditionBit = State.Builder.CreateExtractElement(
7472           ConditionBit, State.Builder.getInt32(Lane));
7473   }
7474 
7475   // Replace the temporary unreachable terminator with a new conditional branch,
7476   // whose two destinations will be set later when they are created.
7477   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7478   assert(isa<UnreachableInst>(CurrentTerminator) &&
7479          "Expected to replace unreachable terminator with conditional branch.");
7480   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7481   CondBr->setSuccessor(0, nullptr);
7482   ReplaceInstWithInst(CurrentTerminator, CondBr);
7483 }
7484 
7485 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7486   assert(State.Instance && "Predicated instruction PHI works per instance.");
7487   Instruction *ScalarPredInst = cast<Instruction>(
7488       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7489   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7490   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7491   assert(PredicatingBB && "Predicated block has no single predecessor.");
7492 
7493   // By current pack/unpack logic we need to generate only a single phi node: if
7494   // a vector value for the predicated instruction exists at this point it means
7495   // the instruction has vector users only, and a phi for the vector value is
7496   // needed. In this case the recipe of the predicated instruction is marked to
7497   // also do that packing, thereby "hoisting" the insert-element sequence.
7498   // Otherwise, a phi node for the scalar value is needed.
7499   unsigned Part = State.Instance->Part;
7500   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7501     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7502     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7503     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7504     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7505     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7506     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7507   } else {
7508     Type *PredInstType = PredInst->getType();
7509     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7510     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7511     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7512     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7513   }
7514 }
7515 
7516 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7517   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
7518 }
7519 
7520 // Determine how to lower the scalar epilogue, which depends on 1) optimising
7521 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7522 // predication, and 4) a TTI hook that analyses whether the loop is suitable
7523 // for predication.
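// A loop hint forcing predication (case 3) typically comes from
// "#pragma clang loop vectorize_predicate(enable)" on the source loop.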
7524 static ScalarEpilogueLowering getScalarEpilogueLowering(
7525     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7526     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7527     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7528     LoopVectorizationLegality &LVL) {
7529   bool OptSize =
7530       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7531                                                      PGSOQueryType::IRPass);
7532   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7533   // don't look at hints or options, and don't request a scalar epilogue.
7534   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7535     return CM_ScalarEpilogueNotAllowedOptSize;
7536 
7537   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7538                               !PreferPredicateOverEpilog;
7539 
7540   // 2) Next, if disabling predication is requested on the command line, honour
7541   // this and request a scalar epilogue. Also do this if we don't have a
7542   // primary induction variable, which is required for predication.
7543   if (PredicateOptDisabled || !LVL.getPrimaryInduction())
7544     return CM_ScalarEpilogueAllowed;
7545 
  // 3) and 4) Check if predication is requested on the command line or with a
  // loop hint, or if the TTI hook indicates the loop is suitable for
  // predication and the hint does not disable it; if so, request predication.
7549   if (PreferPredicateOverEpilog ||
7550       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7551       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7552                                         LVL.getLAI()) &&
7553        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7554     return CM_ScalarEpilogueNotNeededUsePredicate;
7555 
7556   return CM_ScalarEpilogueAllowed;
7557 }
7558 
7559 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
7561 // VPlan-to-VPlan transformations from the very beginning without modifying the
7562 // input LLVM IR.
7563 static bool processLoopInVPlanNativePath(
7564     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7565     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7566     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7567     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7568     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7569 
7570   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7571   Function *F = L->getHeader()->getParent();
7572   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7573 
7574   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7575       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7576 
7577   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7578                                 &Hints, IAI);
7579   // Use the planner for outer loop vectorization.
7580   // TODO: CM is not used at this point inside the planner. Turn CM into an
7581   // optional argument if we don't need it in the future.
7582   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7583 
7584   // Get user vectorization factor.
7585   const unsigned UserVF = Hints.getWidth();
7586 
7587   // Plan how to best vectorize, return the best VF and its cost.
7588   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7589 
7590   // If we are stress testing VPlan builds, do not attempt to generate vector
7591   // code. Masked vector code generation support will follow soon.
7592   // Also, do not attempt to vectorize if no vector code will be produced.
7593   if (VPlanBuildStressTest || EnableVPlanPredication ||
7594       VectorizationFactor::Disabled() == VF)
7595     return false;
7596 
7597   LVP.setBestPlan(VF.Width, 1);
7598 
7599   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7600                          &CM);
7601   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7602                     << L->getHeader()->getParent()->getName() << "\"\n");
7603   LVP.executePlan(LB, DT);
7604 
7605   // Mark the loop as already vectorized to avoid vectorizing again.
7606   Hints.setAlreadyVectorized();
7607 
7608   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7609   return true;
7610 }
7611 
7612 bool LoopVectorizePass::processLoop(Loop *L) {
7613   assert((EnableVPlanNativePath || L->empty()) &&
7614          "VPlan-native path is not enabled. Only process inner loops.");
7615 
7616 #ifndef NDEBUG
7617   const std::string DebugLocStr = getDebugLocString(L);
7618 #endif /* NDEBUG */
7619 
7620   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7621                     << L->getHeader()->getParent()->getName() << "\" from "
7622                     << DebugLocStr << "\n");
7623 
7624   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7625 
7626   LLVM_DEBUG(
7627       dbgs() << "LV: Loop hints:"
7628              << " force="
7629              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7630                      ? "disabled"
7631                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7632                             ? "enabled"
7633                             : "?"))
7634              << " width=" << Hints.getWidth()
7635              << " unroll=" << Hints.getInterleave() << "\n");
7636 
  // Function containing the loop.
7638   Function *F = L->getHeader()->getParent();
7639 
7640   // Looking at the diagnostic output is the only way to determine if a loop
7641   // was vectorized (other than looking at the IR or machine code), so it
7642   // is important to generate an optimization remark for each loop. Most of
7643   // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are used more
  // sparingly, to report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
7647 
7648   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7649     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7650     return false;
7651   }
7652 
7653   PredicatedScalarEvolution PSE(*SE, *L);
7654 
7655   // Check if it is legal to vectorize the loop.
7656   LoopVectorizationRequirements Requirements(*ORE);
7657   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7658                                 &Requirements, &Hints, DB, AC);
7659   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7660     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7661     Hints.emitRemarkWithHints();
7662     return false;
7663   }
7664 
7665   // Check the function attributes and profiles to find out if this function
7666   // should be optimized for size.
7667   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7668       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7669 
7670   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7671   // here. They may require CFG and instruction level transformations before
7672   // even evaluating whether vectorization is profitable. Since we cannot modify
7673   // the incoming IR, we need to build VPlan upfront in the vectorization
7674   // pipeline.
7675   if (!L->empty())
7676     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7677                                         ORE, BFI, PSI, Hints);
7678 
7679   assert(L->empty() && "Inner loop expected.");
7680 
7681   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7682   // count by optimizing for size, to minimize overheads.
7683   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7684   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7685     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7686                       << "This loop is worth vectorizing only if no scalar "
7687                       << "iteration overheads are incurred.");
7688     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7689       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7690     else {
7691       LLVM_DEBUG(dbgs() << "\n");
7692       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7693     }
7694   }
7695 
7696   // Check the function attributes to see if implicit floats are allowed.
7697   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7698   // an integer loop and the vector instructions selected are purely integer
7699   // vector instructions?
7700   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7701     reportVectorizationFailure(
7702         "Can't vectorize when the NoImplicitFloat attribute is used",
7703         "loop not vectorized due to NoImplicitFloat attribute",
7704         "NoImplicitFloat", ORE, L);
7705     Hints.emitRemarkWithHints();
7706     return false;
7707   }
7708 
7709   // Check if the target supports potentially unsafe FP vectorization.
7710   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7711   // for the target we're vectorizing for, to make sure none of the
7712   // additional fp-math flags can help.
7713   if (Hints.isPotentiallyUnsafe() &&
7714       TTI->isFPVectorizationPotentiallyUnsafe()) {
7715     reportVectorizationFailure(
7716         "Potentially unsafe FP op prevents vectorization",
7717         "loop not vectorized due to unsafe FP support.",
7718         "UnsafeFP", ORE, L);
7719     Hints.emitRemarkWithHints();
7720     return false;
7721   }
7722 
7723   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7724   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7725 
7726   // If an override option has been passed in for interleaved accesses, use it.
7727   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7728     UseInterleaved = EnableInterleavedMemAccesses;
7729 
7730   // Analyze interleaved memory accesses.
7731   if (UseInterleaved) {
7732     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7733   }
7734 
7735   // Use the cost model.
7736   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7737                                 F, &Hints, IAI);
7738   CM.collectValuesToIgnore();
7739 
7740   // Use the planner for vectorization.
7741   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7742 
7743   // Get user vectorization factor.
7744   unsigned UserVF = Hints.getWidth();
7745 
7746   // Plan how to best vectorize, return the best VF and its cost.
7747   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7748 
7749   VectorizationFactor VF = VectorizationFactor::Disabled();
7750   unsigned IC = 1;
7751   unsigned UserIC = Hints.getInterleave();
7752 
7753   if (MaybeVF) {
7754     VF = *MaybeVF;
7755     // Select the interleave count.
7756     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7757   }
7758 
7759   // Identify the diagnostic messages that should be produced.
7760   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7761   bool VectorizeLoop = true, InterleaveLoop = true;
7762   if (Requirements.doesNotMeet(F, L, Hints)) {
7763     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7764                          "requirements.\n");
7765     Hints.emitRemarkWithHints();
7766     return false;
7767   }
7768 
7769   if (VF.Width == 1) {
7770     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7771     VecDiagMsg = std::make_pair(
7772         "VectorizationNotBeneficial",
7773         "the cost-model indicates that vectorization is not beneficial");
7774     VectorizeLoop = false;
7775   }
7776 
7777   if (!MaybeVF && UserIC > 1) {
7778     // Tell the user interleaving was avoided up-front, despite being explicitly
7779     // requested.
7780     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7781                          "interleaving should be avoided up front\n");
7782     IntDiagMsg = std::make_pair(
7783         "InterleavingAvoided",
7784         "Ignoring UserIC, because interleaving was avoided up front");
7785     InterleaveLoop = false;
7786   } else if (IC == 1 && UserIC <= 1) {
7787     // Tell the user interleaving is not beneficial.
7788     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7789     IntDiagMsg = std::make_pair(
7790         "InterleavingNotBeneficial",
7791         "the cost-model indicates that interleaving is not beneficial");
7792     InterleaveLoop = false;
7793     if (UserIC == 1) {
7794       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7795       IntDiagMsg.second +=
7796           " and is explicitly disabled or interleave count is set to 1";
7797     }
7798   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
7800     LLVM_DEBUG(
7801         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7802     IntDiagMsg = std::make_pair(
7803         "InterleavingBeneficialButDisabled",
7804         "the cost-model indicates that interleaving is beneficial "
7805         "but is explicitly disabled or interleave count is set to 1");
7806     InterleaveLoop = false;
7807   }
7808 
7809   // Override IC if user provided an interleave count.
7810   IC = UserIC > 0 ? UserIC : IC;
7811 
7812   // Emit diagnostic messages, if any.
7813   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7814   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7816     ORE->emit([&]() {
7817       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7818                                       L->getStartLoc(), L->getHeader())
7819              << VecDiagMsg.second;
7820     });
7821     ORE->emit([&]() {
7822       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7823                                       L->getStartLoc(), L->getHeader())
7824              << IntDiagMsg.second;
7825     });
7826     return false;
7827   } else if (!VectorizeLoop && InterleaveLoop) {
7828     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7829     ORE->emit([&]() {
7830       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7831                                         L->getStartLoc(), L->getHeader())
7832              << VecDiagMsg.second;
7833     });
7834   } else if (VectorizeLoop && !InterleaveLoop) {
7835     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7836                       << ") in " << DebugLocStr << '\n');
7837     ORE->emit([&]() {
7838       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7839                                         L->getStartLoc(), L->getHeader())
7840              << IntDiagMsg.second;
7841     });
7842   } else if (VectorizeLoop && InterleaveLoop) {
7843     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7844                       << ") in " << DebugLocStr << '\n');
7845     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7846   }
7847 
7848   LVP.setBestPlan(VF.Width, IC);
7849 
7850   using namespace ore;
7851   bool DisableRuntimeUnroll = false;
7852   MDNode *OrigLoopID = L->getLoopID();
7853 
7854   if (!VectorizeLoop) {
7855     assert(IC > 1 && "interleave count should not be 1 or 0");
7856     // If we decided that it is not legal to vectorize the loop, then
7857     // interleave it.
7858     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7859                                &CM);
7860     LVP.executePlan(Unroller, DT);
7861 
7862     ORE->emit([&]() {
7863       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7864                                 L->getHeader())
7865              << "interleaved loop (interleaved count: "
7866              << NV("InterleaveCount", IC) << ")";
7867     });
7868   } else {
7869     // If we decided that it is *legal* to vectorize the loop, then do it.
7870     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7871                            &LVL, &CM);
7872     LVP.executePlan(LB, DT);
7873     ++LoopsVectorized;
7874 
7875     // Add metadata to disable runtime unrolling a scalar loop when there are
7876     // no runtime checks about strides and memory. A scalar loop that is
7877     // rarely used is not worth unrolling.
7878     if (!LB.areSafetyChecksAdded())
7879       DisableRuntimeUnroll = true;
7880 
7881     // Report the vectorization decision.
7882     ORE->emit([&]() {
7883       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7884                                 L->getHeader())
7885              << "vectorized loop (vectorization width: "
7886              << NV("VectorizationFactor", VF.Width)
7887              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7888     });
7889   }
7890 
7891   Optional<MDNode *> RemainderLoopID =
7892       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7893                                       LLVMLoopVectorizeFollowupEpilogue});
7894   if (RemainderLoopID.hasValue()) {
7895     L->setLoopID(RemainderLoopID.getValue());
7896   } else {
7897     if (DisableRuntimeUnroll)
7898       AddRuntimeUnrollDisableMetaData(L);
7899 
7900     // Mark the loop as already vectorized to avoid vectorizing again.
7901     Hints.setAlreadyVectorized();
7902   }
7903 
7904   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7905   return true;
7906 }
7907 
7908 bool LoopVectorizePass::runImpl(
7909     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7910     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7911     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7912     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7913     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7914   SE = &SE_;
7915   LI = &LI_;
7916   TTI = &TTI_;
7917   DT = &DT_;
7918   BFI = &BFI_;
7919   TLI = TLI_;
7920   AA = &AA_;
7921   AC = &AC_;
7922   GetLAA = &GetLAA_;
7923   DB = &DB_;
7924   ORE = &ORE_;
7925   PSI = PSI_;
7926 
7927   // Don't attempt if
7928   // 1. the target claims to have no vector registers, and
7929   // 2. interleaving won't help ILP.
7930   //
7931   // The second condition is necessary because, even if the target has no
7932   // vector registers, loop vectorization may still enable scalar
7933   // interleaving.
7934   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7935       TTI->getMaxInterleaveFactor(1) < 2)
7936     return false;
7937 
7938   bool Changed = false;
7939 
7940   // The vectorizer requires loops to be in simplified form.
7941   // Since simplification may add new inner loops, it has to run before the
7942   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7944   // vectorized.
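  // (Simplified form means each loop has a preheader, a single backedge, and
  // dedicated exit blocks.)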
7945   for (auto &L : *LI)
7946     Changed |=
7947         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7948 
7949   // Build up a worklist of inner-loops to vectorize. This is necessary as
7950   // the act of vectorizing or partially unrolling a loop creates new loops
7951   // and can invalidate iterators across the loops.
7952   SmallVector<Loop *, 8> Worklist;
7953 
7954   for (Loop *L : *LI)
7955     collectSupportedLoops(*L, LI, ORE, Worklist);
7956 
7957   LoopsAnalyzed += Worklist.size();
7958 
7959   // Now walk the identified inner loops.
7960   while (!Worklist.empty()) {
7961     Loop *L = Worklist.pop_back_val();
7962 
7963     // For the inner loops we actually process, form LCSSA to simplify the
7964     // transform.
7965     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7966 
7967     Changed |= processLoop(L);
7968   }
7969 
7970   // Process each loop nest in the function.
7971   return Changed;
7972 }
7973 
7974 PreservedAnalyses LoopVectorizePass::run(Function &F,
7975                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
8017 }
8018