1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
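// For example (an illustrative sketch, not taken from this file), with a
// vectorization factor of 4 a scalar loop such as
//
//   for (i = 0; i < n; i += 1)
//     a[i] = b[i] + 42;
//
// is conceptually rewritten so that each 'wide' iteration processes four
// consecutive elements at once:
//
//   for (i = 0; i < n; i += 4)
//     a[i..i+3] = b[i..i+3] + 42;   // one SIMD iteration
//
// (Leftover iterations when n is not a multiple of 4 are handled by a scalar
// epilogue loop or by predication, as described below.)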
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
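// These followup attribute names are looked up in the loop's '!llvm.loop'
// metadata to find properties that should be attached to the loops produced by
// the transformation. A rough sketch of what such metadata can look like in IR
// (illustrative only; see the LLVM loop-metadata documentation for the exact
// encoding):
//
//   br i1 %exitcond, label %exit, label %loop.header, !llvm.loop !0
//   ...
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.vectorize.followup_vectorized", !2}
//   !2 = !{!"llvm.loop.isvectorized"}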
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Indicates that an epilogue is undesired; predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting the vectorization factor, "
192              "which will be determined by the smallest type in the loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with an estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if-predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 static cl::opt<bool>
269     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
270                            cl::Hidden,
271                            cl::desc("Prefer in-loop vector reductions, "
272                                     "overriding the target's preference."));
273 
274 cl::opt<bool> EnableVPlanNativePath(
275     "enable-vplan-native-path", cl::init(false), cl::Hidden,
276     cl::desc("Enable VPlan-native vectorization path with "
277              "support for outer loop vectorization."));
278 
279 // FIXME: Remove this switch once we have divergence analysis. Currently we
280 // assume divergent non-backedge branches when this switch is true.
281 cl::opt<bool> EnableVPlanPredication(
282     "enable-vplan-predication", cl::init(false), cl::Hidden,
283     cl::desc("Enable VPlan-native vectorization path predicator with "
284              "support for outer loop vectorization."));
285 
286 // This flag enables the stress testing of the VPlan H-CFG construction in the
287 // VPlan-native vectorization path. It must be used in conjunction with
288 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
289 // verification of the H-CFGs built.
290 static cl::opt<bool> VPlanBuildStressTest(
291     "vplan-build-stress-test", cl::init(false), cl::Hidden,
292     cl::desc(
293         "Build VPlan for every supported loop nest in the function and bail "
294         "out right after the build (stress test the VPlan H-CFG construction "
295         "in the VPlan-native vectorization path)."));
296 
297 cl::opt<bool> llvm::EnableLoopInterleaving(
298     "interleave-loops", cl::init(true), cl::Hidden,
299     cl::desc("Enable loop interleaving in Loop vectorization passes"));
300 cl::opt<bool> llvm::EnableLoopVectorization(
301     "vectorize-loops", cl::init(true), cl::Hidden,
302     cl::desc("Run the Loop vectorization passes"));
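
// All of the options above are ordinary cl::opt switches. As an illustration
// only (one possible invocation, not prescribed by this file), they can be
// passed to 'opt' together with the pass itself:
//
//   opt -passes=loop-vectorize -vectorizer-maximize-bandwidth \
//       -force-target-instruction-cost=1 -S input.ll -o vectorized.ll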
303 
304 /// A helper function that returns the type of loaded or stored value.
305 static Type *getMemInstValueType(Value *I) {
306   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
307          "Expected Load or Store instruction");
308   if (auto *LI = dyn_cast<LoadInst>(I))
309     return LI->getType();
310   return cast<StoreInst>(I)->getValueOperand()->getType();
311 }
312 
313 /// A helper function that returns true if the given type is irregular. The
314 /// type is irregular if its allocated size doesn't equal the store size of an
315 /// element of the corresponding vector type at the given vectorization factor.
316 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
317   // Determine if an array of VF elements of type Ty is "bitcast compatible"
318   // with a <VF x Ty> vector.
319   if (VF > 1) {
320     auto *VectorTy = FixedVectorType::get(Ty, VF);
321     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
322   }
323 
324   // If the vectorization factor is one, we just check if an array of type Ty
325   // requires padding between elements.
326   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
327 }
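
// For instance (illustrative, assuming a typical data layout): i1 has an
// allocation size of 8 bits but a type size of 1 bit, so it is irregular even
// at VF == 1, whereas i32 at VF == 4 occupies 4 * 4 bytes, exactly the store
// size of <4 x i32>, and is therefore regular.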
328 
329 /// A helper function that returns the reciprocal of the block probability of
330 /// predicated blocks. If we return X, we are assuming the predicated block
331 /// will execute once for every X iterations of the loop header.
332 ///
333 /// TODO: We should use actual block probability here, if available. Currently,
334 ///       we always assume predicated blocks have a 50% chance of executing.
335 static unsigned getReciprocalPredBlockProb() { return 2; }
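
// In other words (a restatement for illustration): under the 50% assumption
// above, a predicated block is expected to execute once for every
// getReciprocalPredBlockProb() == 2 iterations of the loop header, so its
// per-iteration cost contribution is roughly halved.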
336 
337 /// A helper function that adds a 'fast' flag to floating-point operations.
338 static Value *addFastMathFlag(Value *V) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
341   return V;
342 }
343 
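/// A helper function that applies the given fast-math flags \p FMF to \p V if
/// it is a floating-point operation (a no-op otherwise).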
344 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
345   if (isa<FPMathOperator>(V))
346     cast<Instruction>(V)->setFastMathFlags(FMF);
347   return V;
348 }
349 
350 /// A helper function that returns an integer or floating-point constant with
351 /// value C.
352 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
353   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
354                            : ConstantFP::get(Ty, C);
355 }
356 
357 /// Returns "best known" trip count for the specified loop \p L as defined by
358 /// the following procedure:
359 ///   1) Returns exact trip count if it is known.
360 ///   2) Returns expected trip count according to profile data if any.
361 ///   3) Returns upper bound estimate if it is known.
362 ///   4) Returns None if all of the above failed.
363 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
364   // Check if exact trip count is known.
365   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
366     return ExpectedTC;
367 
368   // Check if there is an expected trip count available from profile data.
369   if (LoopVectorizeWithBlockFrequency)
370     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
371       return EstimatedTC;
372 
373   // Check if upper bound estimate is known.
374   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
375     return ExpectedTC;
376 
377   return None;
378 }
379 
380 namespace llvm {
381 
382 /// InnerLoopVectorizer vectorizes loops which contain only one basic
383 /// block to a specified vectorization factor (VF).
384 /// This class performs the widening of scalars into vectors, or multiple
385 /// scalars. This class also implements the following features:
386 /// * It inserts an epilogue loop for handling loops that don't have iteration
387 ///   counts that are known to be a multiple of the vectorization factor.
388 /// * It handles the code generation for reduction variables.
389 /// * Scalarization (implementation using scalars) of un-vectorizable
390 ///   instructions.
391 /// InnerLoopVectorizer does not perform any vectorization-legality
392 /// checks, and relies on the caller to check for the different legality
393 /// aspects. The InnerLoopVectorizer relies on the
394 /// LoopVectorizationLegality class to provide information about the induction
395 /// and reduction variables that were found for a given vectorization factor.
396 class InnerLoopVectorizer {
397 public:
398   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
399                       LoopInfo *LI, DominatorTree *DT,
400                       const TargetLibraryInfo *TLI,
401                       const TargetTransformInfo *TTI, AssumptionCache *AC,
402                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
403                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
404                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
405                       ProfileSummaryInfo *PSI)
406       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
407         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
408         Builder(PSE.getSE()->getContext()),
409         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
410         BFI(BFI), PSI(PSI) {
411     // Query this against the original loop and save it here because the profile
412     // of the original loop header may change as the transformation happens.
413     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
414         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
415   }
416 
417   virtual ~InnerLoopVectorizer() = default;
418 
419   /// Create a new empty loop that will contain vectorized instructions later
420   /// on, while the old loop will be used as the scalar remainder. Control flow
421   /// is generated around the vectorized (and scalar epilogue) loops consisting
422   /// of various checks and bypasses. Return the pre-header block of the new
423   /// loop.
424   BasicBlock *createVectorizedLoopSkeleton();
425 
426   /// Widen a single instruction within the innermost loop.
427   void widenInstruction(Instruction &I, VPUser &Operands,
428                         VPTransformState &State);
429 
430   /// Widen a single call instruction within the innermost loop.
431   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
432                             VPTransformState &State);
433 
434   /// Widen a single select instruction within the innermost loop.
435   void widenSelectInstruction(SelectInst &I, VPUser &Operands,
436                               bool InvariantCond, VPTransformState &State);
437 
438   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
439   void fixVectorizedLoop();
440 
441   // Return true if any runtime check is added.
442   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
443 
444   /// A type for vectorized values in the new loop. Each value from the
445   /// original loop, when vectorized, is represented by UF vector values in the
446   /// new unrolled loop, where UF is the unroll factor.
447   using VectorParts = SmallVector<Value *, 2>;
448 
449   /// Vectorize a single GetElementPtrInst based on information gathered and
450   /// decisions taken during planning.
451   void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
452                 unsigned VF, bool IsPtrLoopInvariant,
453                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
454 
455   /// Vectorize a single PHINode in a block. This method handles the induction
456   /// variable canonicalization. It supports both VF = 1 (for unrolled loops)
457   /// and arbitrary-length vectors.
458   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
459 
460   /// A helper function to scalarize a single Instruction in the innermost loop.
461   /// Generates a sequence of scalar instances for each lane between \p MinLane
462   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
463   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
464   /// Instr's operands.
465   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
466                             const VPIteration &Instance, bool IfPredicateInstr,
467                             VPTransformState &State);
468 
469   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
470   /// is provided, the integer induction variable will first be truncated to
471   /// the corresponding type.
472   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
473 
474   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
475   /// vector or scalar value on-demand if one is not yet available. When
476   /// vectorizing a loop, we visit the definition of an instruction before its
477   /// uses. When visiting the definition, we either vectorize or scalarize the
478   /// instruction, creating an entry for it in the corresponding map. (In some
479   /// cases, such as induction variables, we will create both vector and scalar
480   /// entries.) Then, as we encounter uses of the definition, we derive values
481   /// for each scalar or vector use unless such a value is already available.
482   /// For example, if we scalarize a definition and one of its uses is vector,
483   /// we build the required vector on-demand with an insertelement sequence
484   /// when visiting the use. Otherwise, if the use is scalar, we can use the
485   /// existing scalar definition.
486   ///
487   /// Return a value in the new loop corresponding to \p V from the original
488   /// loop at unroll index \p Part. If the value has already been vectorized,
489   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
490   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
491   /// a new vector value on-demand by inserting the scalar values into a vector
492   /// with an insertelement sequence. If the value has been neither vectorized
493   /// nor scalarized, it must be loop invariant, so we simply broadcast the
494   /// value into a vector.
495   Value *getOrCreateVectorValue(Value *V, unsigned Part);
496 
497   /// Return a value in the new loop corresponding to \p V from the original
498   /// loop at unroll and vector indices \p Instance. If the value has been
499   /// vectorized but not scalarized, the necessary extractelement instruction
500   /// will be generated.
501   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
502 
503   /// Construct the vector value of a scalarized value \p V one lane at a time.
504   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
505 
506   /// Try to vectorize interleaved access group \p Group with the base address
507   /// given in \p Addr, optionally masking the vector operations if \p
508   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
509   /// values in the vectorized loop.
510   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
511                                 VPTransformState &State, VPValue *Addr,
512                                 VPValue *BlockInMask = nullptr);
513 
514   /// Vectorize Load and Store instructions with the base address given in \p
515   /// Addr, optionally masking the vector operations if \p BlockInMask is
516   /// non-null. Use \p State to translate given VPValues to IR values in the
517   /// vectorized loop.
518   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
519                                   VPValue *Addr, VPValue *StoredValue,
520                                   VPValue *BlockInMask);
521 
522   /// Set the debug location in the builder using the debug location in
523   /// the instruction.
524   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
525 
526   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
527   void fixNonInductionPHIs(void);
528 
529 protected:
530   friend class LoopVectorizationPlanner;
531 
532   /// A small list of PHINodes.
533   using PhiVector = SmallVector<PHINode *, 4>;
534 
535   /// A type for scalarized values in the new loop. Each value from the
536   /// original loop, when scalarized, is represented by UF x VF scalar values
537   /// in the new unrolled loop, where UF is the unroll factor and VF is the
538   /// vectorization factor.
539   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
540 
541   /// Set up the values of the IVs correctly when exiting the vector loop.
542   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
543                     Value *CountRoundDown, Value *EndValue,
544                     BasicBlock *MiddleBlock);
545 
546   /// Create a new induction variable inside L.
547   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
548                                    Value *Step, Instruction *DL);
549 
550   /// Handle all cross-iteration phis in the header.
551   void fixCrossIterationPHIs();
552 
553   /// Fix a first-order recurrence. This is the second phase of vectorizing
554   /// this phi node.
555   void fixFirstOrderRecurrence(PHINode *Phi);
556 
557   /// Fix a reduction cross-iteration phi. This is the second phase of
558   /// vectorizing this phi node.
559   void fixReduction(PHINode *Phi);
560 
561   /// Clear NSW/NUW flags from reduction instructions if necessary.
562   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
563 
564   /// The loop exit block may have single-value PHI nodes with some
565   /// incoming value. While vectorizing, we only handled real values
566   /// that were defined inside the loop, and we should have one value for
567   /// each predecessor of its parent basic block. See PR14725.
568   void fixLCSSAPHIs();
569 
570   /// Iteratively sink the scalarized operands of a predicated instruction into
571   /// the block that was created for it.
572   void sinkScalarOperands(Instruction *PredInst);
573 
574   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
575   /// represented as.
576   void truncateToMinimalBitwidths();
577 
578   /// Create a broadcast instruction. This method generates a broadcast
579   /// instruction (shuffle) for loop invariant values and for the induction
580   /// value. If this is the induction variable then we extend it to N, N+1, ...;
581   /// this is needed because each iteration in the loop corresponds to a SIMD
582   /// element.
583   virtual Value *getBroadcastInstrs(Value *V);
584 
585   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
586   /// to each vector element of Val. The sequence starts at StartIdx.
587   /// \p Opcode is relevant for FP induction variables.
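  /// For example (illustrative): with StartIdx == 0, Step == 1 and a 4-wide
  /// Val that is a splat of %base, the result is
  /// <%base + 0, %base + 1, %base + 2, %base + 3>.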
588   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
589                                Instruction::BinaryOps Opcode =
590                                Instruction::BinaryOpsEnd);
591 
592   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
593   /// variable on which to base the steps, \p Step is the size of the step, and
594   /// \p EntryVal is the value from the original loop that maps to the steps.
595   /// Note that \p EntryVal doesn't have to be an induction variable - it
596   /// can also be a truncate instruction.
597   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
598                         const InductionDescriptor &ID);
599 
600   /// Create a vector induction phi node based on an existing scalar one. \p
601   /// EntryVal is the value from the original loop that maps to the vector phi
602   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
603   /// truncate instruction, instead of widening the original IV, we widen a
604   /// version of the IV truncated to \p EntryVal's type.
605   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
606                                        Value *Step, Instruction *EntryVal);
607 
608   /// Returns true if an instruction \p I should be scalarized instead of
609   /// vectorized for the chosen vectorization factor.
610   bool shouldScalarizeInstruction(Instruction *I) const;
611 
612   /// Returns true if we should generate a scalar version of \p IV.
613   bool needsScalarInduction(Instruction *IV) const;
614 
615   /// If there is a cast involved in the induction variable \p ID, which should
616   /// be ignored in the vectorized loop body, this function records the
617   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
618   /// cast. We had already proved that the casted Phi is equal to the uncasted
619   /// Phi in the vectorized loop (under a runtime guard), and therefore
620   /// there is no need to vectorize the cast - the same value can be used in the
621   /// vector loop for both the Phi and the cast.
622   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
623   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
624   ///
625   /// \p EntryVal is the value from the original loop that maps to the vector
626   /// phi node and is used to distinguish which IV is currently being
627   /// processed - the original one (if \p EntryVal is a phi corresponding to the
628   /// original IV) or the "newly-created" one based on the proof mentioned above
629   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
630   /// latter case \p EntryVal is a TruncInst and we must not record anything for
631   /// that IV, but it's error-prone to expect callers of this routine to care
632   /// about that, hence this explicit parameter.
633   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
634                                              const Instruction *EntryVal,
635                                              Value *VectorLoopValue,
636                                              unsigned Part,
637                                              unsigned Lane = UINT_MAX);
638 
639   /// Generate a shuffle sequence that will reverse the vector Vec.
640   virtual Value *reverseVector(Value *Vec);
641 
642   /// Returns (and creates if needed) the original loop trip count.
643   Value *getOrCreateTripCount(Loop *NewLoop);
644 
645   /// Returns (and creates if needed) the trip count of the widened loop.
646   Value *getOrCreateVectorTripCount(Loop *NewLoop);
647 
648   /// Returns a bitcasted value to the requested vector type.
649   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
650   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
651                                 const DataLayout &DL);
652 
653   /// Emit a bypass check to see if the vector trip count is zero, including if
654   /// it overflows.
655   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
656 
657   /// Emit a bypass check to see if all of the SCEV assumptions we've
658   /// had to make are correct.
659   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
660 
661   /// Emit bypass checks to check any memory assumptions we may have made.
662   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
663 
664   /// Compute the transformed value of Index at offset StartValue using step
665   /// StepValue.
666   /// For integer induction, returns StartValue + Index * StepValue.
667   /// For pointer induction, returns StartValue[Index * StepValue].
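  /// For example (a simple illustration): an integer induction with
  /// StartValue 7 and StepValue 3 maps Index i to 7 + 3 * i.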
668   /// FIXME: The newly created binary instructions should contain nsw/nuw
669   /// flags, which can be found from the original scalar operations.
670   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
671                               const DataLayout &DL,
672                               const InductionDescriptor &ID) const;
673 
674   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
675   /// vector loop preheader, middle block and scalar preheader. Also
676   /// allocate a loop object for the new vector loop and return it.
677   Loop *createVectorLoopSkeleton(StringRef Prefix);
678 
679   /// Create new phi nodes for the induction variables to resume iteration count
680   /// in the scalar epilogue, from where the vectorized loop left off (given by
681   /// \p VectorTripCount).
682   void createInductionResumeValues(Loop *L, Value *VectorTripCount);
683 
684   /// Complete the loop skeleton by adding debug MDs, creating appropriate
685   /// conditional branches in the middle block, preparing the builder and
686   /// running the verifier. Take in the vector loop \p L as argument, and return
687   /// the preheader of the completed vector loop.
688   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
689 
690   /// Add additional metadata to \p To that was not present on \p Orig.
691   ///
692   /// Currently this is used to add the noalias annotations based on the
693   /// inserted memchecks.  Use this for instructions that are *cloned* into the
694   /// vector loop.
695   void addNewMetadata(Instruction *To, const Instruction *Orig);
696 
697   /// Add metadata from one instruction to another.
698   ///
699   /// This includes both the original MDs from \p From and additional ones (\see
700   /// addNewMetadata).  Use this for *newly created* instructions in the vector
701   /// loop.
702   void addMetadata(Instruction *To, Instruction *From);
703 
704   /// Similar to the previous function but it adds the metadata to a
705   /// vector of instructions.
706   void addMetadata(ArrayRef<Value *> To, Instruction *From);
707 
708   /// The original loop.
709   Loop *OrigLoop;
710 
711   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
712   /// dynamic knowledge to simplify SCEV expressions and converts them to a
713   /// more usable form.
714   PredicatedScalarEvolution &PSE;
715 
716   /// Loop Info.
717   LoopInfo *LI;
718 
719   /// Dominator Tree.
720   DominatorTree *DT;
721 
722   /// Alias Analysis.
723   AAResults *AA;
724 
725   /// Target Library Info.
726   const TargetLibraryInfo *TLI;
727 
728   /// Target Transform Info.
729   const TargetTransformInfo *TTI;
730 
731   /// Assumption Cache.
732   AssumptionCache *AC;
733 
734   /// Interface to emit optimization remarks.
735   OptimizationRemarkEmitter *ORE;
736 
737   /// LoopVersioning.  It's only set up (non-null) if memchecks were
738   /// used.
739   ///
740   /// This is currently only used to add no-alias metadata based on the
741   /// memchecks.  The actual versioning is performed manually.
742   std::unique_ptr<LoopVersioning> LVer;
743 
744   /// The vectorization SIMD factor to use. Each vector will have this many
745   /// vector elements.
746   unsigned VF;
747 
748   /// The vectorization unroll factor to use. Each scalar is vectorized to this
749   /// many different vector instructions.
750   unsigned UF;
751 
752   /// The builder that we use.
753   IRBuilder<> Builder;
754 
755   // --- Vectorization state ---
756 
757   /// The vector-loop preheader.
758   BasicBlock *LoopVectorPreHeader;
759 
760   /// The scalar-loop preheader.
761   BasicBlock *LoopScalarPreHeader;
762 
763   /// Middle Block between the vector and the scalar.
764   BasicBlock *LoopMiddleBlock;
765 
766   /// The ExitBlock of the scalar loop.
767   BasicBlock *LoopExitBlock;
768 
769   /// The vector loop body.
770   BasicBlock *LoopVectorBody;
771 
772   /// The scalar loop body.
773   BasicBlock *LoopScalarBody;
774 
775   /// A list of all bypass blocks. The first block is the entry of the loop.
776   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
777 
778   /// The new Induction variable which was added to the new block.
779   PHINode *Induction = nullptr;
780 
781   /// The induction variable of the old basic block.
782   PHINode *OldInduction = nullptr;
783 
784   /// Maps values from the original loop to their corresponding values in the
785   /// vectorized loop. A key value can map to either vector values, scalar
786   /// values or both kinds of values, depending on whether the key was
787   /// vectorized and scalarized.
788   VectorizerValueMap VectorLoopValueMap;
789 
790   /// Store instructions that were predicated.
791   SmallVector<Instruction *, 4> PredicatedInstructions;
792 
793   /// Trip count of the original loop.
794   Value *TripCount = nullptr;
795 
796   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
797   Value *VectorTripCount = nullptr;
798 
799   /// The legality analysis.
800   LoopVectorizationLegality *Legal;
801 
802   /// The profitability analysis.
803   LoopVectorizationCostModel *Cost;
804 
805   // Record whether runtime checks are added.
806   bool AddedSafetyChecks = false;
807 
808   // Holds the end values for each induction variable. We save the end values
809   // so we can later fix-up the external users of the induction variables.
810   DenseMap<PHINode *, Value *> IVEndValues;
811 
812   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
813   // fixed up at the end of vector code generation.
814   SmallVector<PHINode *, 8> OrigPHIsToFix;
815 
816   /// BFI and PSI are used to check for profile-guided size optimizations.
817   BlockFrequencyInfo *BFI;
818   ProfileSummaryInfo *PSI;
819 
820   // Whether this loop should be optimized for size based on profile-guided
821   // size optimizations.
822   bool OptForSizeBasedOnProfile;
823 };
824 
825 class InnerLoopUnroller : public InnerLoopVectorizer {
826 public:
827   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
828                     LoopInfo *LI, DominatorTree *DT,
829                     const TargetLibraryInfo *TLI,
830                     const TargetTransformInfo *TTI, AssumptionCache *AC,
831                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
832                     LoopVectorizationLegality *LVL,
833                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
834                     ProfileSummaryInfo *PSI)
835       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
836                             UnrollFactor, LVL, CM, BFI, PSI) {}
837 
838 private:
839   Value *getBroadcastInstrs(Value *V) override;
840   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
841                        Instruction::BinaryOps Opcode =
842                        Instruction::BinaryOpsEnd) override;
843   Value *reverseVector(Value *Vec) override;
844 };
845 
846 } // end namespace llvm
847 
848 /// Look for a meaningful debug location on the instruction or its
849 /// operands.
850 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
851   if (!I)
852     return I;
853 
854   DebugLoc Empty;
855   if (I->getDebugLoc() != Empty)
856     return I;
857 
858   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
859     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
860       if (OpInst->getDebugLoc() != Empty)
861         return OpInst;
862   }
863 
864   return I;
865 }
866 
867 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
868   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
869     const DILocation *DIL = Inst->getDebugLoc();
870     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
871         !isa<DbgInfoIntrinsic>(Inst)) {
872       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
873       if (NewDIL)
874         B.SetCurrentDebugLocation(NewDIL.getValue());
875       else
876         LLVM_DEBUG(dbgs()
877                    << "Failed to create new discriminator: "
878                    << DIL->getFilename() << " Line: " << DIL->getLine());
879     }
880     else
881       B.SetCurrentDebugLocation(DIL);
882   } else
883     B.SetCurrentDebugLocation(DebugLoc());
884 }
885 
886 /// Write a record \p DebugMsg about vectorization failure to the debug
887 /// output stream. If \p I is passed, it is an instruction that prevents
888 /// vectorization.
889 #ifndef NDEBUG
890 static void debugVectorizationFailure(const StringRef DebugMsg,
891     Instruction *I) {
892   dbgs() << "LV: Not vectorizing: " << DebugMsg;
893   if (I != nullptr)
894     dbgs() << " " << *I;
895   else
896     dbgs() << '.';
897   dbgs() << '\n';
898 }
899 #endif
900 
901 /// Create an analysis remark that explains why vectorization failed
902 ///
903 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
904 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
905 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
906 /// the location of the remark.  \return the remark object that can be
907 /// streamed to.
908 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
909     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
910   Value *CodeRegion = TheLoop->getHeader();
911   DebugLoc DL = TheLoop->getStartLoc();
912 
913   if (I) {
914     CodeRegion = I->getParent();
915     // If there is no debug location attached to the instruction, fall back to
916     // using the loop's location.
917     if (I->getDebugLoc())
918       DL = I->getDebugLoc();
919   }
920 
921   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
922   R << "loop not vectorized: ";
923   return R;
924 }
925 
926 namespace llvm {
927 
928 void reportVectorizationFailure(const StringRef DebugMsg,
929     const StringRef OREMsg, const StringRef ORETag,
930     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
931   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
932   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
933   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
934                 ORETag, TheLoop, I) << OREMsg);
935 }
936 
937 } // end namespace llvm
938 
939 #ifndef NDEBUG
940 /// \return string containing a file name and a line # for the given loop.
941 static std::string getDebugLocString(const Loop *L) {
942   std::string Result;
943   if (L) {
944     raw_string_ostream OS(Result);
945     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
946       LoopDbgLoc.print(OS);
947     else
948       // Just print the module name.
949       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
950     OS.flush();
951   }
952   return Result;
953 }
954 #endif
955 
956 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
957                                          const Instruction *Orig) {
958   // If the loop was versioned with memchecks, add the corresponding no-alias
959   // metadata.
960   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
961     LVer->annotateInstWithNoAlias(To, Orig);
962 }
963 
964 void InnerLoopVectorizer::addMetadata(Instruction *To,
965                                       Instruction *From) {
966   propagateMetadata(To, From);
967   addNewMetadata(To, From);
968 }
969 
970 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
971                                       Instruction *From) {
972   for (Value *V : To) {
973     if (Instruction *I = dyn_cast<Instruction>(V))
974       addMetadata(I, From);
975   }
976 }
977 
978 namespace llvm {
979 
980 // Loop vectorization cost-model hints for how the scalar epilogue loop should
981 // be lowered.
982 enum ScalarEpilogueLowering {
983 
984   // The default: allowing scalar epilogues.
985   CM_ScalarEpilogueAllowed,
986 
987   // Vectorization with OptForSize: don't allow epilogues.
988   CM_ScalarEpilogueNotAllowedOptSize,
989 
990   // A special case of vectorization with OptForSize: loops with a very small
991   // trip count are considered for vectorization under OptForSize, thereby
992   // making sure the cost of their loop body is dominant, free of runtime
993   // guards and scalar iteration overheads.
994   CM_ScalarEpilogueNotAllowedLowTripLoop,
995 
996   // Loop hint predicate indicating an epilogue is undesired.
997   CM_ScalarEpilogueNotNeededUsePredicate
998 };
999 
1000 /// LoopVectorizationCostModel - estimates the expected speedups due to
1001 /// vectorization.
1002 /// In many cases vectorization is not profitable. This can happen for a
1003 /// number of reasons. In this class we mainly attempt to predict the
1004 /// expected speedup/slowdowns due to the supported instruction set. We use the
1005 /// TargetTransformInfo to query the different backends for the cost of
1006 /// different operations.
1007 class LoopVectorizationCostModel {
1008 public:
1009   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1010                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1011                              LoopVectorizationLegality *Legal,
1012                              const TargetTransformInfo &TTI,
1013                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1014                              AssumptionCache *AC,
1015                              OptimizationRemarkEmitter *ORE, const Function *F,
1016                              const LoopVectorizeHints *Hints,
1017                              InterleavedAccessInfo &IAI)
1018       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1019         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1020         Hints(Hints), InterleaveInfo(IAI) {}
1021 
1022   /// \return An upper bound for the vectorization factor, or None if
1023   /// vectorization and interleaving should be avoided up front.
1024   Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
1025 
1026   /// \return True if runtime checks are required for vectorization, and false
1027   /// otherwise.
1028   bool runtimeChecksRequired();
1029 
1030   /// \return The most profitable vectorization factor and the cost of that VF.
1031   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1032   /// then this vectorization factor will be selected if vectorization is
1033   /// possible.
1034   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1035 
1036   /// Setup cost-based decisions for user vectorization factor.
1037   void selectUserVectorizationFactor(unsigned UserVF) {
1038     collectUniformsAndScalars(UserVF);
1039     collectInstsToScalarize(UserVF);
1040   }
1041 
1042   /// \return The size (in bits) of the smallest and widest types in the code
1043   /// that needs to be vectorized. We ignore values that remain scalar such as
1044   /// 64-bit loop indices.
1045   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1046 
1047   /// \return The desired interleave count.
1048   /// If interleave count has been specified by metadata it will be returned.
1049   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1050   /// are the selected vectorization factor and the cost of the selected VF.
1051   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1052 
1053   /// A memory access instruction may be vectorized in more than one way.
1054   /// The form of the instruction after vectorization depends on its cost.
1055   /// This function makes cost-based decisions for Load/Store instructions
1056   /// and collects them in a map. This decision map is used for building
1057   /// the lists of loop-uniform and loop-scalar instructions.
1058   /// The calculated cost is saved with the widening decision in order to
1059   /// avoid redundant calculations.
1060   void setCostBasedWideningDecision(unsigned VF);
1061 
1062   /// A struct that represents some properties of the register usage
1063   /// of a loop.
1064   struct RegisterUsage {
1065     /// Holds the number of loop invariant values that are used in the loop.
1066     /// The key is ClassID of target-provided register class.
1067     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1068     /// Holds the maximum number of concurrent live intervals in the loop.
1069     /// The key is ClassID of target-provided register class.
1070     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1071   };
1072 
1073   /// \return Returns information about the register usages of the loop for the
1074   /// given vectorization factors.
1075   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1076 
1077   /// Collect values we want to ignore in the cost model.
1078   void collectValuesToIgnore();
1079 
1080   /// Split reductions into those that happen in the loop, and those that happen
1081   /// outside. In-loop reductions are collected into InLoopReductionChains.
1082   void collectInLoopReductions();
1083 
1084   /// \returns The smallest bitwidth each instruction can be represented with.
1085   /// The vector equivalents of these instructions should be truncated to this
1086   /// type.
1087   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1088     return MinBWs;
1089   }
1090 
1091   /// \returns True if it is more profitable to scalarize instruction \p I for
1092   /// vectorization factor \p VF.
1093   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1094     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1095 
1096     // Cost model is not run in the VPlan-native path - return conservative
1097     // result until this changes.
1098     if (EnableVPlanNativePath)
1099       return false;
1100 
1101     auto Scalars = InstsToScalarize.find(VF);
1102     assert(Scalars != InstsToScalarize.end() &&
1103            "VF not yet analyzed for scalarization profitability");
1104     return Scalars->second.find(I) != Scalars->second.end();
1105   }
1106 
1107   /// Returns true if \p I is known to be uniform after vectorization.
1108   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1109     if (VF == 1)
1110       return true;
1111 
1112     // Cost model is not run in the VPlan-native path - return conservative
1113     // result until this changes.
1114     if (EnableVPlanNativePath)
1115       return false;
1116 
1117     auto UniformsPerVF = Uniforms.find(VF);
1118     assert(UniformsPerVF != Uniforms.end() &&
1119            "VF not yet analyzed for uniformity");
1120     return UniformsPerVF->second.count(I);
1121   }
1122 
1123   /// Returns true if \p I is known to be scalar after vectorization.
1124   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1125     if (VF == 1)
1126       return true;
1127 
1128     // Cost model is not run in the VPlan-native path - return conservative
1129     // result until this changes.
1130     if (EnableVPlanNativePath)
1131       return false;
1132 
1133     auto ScalarsPerVF = Scalars.find(VF);
1134     assert(ScalarsPerVF != Scalars.end() &&
1135            "Scalar values are not calculated for VF");
1136     return ScalarsPerVF->second.count(I);
1137   }
1138 
1139   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1140   /// for vectorization factor \p VF.
1141   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1142     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1143            !isProfitableToScalarize(I, VF) &&
1144            !isScalarAfterVectorization(I, VF);
1145   }
1146 
1147   /// Decision that was taken during cost calculation for a memory instruction.
1148   enum InstWidening {
1149     CM_Unknown,
1150     CM_Widen,         // For consecutive accesses with stride +1.
1151     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1152     CM_Interleave,
1153     CM_GatherScatter,
1154     CM_Scalarize
1155   };
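
  // For illustration (not exhaustive): a load from a[i] with unit stride is a
  // candidate for CM_Widen, a[-i] for CM_Widen_Reverse, and a non-consecutive
  // access such as a[2*i] may end up as CM_GatherScatter or CM_Scalarize,
  // depending on the computed cost.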
1156 
1157   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1158   /// instruction \p I and vector width \p VF.
1159   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1160                            unsigned Cost) {
1161     assert(VF >= 2 && "Expected VF >=2");
1162     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1163   }
1164 
1165   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1166   /// interleaving group \p Grp and vector width \p VF.
1167   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1168                            InstWidening W, unsigned Cost) {
1169     assert(VF >= 2 && "Expected VF >=2");
1170     /// Broadcast this decision to all instructions inside the group.
1171     /// But the cost will be assigned to one instruction only.
1172     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1173       if (auto *I = Grp->getMember(i)) {
1174         if (Grp->getInsertPos() == I)
1175           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1176         else
1177           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1178       }
1179     }
1180   }
1181 
1182   /// Return the cost model decision for the given instruction \p I and vector
1183   /// width \p VF. Return CM_Unknown if this instruction did not pass
1184   /// through the cost modeling.
1185   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1186     assert(VF >= 2 && "Expected VF >=2");
1187 
1188     // Cost model is not run in the VPlan-native path - return conservative
1189     // result until this changes.
1190     if (EnableVPlanNativePath)
1191       return CM_GatherScatter;
1192 
1193     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1194     auto Itr = WideningDecisions.find(InstOnVF);
1195     if (Itr == WideningDecisions.end())
1196       return CM_Unknown;
1197     return Itr->second.first;
1198   }
1199 
1200   /// Return the vectorization cost for the given instruction \p I and vector
1201   /// width \p VF.
1202   unsigned getWideningCost(Instruction *I, unsigned VF) {
1203     assert(VF >= 2 && "Expected VF >=2");
1204     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1205     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1206            "The cost is not calculated");
1207     return WideningDecisions[InstOnVF].second;
1208   }
1209 
1210   /// Return True if instruction \p I is an optimizable truncate whose operand
1211   /// is an induction variable. Such a truncate will be removed by adding a new
1212   /// induction variable with the destination type.
1213   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1214     // If the instruction is not a truncate, return false.
1215     auto *Trunc = dyn_cast<TruncInst>(I);
1216     if (!Trunc)
1217       return false;
1218 
1219     // Get the source and destination types of the truncate.
1220     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1221     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1222 
1223     // If the truncate is free for the given types, return false. Replacing a
1224     // free truncate with an induction variable would add an induction variable
1225     // update instruction to each iteration of the loop. We exclude from this
1226     // check the primary induction variable since it will need an update
1227     // instruction regardless.
1228     Value *Op = Trunc->getOperand(0);
1229     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1230       return false;
1231 
1232     // If the truncated value is not an induction variable, return false.
1233     return Legal->isInductionPhi(Op);
1234   }
1235 
1236   /// Collects the instructions to scalarize for each predicated instruction in
1237   /// the loop.
1238   void collectInstsToScalarize(unsigned VF);
1239 
  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decision for Load/Store instructions that may
  /// be vectorized as interleaved, gather/scatter or scalarized accesses.
1243   void collectUniformsAndScalars(unsigned VF) {
1244     // Do the analysis once.
1245     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1246       return;
1247     setCostBasedWideningDecision(VF);
1248     collectLoopUniforms(VF);
1249     collectLoopScalars(VF);
1250   }
1251 
1252   /// Returns true if the target machine supports masked store operation
1253   /// for the given \p DataType and kind of access to \p Ptr.
1254   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1255     return Legal->isConsecutivePtr(Ptr) &&
1256            TTI.isLegalMaskedStore(DataType, Alignment);
1257   }
1258 
1259   /// Returns true if the target machine supports masked load operation
1260   /// for the given \p DataType and kind of access to \p Ptr.
1261   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1262     return Legal->isConsecutivePtr(Ptr) &&
1263            TTI.isLegalMaskedLoad(DataType, Alignment);
1264   }
1265 
1266   /// Returns true if the target machine supports masked scatter operation
1267   /// for the given \p DataType.
1268   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1269     return TTI.isLegalMaskedScatter(DataType, Alignment);
1270   }
1271 
1272   /// Returns true if the target machine supports masked gather operation
1273   /// for the given \p DataType.
1274   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1275     return TTI.isLegalMaskedGather(DataType, Alignment);
1276   }
1277 
1278   /// Returns true if the target machine can represent \p V as a masked gather
1279   /// or scatter operation.
1280   bool isLegalGatherOrScatter(Value *V) {
1281     bool LI = isa<LoadInst>(V);
1282     bool SI = isa<StoreInst>(V);
1283     if (!LI && !SI)
1284       return false;
1285     auto *Ty = getMemInstValueType(V);
1286     Align Align = getLoadStoreAlignment(V);
1287     return (LI && isLegalMaskedGather(Ty, Align)) ||
1288            (SI && isLegalMaskedScatter(Ty, Align));
1289   }
1290 
1291   /// Returns true if \p I is an instruction that will be scalarized with
1292   /// predication. Such instructions include conditional stores and
1293   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1296   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1297 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1301   bool isPredicatedInst(Instruction *I) {
1302     if (!blockNeedsPredication(I->getParent()))
1303       return false;
1304     // Loads and stores that need some form of masked operation are predicated
1305     // instructions.
1306     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1307       return Legal->isMaskRequired(I);
1308     return isScalarWithPredication(I);
1309   }
1310 
1311   /// Returns true if \p I is a memory instruction with consecutive memory
1312   /// access that can be widened.
1313   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1314 
1315   /// Returns true if \p I is a memory instruction in an interleaved-group
1316   /// of memory accesses that can be vectorized with wide vector loads/stores
1317   /// and shuffles.
1318   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1319 
1320   /// Check if \p Instr belongs to any interleaved access group.
1321   bool isAccessInterleaved(Instruction *Instr) {
1322     return InterleaveInfo.isInterleaved(Instr);
1323   }
1324 
1325   /// Get the interleaved access group that \p Instr belongs to.
1326   const InterleaveGroup<Instruction> *
1327   getInterleavedAccessGroup(Instruction *Instr) {
1328     return InterleaveInfo.getInterleaveGroup(Instr);
1329   }
1330 
1331   /// Returns true if an interleaved group requires a scalar iteration
1332   /// to handle accesses with gaps, and there is nothing preventing us from
1333   /// creating a scalar epilogue.
1334   bool requiresScalarEpilogue() const {
1335     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1336   }
1337 
  /// Returns true if a scalar epilogue is allowed; it may be disallowed, for
  /// example, due to optsize or a loop hint annotation.
1340   bool isScalarEpilogueAllowed() const {
1341     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1342   }
1343 
1344   /// Returns true if all loop blocks should be masked to fold tail loop.
1345   bool foldTailByMasking() const { return FoldTailByMasking; }
1346 
  /// Returns true if the instructions in \p BB need to be predicated, either
  /// because the block is conditionally executed in the original loop or
  /// because all blocks are masked when folding the tail.
  bool blockNeedsPredication(BasicBlock *BB) {
1348     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1349   }
1350 
1351   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1352   /// nodes to the chain of instructions representing the reductions. Uses a
1353   /// MapVector to ensure deterministic iteration order.
1354   using ReductionChainMap =
1355       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1356 
1357   /// Return the chain of instructions representing an inloop reduction.
1358   const ReductionChainMap &getInLoopReductionChains() const {
1359     return InLoopReductionChains;
1360   }
1361 
1362   /// Returns true if the Phi is part of an inloop reduction.
1363   bool isInLoopReduction(PHINode *Phi) const {
1364     return InLoopReductionChains.count(Phi);
1365   }
1366 
1367   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1368   /// with factor VF.  Return the cost of the instruction, including
1369   /// scalarization overhead if it's needed.
1370   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1371 
1372   /// Estimate cost of a call instruction CI if it were vectorized with factor
1373   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1377   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1378 
1379   /// Invalidates decisions already taken by the cost model.
1380   void invalidateCostModelingDecisions() {
1381     WideningDecisions.clear();
1382     Uniforms.clear();
1383     Scalars.clear();
1384   }
1385 
1386 private:
1387   unsigned NumPredStores = 0;
1388 
1389   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1390   /// than zero. One is returned if vectorization should best be avoided due
1391   /// to cost.
1392   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1393 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1401   using VectorizationCostTy = std::pair<unsigned, bool>;
1402 
1403   /// Returns the expected execution cost. The unit of the cost does
1404   /// not matter because we use the 'cost' units to compare different
1405   /// vector widths. The cost that is returned is *not* normalized by
1406   /// the factor width.
1407   VectorizationCostTy expectedCost(unsigned VF);
1408 
1409   /// Returns the execution time cost of an instruction for a given vector
1410   /// width. Vector width of one means scalar.
1411   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1412 
1413   /// The cost-computation logic from getInstructionCost which provides
1414   /// the vector type as an output parameter.
1415   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1416 
1417   /// Calculate vectorization cost of memory instruction \p I.
1418   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1419 
1420   /// The cost computation for scalarized memory instruction.
1421   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1422 
1423   /// The cost computation for interleaving group of memory instructions.
1424   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1425 
1426   /// The cost computation for Gather/Scatter instruction.
1427   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1428 
1429   /// The cost computation for widening instruction \p I with consecutive
1430   /// memory access.
1431   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1432 
  /// The cost calculation for a Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of the
  /// last element).
1437   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1438 
1439   /// Estimate the overhead of scalarizing an instruction. This is a
1440   /// convenience wrapper for the type-based getScalarizationOverhead API.
1441   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1442 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1445   bool isConsecutiveLoadOrStore(Instruction *I);
1446 
1447   /// Returns true if an artificially high cost for emulated masked memrefs
1448   /// should be used.
1449   bool useEmulatedMaskMemRefHack(Instruction *I);
1450 
1451   /// Map of scalar integer values to the smallest bitwidth they can be legally
1452   /// represented as. The vector equivalents of these values should be truncated
1453   /// to this type.
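  /// For example (illustrative only), an i32 value whose demanded bits fit
  /// into 8 bits may be mapped to a minimal bitwidth of 8, allowing its
  /// vector form to be computed as <VF x i8> instead of <VF x i32>.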
1454   MapVector<Instruction *, uint64_t> MinBWs;
1455 
1456   /// A type representing the costs for instructions if they were to be
1457   /// scalarized rather than vectorized. The entries are Instruction-Cost
1458   /// pairs.
1459   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1460 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1463   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1464 
1465   /// Records whether it is allowed to have the original scalar loop execute at
1466   /// least once. This may be needed as a fallback loop in case runtime
1467   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not divisible by the VF,
1469   /// or as a peel-loop to handle gaps in interleave-groups.
1470   /// Under optsize and when the trip count is very small we don't allow any
1471   /// iterations to execute in the scalar loop.
1472   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1473 
1474   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1475   bool FoldTailByMasking = false;
1476 
1477   /// A map holding scalar costs for different vectorization factors. The
1478   /// presence of a cost for an instruction in the mapping indicates that the
1479   /// instruction will be scalarized when vectorizing with the associated
1480   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1481   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1482 
1483   /// Holds the instructions known to be uniform after vectorization.
1484   /// The data is collected per VF.
1485   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1486 
1487   /// Holds the instructions known to be scalar after vectorization.
1488   /// The data is collected per VF.
1489   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1490 
1491   /// Holds the instructions (address computations) that are forced to be
1492   /// scalarized.
1493   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1494 
  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
1498   ReductionChainMap InLoopReductionChains;
1499 
1500   /// Returns the expected difference in cost from scalarizing the expression
1501   /// feeding a predicated instruction \p PredInst. The instructions to
1502   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1503   /// non-negative return value implies the expression will be scalarized.
1504   /// Currently, only single-use chains are considered for scalarization.
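  /// For example (illustrative only), an add feeding only a predicated store
  /// may be cheaper to keep scalar inside the predicated block than to widen
  /// it and extract the stored lane; the returned discount reflects that
  /// difference.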
1505   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1506                               unsigned VF);
1507 
1508   /// Collect the instructions that are uniform after vectorization. An
1509   /// instruction is uniform if we represent it with a single scalar value in
1510   /// the vectorized loop corresponding to each vector iteration. Examples of
1511   /// uniform instructions include pointer operands of consecutive or
1512   /// interleaved memory accesses. Note that although uniformity implies an
1513   /// instruction will be scalar, the reverse is not true. In general, a
1514   /// scalarized instruction will be represented by VF scalar values in the
1515   /// vectorized loop, each corresponding to an iteration of the original
1516   /// scalar loop.
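  /// For example (illustrative only), the address computation feeding a
  /// consecutive load of A[i] is uniform: a single scalar pointer per vector
  /// iteration feeds the wide load, whereas a non-uniform scalarized
  /// instruction is replicated once per lane.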
1517   void collectLoopUniforms(unsigned VF);
1518 
1519   /// Collect the instructions that are scalar after vectorization. An
1520   /// instruction is scalar if it is known to be uniform or will be scalarized
1521   /// during vectorization. Non-uniform scalarized instructions will be
1522   /// represented by VF values in the vectorized loop, each corresponding to an
1523   /// iteration of the original scalar loop.
1524   void collectLoopScalars(unsigned VF);
1525 
1526   /// Keeps cost model vectorization decision and cost for instructions.
1527   /// Right now it is used for memory instructions only.
1528   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1529                                 std::pair<InstWidening, unsigned>>;
1530 
1531   DecisionList WideningDecisions;
1532 
1533   /// Returns true if \p V is expected to be vectorized and it needs to be
1534   /// extracted.
1535   bool needsExtract(Value *V, unsigned VF) const {
1536     Instruction *I = dyn_cast<Instruction>(V);
1537     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1538       return false;
1539 
1540     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1542     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1543     // the scalars are collected. That should be a safe assumption in most
1544     // cases, because we check if the operands have vectorizable types
1545     // beforehand in LoopVectorizationLegality.
1546     return Scalars.find(VF) == Scalars.end() ||
1547            !isScalarAfterVectorization(I, VF);
1548   };
1549 
1550   /// Returns a range containing only operands needing to be extracted.
1551   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1552                                                    unsigned VF) {
1553     return SmallVector<Value *, 4>(make_filter_range(
1554         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1555   }
1556 
1557 public:
1558   /// The loop that we evaluate.
1559   Loop *TheLoop;
1560 
1561   /// Predicated scalar evolution analysis.
1562   PredicatedScalarEvolution &PSE;
1563 
1564   /// Loop Info analysis.
1565   LoopInfo *LI;
1566 
1567   /// Vectorization legality.
1568   LoopVectorizationLegality *Legal;
1569 
1570   /// Vector target information.
1571   const TargetTransformInfo &TTI;
1572 
1573   /// Target Library Info.
1574   const TargetLibraryInfo *TLI;
1575 
1576   /// Demanded bits analysis.
1577   DemandedBits *DB;
1578 
1579   /// Assumption cache.
1580   AssumptionCache *AC;
1581 
1582   /// Interface to emit optimization remarks.
1583   OptimizationRemarkEmitter *ORE;
1584 
  /// The function containing the loop.
  const Function *TheFunction;
1586 
1587   /// Loop Vectorize Hint.
1588   const LoopVectorizeHints *Hints;
1589 
1590   /// The interleave access information contains groups of interleaved accesses
1591   /// with the same stride and close to each other.
1592   InterleavedAccessInfo &InterleaveInfo;
1593 
1594   /// Values to ignore in the cost model.
1595   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1596 
1597   /// Values to ignore in the cost model when VF > 1.
1598   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1599 };
1600 
1601 } // end namespace llvm
1602 
1603 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1604 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1610 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1611 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1612 // provides *explicit vectorization hints* (LV can bypass legal checks and
1613 // assume that vectorization is legal). However, both hints are implemented
1614 // using the same metadata (llvm.loop.vectorize, processed by
1615 // LoopVectorizeHints). This will be fixed in the future when the native IR
1616 // representation for pragma 'omp simd' is introduced.
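// For example (illustrative only), an outer loop annotated with
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// or "#pragma omp simd simdlen(4)" is treated as explicitly vectorized here,
// while an unannotated outer loop is skipped.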
1617 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1618                                    OptimizationRemarkEmitter *ORE) {
1619   assert(!OuterLp->empty() && "This is not an outer loop");
1620   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1621 
1622   // Only outer loops with an explicit vectorization hint are supported.
1623   // Unannotated outer loops are ignored.
1624   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1625     return false;
1626 
1627   Function *Fn = OuterLp->getHeader()->getParent();
1628   if (!Hints.allowVectorization(Fn, OuterLp,
1629                                 true /*VectorizeOnlyWhenForced*/)) {
1630     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1631     return false;
1632   }
1633 
1634   if (Hints.getInterleave() > 1) {
1635     // TODO: Interleave support is future work.
1636     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1637                          "outer loops.\n");
1638     Hints.emitRemarkWithHints();
1639     return false;
1640   }
1641 
1642   return true;
1643 }
1644 
1645 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1646                                   OptimizationRemarkEmitter *ORE,
1647                                   SmallVectorImpl<Loop *> &V) {
1648   // Collect inner loops and outer loops without irreducible control flow. For
1649   // now, only collect outer loops that have explicit vectorization hints. If we
1650   // are stress testing the VPlan H-CFG construction, we collect the outermost
1651   // loop of every loop nest.
1652   if (L.empty() || VPlanBuildStressTest ||
1653       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1654     LoopBlocksRPO RPOT(&L);
1655     RPOT.perform(LI);
1656     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1657       V.push_back(&L);
1658       // TODO: Collect inner loops inside marked outer loops in case
1659       // vectorization fails for the outer loop. Do not invoke
1660       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1661       // already known to be reducible. We can use an inherited attribute for
1662       // that.
1663       return;
1664     }
1665   }
1666   for (Loop *InnerL : L)
1667     collectSupportedLoops(*InnerL, LI, ORE, V);
1668 }
1669 
1670 namespace {
1671 
1672 /// The LoopVectorize Pass.
1673 struct LoopVectorize : public FunctionPass {
1674   /// Pass identification, replacement for typeid
1675   static char ID;
1676 
1677   LoopVectorizePass Impl;
1678 
1679   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1680                          bool VectorizeOnlyWhenForced = false)
1681       : FunctionPass(ID),
1682         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1683     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1684   }
1685 
1686   bool runOnFunction(Function &F) override {
1687     if (skipFunction(F))
1688       return false;
1689 
1690     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1691     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1692     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1693     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1694     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1695     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1696     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1697     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1698     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1699     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1700     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1701     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1702     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1703 
1704     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1705         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1706 
1707     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1708                         GetLAA, *ORE, PSI).MadeAnyChange;
1709   }
1710 
1711   void getAnalysisUsage(AnalysisUsage &AU) const override {
1712     AU.addRequired<AssumptionCacheTracker>();
1713     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1714     AU.addRequired<DominatorTreeWrapperPass>();
1715     AU.addRequired<LoopInfoWrapperPass>();
1716     AU.addRequired<ScalarEvolutionWrapperPass>();
1717     AU.addRequired<TargetTransformInfoWrapperPass>();
1718     AU.addRequired<AAResultsWrapperPass>();
1719     AU.addRequired<LoopAccessLegacyAnalysis>();
1720     AU.addRequired<DemandedBitsWrapperPass>();
1721     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1722     AU.addRequired<InjectTLIMappingsLegacy>();
1723 
1724     // We currently do not preserve loopinfo/dominator analyses with outer loop
1725     // vectorization. Until this is addressed, mark these analyses as preserved
1726     // only for non-VPlan-native path.
1727     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1728     if (!EnableVPlanNativePath) {
1729       AU.addPreserved<LoopInfoWrapperPass>();
1730       AU.addPreserved<DominatorTreeWrapperPass>();
1731     }
1732 
1733     AU.addPreserved<BasicAAWrapperPass>();
1734     AU.addPreserved<GlobalsAAWrapperPass>();
1735     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1736   }
1737 };
1738 
1739 } // end anonymous namespace
1740 
1741 //===----------------------------------------------------------------------===//
1742 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1743 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1744 //===----------------------------------------------------------------------===//
1745 
1746 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1750   Instruction *Instr = dyn_cast<Instruction>(V);
1751   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1752                      (!Instr ||
1753                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1754   // Place the code for broadcasting invariant variables in the new preheader.
1755   IRBuilder<>::InsertPointGuard Guard(Builder);
1756   if (SafeToHoist)
1757     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1758 
1759   // Broadcast the scalar into all locations in the vector.
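  // For example (illustrative only), for VF = 4 and an i32 value %x this
  // typically expands to:
  //   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                      <4 x i32> undef, <4 x i32> zeroinitializer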
1760   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1761 
1762   return Shuf;
1763 }
1764 
1765 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1766     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1767   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1768          "Expected either an induction phi-node or a truncate of it!");
1769   Value *Start = II.getStartValue();
1770 
  // Construct the initial value of the vector IV in the vector loop
  // preheader.
1772   auto CurrIP = Builder.saveIP();
1773   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1774   if (isa<TruncInst>(EntryVal)) {
1775     assert(Start->getType()->isIntegerTy() &&
1776            "Truncation requires an integer type");
1777     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1778     Step = Builder.CreateTrunc(Step, TruncType);
1779     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1780   }
1781   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1782   Value *SteppedStart =
1783       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1784 
1785   // We create vector phi nodes for both integer and floating-point induction
1786   // variables. Here, we determine the kind of arithmetic we will perform.
1787   Instruction::BinaryOps AddOp;
1788   Instruction::BinaryOps MulOp;
1789   if (Step->getType()->isIntegerTy()) {
1790     AddOp = Instruction::Add;
1791     MulOp = Instruction::Mul;
1792   } else {
1793     AddOp = II.getInductionOpcode();
1794     MulOp = Instruction::FMul;
1795   }
1796 
1797   // Multiply the vectorization factor by the step using integer or
1798   // floating-point arithmetic as appropriate.
1799   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1800   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1801 
1802   // Create a vector splat to use in the induction update.
1803   //
1804   // FIXME: If the step is non-constant, we create the vector splat with
1805   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1806   //        handle a constant vector splat.
1807   Value *SplatVF = isa<Constant>(Mul)
1808                        ? ConstantVector::getSplat(ElementCount::getFixed(VF),
1809                                                   cast<Constant>(Mul))
1810                        : Builder.CreateVectorSplat(VF, Mul);
1811   Builder.restoreIP(CurrIP);
1812 
1813   // We may need to add the step a number of times, depending on the unroll
1814   // factor. The last of those goes into the PHI.
1815   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1816                                     &*LoopVectorBody->getFirstInsertionPt());
1817   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1818   Instruction *LastInduction = VecInd;
1819   for (unsigned Part = 0; Part < UF; ++Part) {
1820     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1821 
1822     if (isa<TruncInst>(EntryVal))
1823       addMetadata(LastInduction, EntryVal);
1824     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1825 
1826     LastInduction = cast<Instruction>(addFastMathFlag(
1827         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1828     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1829   }
1830 
1831   // Move the last step to the end of the latch block. This ensures consistent
1832   // placement of all induction updates.
1833   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1834   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1835   auto *ICmp = cast<Instruction>(Br->getCondition());
1836   LastInduction->moveBefore(ICmp);
1837   LastInduction->setName("vec.ind.next");
1838 
1839   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1840   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1841 }
1842 
1843 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1844   return Cost->isScalarAfterVectorization(I, VF) ||
1845          Cost->isProfitableToScalarize(I, VF);
1846 }
1847 
1848 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1849   if (shouldScalarizeInstruction(IV))
1850     return true;
1851   auto isScalarInst = [&](User *U) -> bool {
1852     auto *I = cast<Instruction>(U);
1853     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1854   };
1855   return llvm::any_of(IV->users(), isScalarInst);
1856 }
1857 
1858 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1859     const InductionDescriptor &ID, const Instruction *EntryVal,
1860     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1861   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1862          "Expected either an induction phi-node or a truncate of it!");
1863 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't
  // have to do any recording in this case - that is done when the original
  // IV is processed.
1870   if (isa<TruncInst>(EntryVal))
1871     return;
1872 
1873   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1874   if (Casts.empty())
1875     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any exist) have no uses outside the
  // induction update chain itself.
1879   Instruction *CastInst = *Casts.begin();
1880   if (Lane < UINT_MAX)
1881     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1882   else
1883     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1884 }
1885 
1886 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1887   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1888          "Primary induction variable must have an integer type");
1889 
1890   auto II = Legal->getInductionVars().find(IV);
1891   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1892 
1893   auto ID = II->second;
1894   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1895 
1896   // The value from the original loop to which we are mapping the new induction
1897   // variable.
1898   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1899 
1900   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1901 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1904   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1905     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1906            "Induction step should be loop invariant");
1907     if (PSE.getSE()->isSCEVable(IV->getType())) {
1908       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1909       return Exp.expandCodeFor(Step, Step->getType(),
1910                                LoopVectorPreHeader->getTerminator());
1911     }
1912     return cast<SCEVUnknown>(Step)->getValue();
1913   };
1914 
1915   // The scalar value to broadcast. This is derived from the canonical
1916   // induction variable. If a truncation type is given, truncate the canonical
1917   // induction variable and step. Otherwise, derive these values from the
1918   // induction descriptor.
1919   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1920     Value *ScalarIV = Induction;
1921     if (IV != OldInduction) {
1922       ScalarIV = IV->getType()->isIntegerTy()
1923                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1924                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1925                                           IV->getType());
1926       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1927       ScalarIV->setName("offset.idx");
1928     }
1929     if (Trunc) {
1930       auto *TruncType = cast<IntegerType>(Trunc->getType());
1931       assert(Step->getType()->isIntegerTy() &&
1932              "Truncation requires an integer step");
1933       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1934       Step = Builder.CreateTrunc(Step, TruncType);
1935     }
1936     return ScalarIV;
1937   };
1938 
1939   // Create the vector values from the scalar IV, in the absence of creating a
1940   // vector IV.
1941   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1942     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1943     for (unsigned Part = 0; Part < UF; ++Part) {
1944       Value *EntryPart =
1945           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1946       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1947       if (Trunc)
1948         addMetadata(EntryPart, Trunc);
1949       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1950     }
1951   };
1952 
1953   // Now do the actual transformations, and start with creating the step value.
1954   Value *Step = CreateStepValue(ID.getStep());
1955   if (VF <= 1) {
1956     Value *ScalarIV = CreateScalarIV(Step);
1957     CreateSplatIV(ScalarIV, Step);
1958     return;
1959   }
1960 
1961   // Determine if we want a scalar version of the induction variable. This is
1962   // true if the induction variable itself is not widened, or if it has at
1963   // least one user in the loop that is not widened.
1964   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1965   if (!NeedsScalarIV) {
1966     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1967     return;
1968   }
1969 
1970   // Try to create a new independent vector induction variable. If we can't
1971   // create the phi node, we will splat the scalar induction variable in each
1972   // loop iteration.
1973   if (!shouldScalarizeInstruction(EntryVal)) {
1974     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1975     Value *ScalarIV = CreateScalarIV(Step);
1976     // Create scalar steps that can be used by instructions we will later
1977     // scalarize. Note that the addition of the scalar steps will not increase
1978     // the number of instructions in the loop in the common case prior to
1979     // InstCombine. We will be trading one vector extract for each scalar step.
1980     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1981     return;
1982   }
1983 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV. The exception is when we tail-fold: then the splat IV
  // feeds the predicate used by the masked loads/stores.
1987   Value *ScalarIV = CreateScalarIV(Step);
1988   if (!Cost->isScalarEpilogueAllowed())
1989     CreateSplatIV(ScalarIV, Step);
1990   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1991 }
1992 
1993 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1994                                           Instruction::BinaryOps BinOp) {
1995   // Create and check the types.
1996   auto *ValVTy = cast<VectorType>(Val->getType());
1997   int VLen = ValVTy->getNumElements();
1998 
1999   Type *STy = Val->getType()->getScalarType();
2000   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2001          "Induction Step must be an integer or FP");
2002   assert(Step->getType() == STy && "Step has wrong type");
2003 
2004   SmallVector<Constant *, 8> Indices;
2005 
2006   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
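    // For example (illustrative only), for VLen = 4 and StartIdx = 0 this
    // builds <0, 1, 2, 3>; multiplied by Step and added to Val below, it
    // yields the per-lane induction values.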
2008     for (int i = 0; i < VLen; ++i)
2009       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2010 
2011     // Add the consecutive indices to the vector value.
2012     Constant *Cv = ConstantVector::get(Indices);
2013     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2014     Step = Builder.CreateVectorSplat(VLen, Step);
2015     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    //        flags, which can be found from the original scalar operations.
2018     Step = Builder.CreateMul(Cv, Step);
2019     return Builder.CreateAdd(Val, Step, "induction");
2020   }
2021 
2022   // Floating point induction.
2023   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2024          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
2026   for (int i = 0; i < VLen; ++i)
2027     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2028 
2029   // Add the consecutive indices to the vector value.
2030   Constant *Cv = ConstantVector::get(Indices);
2031 
2032   Step = Builder.CreateVectorSplat(VLen, Step);
2033 
2034   // Floating point operations had to be 'fast' to enable the induction.
2035   FastMathFlags Flags;
2036   Flags.setFast();
2037 
2038   Value *MulOp = Builder.CreateFMul(Cv, Step);
2039   if (isa<Instruction>(MulOp))
2040     // Have to check, MulOp may be a constant
2041     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2042 
2043   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2044   if (isa<Instruction>(BOp))
2045     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2046   return BOp;
2047 }
2048 
2049 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2050                                            Instruction *EntryVal,
2051                                            const InductionDescriptor &ID) {
2052   // We shouldn't have to build scalar steps if we aren't vectorizing.
2053   assert(VF > 1 && "VF should be greater than one");
2054 
  // Get the value type and ensure it and the step have the same type.
2056   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2057   assert(ScalarIVTy == Step->getType() &&
2058          "Val and Step should have the same type");
2059 
2060   // We build scalar steps for both integer and floating-point induction
2061   // variables. Here, we determine the kind of arithmetic we will perform.
2062   Instruction::BinaryOps AddOp;
2063   Instruction::BinaryOps MulOp;
2064   if (ScalarIVTy->isIntegerTy()) {
2065     AddOp = Instruction::Add;
2066     MulOp = Instruction::Mul;
2067   } else {
2068     AddOp = ID.getInductionOpcode();
2069     MulOp = Instruction::FMul;
2070   }
2071 
2072   // Determine the number of scalars we need to generate for each unroll
2073   // iteration. If EntryVal is uniform, we only need to generate the first
2074   // lane. Otherwise, we generate all VF values.
2075   unsigned Lanes =
2076       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
2077                                                                          : VF;
2078   // Compute the scalar steps and save the results in VectorLoopValueMap.
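  // For example (illustrative only), with VF = 4, UF = 2 and an integer step
  // of 1, part 0 produces ScalarIV + {0, 1, 2, 3} and part 1 produces
  // ScalarIV + {4, 5, 6, 7} (only lane 0 of each part if EntryVal is uniform).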
2079   for (unsigned Part = 0; Part < UF; ++Part) {
2080     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2081       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2082       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2083       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2084       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2085       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2086     }
2087   }
2088 }
2089 
2090 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2091   assert(V != Induction && "The new induction variable should not be used.");
2092   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2093   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2094 
2095   // If we have a stride that is replaced by one, do it here. Defer this for
2096   // the VPlan-native path until we start running Legal checks in that path.
2097   if (!EnableVPlanNativePath && Legal->hasStride(V))
2098     V = ConstantInt::get(V->getType(), 1);
2099 
2100   // If we have a vector mapped to this value, return it.
2101   if (VectorLoopValueMap.hasVectorValue(V, Part))
2102     return VectorLoopValueMap.getVectorValue(V, Part);
2103 
2104   // If the value has not been vectorized, check if it has been scalarized
2105   // instead. If it has been scalarized, and we actually need the value in
2106   // vector form, we will construct the vector values on demand.
2107   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2108     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2109 
2110     // If we've scalarized a value, that value should be an instruction.
2111     auto *I = cast<Instruction>(V);
2112 
2113     // If we aren't vectorizing, we can just copy the scalar map values over to
2114     // the vector map.
2115     if (VF == 1) {
2116       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2117       return ScalarValue;
2118     }
2119 
2120     // Get the last scalar instruction we generated for V and Part. If the value
2121     // is known to be uniform after vectorization, this corresponds to lane zero
2122     // of the Part unroll iteration. Otherwise, the last instruction is the one
2123     // we created for the last vector lane of the Part unroll iteration.
2124     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2125     auto *LastInst = cast<Instruction>(
2126         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2127 
2128     // Set the insert point after the last scalarized instruction. This ensures
2129     // the insertelement sequence will directly follow the scalar definitions.
2130     auto OldIP = Builder.saveIP();
2131     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2132     Builder.SetInsertPoint(&*NewIP);
2133 
2134     // However, if we are vectorizing, we need to construct the vector values.
2135     // If the value is known to be uniform after vectorization, we can just
2136     // broadcast the scalar value corresponding to lane zero for each unroll
2137     // iteration. Otherwise, we construct the vector values using insertelement
2138     // instructions. Since the resulting vectors are stored in
2139     // VectorLoopValueMap, we will only generate the insertelements once.
2140     Value *VectorValue = nullptr;
2141     if (Cost->isUniformAfterVectorization(I, VF)) {
2142       VectorValue = getBroadcastInstrs(ScalarValue);
2143       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2144     } else {
2145       // Initialize packing with insertelements to start from undef.
2146       Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF));
2147       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2148       for (unsigned Lane = 0; Lane < VF; ++Lane)
2149         packScalarIntoVectorValue(V, {Part, Lane});
2150       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2151     }
2152     Builder.restoreIP(OldIP);
2153     return VectorValue;
2154   }
2155 
2156   // If this scalar is unknown, assume that it is a constant or that it is
2157   // loop invariant. Broadcast V and save the value for future uses.
2158   Value *B = getBroadcastInstrs(V);
2159   VectorLoopValueMap.setVectorValue(V, Part, B);
2160   return B;
2161 }
2162 
2163 Value *
2164 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2165                                             const VPIteration &Instance) {
2166   // If the value is not an instruction contained in the loop, it should
2167   // already be scalar.
2168   if (OrigLoop->isLoopInvariant(V))
2169     return V;
2170 
2171   assert(Instance.Lane > 0
2172              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2173              : true && "Uniform values only have lane zero");
2174 
2175   // If the value from the original loop has not been vectorized, it is
2176   // represented by UF x VF scalar values in the new loop. Return the requested
2177   // scalar value.
2178   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2179     return VectorLoopValueMap.getScalarValue(V, Instance);
2180 
2181   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2182   // for the given unroll part. If this entry is not a vector type (i.e., the
2183   // vectorization factor is one), there is no need to generate an
2184   // extractelement instruction.
2185   auto *U = getOrCreateVectorValue(V, Instance.Part);
2186   if (!U->getType()->isVectorTy()) {
2187     assert(VF == 1 && "Value not scalarized has non-vector type");
2188     return U;
2189   }
2190 
2191   // Otherwise, the value from the original loop has been vectorized and is
2192   // represented by UF vector values. Extract and return the requested scalar
2193   // value from the appropriate vector lane.
2194   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2195 }
2196 
2197 void InnerLoopVectorizer::packScalarIntoVectorValue(
2198     Value *V, const VPIteration &Instance) {
2199   assert(V != Induction && "The new induction variable should not be used.");
2200   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2201   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2202 
2203   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2204   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2205   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2206                                             Builder.getInt32(Instance.Lane));
2207   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2208 }
2209 
2210 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2211   assert(Vec->getType()->isVectorTy() && "Invalid type");
2212   SmallVector<int, 8> ShuffleMask;
2213   for (unsigned i = 0; i < VF; ++i)
2214     ShuffleMask.push_back(VF - i - 1);
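  // For example (illustrative only), for VF = 4 the mask is <3, 2, 1, 0>.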
2215 
2216   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2217                                      ShuffleMask, "reverse");
2218 }
2219 
2220 // Return whether we allow using masked interleave-groups (for dealing with
2221 // strided loads/stores that reside in predicated blocks, or for dealing
2222 // with gaps).
2223 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2224   // If an override option has been passed in for interleaved accesses, use it.
2225   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2226     return EnableMaskedInterleavedMemAccesses;
2227 
2228   return TTI.enableMaskedInterleavedAccessVectorization();
2229 }
2230 
2231 // Try to vectorize the interleave group that \p Instr belongs to.
2232 //
2233 // E.g. Translate following interleaved load group (factor = 3):
2234 //   for (i = 0; i < N; i+=3) {
2235 //     R = Pic[i];             // Member of index 0
2236 //     G = Pic[i+1];           // Member of index 1
2237 //     B = Pic[i+2];           // Member of index 2
2238 //     ... // do something to R, G, B
2239 //   }
2240 // To:
2241 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2242 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2243 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2244 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2245 //
2246 // Or translate following interleaved store group (factor = 3):
2247 //   for (i = 0; i < N; i+=3) {
2248 //     ... do something to R, G, B
2249 //     Pic[i]   = R;           // Member of index 0
2250 //     Pic[i+1] = G;           // Member of index 1
2251 //     Pic[i+2] = B;           // Member of index 2
2252 //   }
2253 // To:
2254 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2255 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2256 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2257 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2258 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2259 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2260     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2261     VPValue *Addr, VPValue *BlockInMask) {
2262   Instruction *Instr = Group->getInsertPos();
2263   const DataLayout &DL = Instr->getModule()->getDataLayout();
2264 
2265   // Prepare for the vector type of the interleaved load/store.
2266   Type *ScalarTy = getMemInstValueType(Instr);
2267   unsigned InterleaveFactor = Group->getFactor();
2268   auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF);
2269 
2270   // Prepare for the new pointers.
2271   SmallVector<Value *, 2> AddrParts;
2272   unsigned Index = Group->getIndex(Instr);
2273 
2274   // TODO: extend the masked interleaved-group support to reversed access.
2275   assert((!BlockInMask || !Group->isReverse()) &&
2276          "Reversed masked interleave-group not supported.");
2277 
2278   // If the group is reverse, adjust the index to refer to the last vector lane
2279   // instead of the first. We adjust the index from the first vector lane,
2280   // rather than directly getting the pointer for lane VF - 1, because the
2281   // pointer operand of the interleaved access is supposed to be uniform. For
2282   // uniform instructions, we're only required to generate a value for the
2283   // first vector lane in each unroll iteration.
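  // For example (illustrative only), with VF = 4 and an interleave factor of
  // 2 the index grows by 6, so that after the negative GEP below the wide
  // access starts at the lowest address touched across the four vector lanes.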
2284   if (Group->isReverse())
2285     Index += (VF - 1) * Group->getFactor();
2286 
2287   for (unsigned Part = 0; Part < UF; Part++) {
2288     Value *AddrPart = State.get(Addr, {Part, 0});
2289     setDebugLocFromInst(Builder, AddrPart);
2290 
    // Note that the current instruction could be at any index. We need to
    // adjust the address to the member of index 0.
2293     //
2294     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2295     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2297     //
2298     // E.g.  A[i+1] = a;     // Member of index 1
2299     //       A[i]   = b;     // Member of index 0
2300     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2302 
2303     bool InBounds = false;
2304     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2305       InBounds = gep->isInBounds();
2306     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2307     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2308 
2309     // Cast to the vector pointer type.
2310     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2311     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2312     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2313   }
2314 
2315   setDebugLocFromInst(Builder, Instr);
2316   Value *UndefVec = UndefValue::get(VecTy);
2317 
2318   Value *MaskForGaps = nullptr;
2319   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2320     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2321     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2322   }
2323 
2324   // Vectorize the interleaved load group.
2325   if (isa<LoadInst>(Instr)) {
2326     // For each unroll part, create a wide load for the group.
2327     SmallVector<Value *, 2> NewLoads;
2328     for (unsigned Part = 0; Part < UF; Part++) {
2329       Instruction *NewLoad;
2330       if (BlockInMask || MaskForGaps) {
2331         assert(useMaskedInterleavedAccesses(*TTI) &&
2332                "masked interleaved groups are not allowed.");
2333         Value *GroupMask = MaskForGaps;
2334         if (BlockInMask) {
2335           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2336           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2337           Value *ShuffledMask = Builder.CreateShuffleVector(
2338               BlockInMaskPart, Undefs,
2339               createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");
2340           GroupMask = MaskForGaps
2341                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2342                                                 MaskForGaps)
2343                           : ShuffledMask;
2344         }
2345         NewLoad =
2346             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2347                                      GroupMask, UndefVec, "wide.masked.vec");
2348       }
2349       else
2350         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2351                                             Group->getAlign(), "wide.vec");
2352       Group->addMetadata(NewLoad);
2353       NewLoads.push_back(NewLoad);
2354     }
2355 
2356     // For each member in the group, shuffle out the appropriate data from the
2357     // wide loads.
2358     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2359       Instruction *Member = Group->getMember(I);
2360 
2361       // Skip the gaps in the group.
2362       if (!Member)
2363         continue;
2364 
2365       auto StrideMask = createStrideMask(I, InterleaveFactor, VF);
2366       for (unsigned Part = 0; Part < UF; Part++) {
2367         Value *StridedVec = Builder.CreateShuffleVector(
2368             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2369 
        // If this member has a different type, cast the result to its type.
2371         if (Member->getType() != ScalarTy) {
2372           VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF);
2373           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2374         }
2375 
2376         if (Group->isReverse())
2377           StridedVec = reverseVector(StridedVec);
2378 
2379         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2380       }
2381     }
2382     return;
2383   }
2384 
  // The sub vector type for the current instruction.
2386   auto *SubVT = FixedVectorType::get(ScalarTy, VF);
2387 
2388   // Vectorize the interleaved store group.
2389   for (unsigned Part = 0; Part < UF; Part++) {
2390     // Collect the stored vector from each member.
2391     SmallVector<Value *, 4> StoredVecs;
2392     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2394       Instruction *Member = Group->getMember(i);
2395       assert(Member && "Fail to get a member from an interleaved store group");
2396 
2397       Value *StoredVec = getOrCreateVectorValue(
2398           cast<StoreInst>(Member)->getValueOperand(), Part);
2399       if (Group->isReverse())
2400         StoredVec = reverseVector(StoredVec);
2401 
      // If this member has a different type, cast it to a unified type.
2404       if (StoredVec->getType() != SubVT)
2405         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2406 
2407       StoredVecs.push_back(StoredVec);
2408     }
2409 
2410     // Concatenate all vectors into a wide vector.
2411     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2412 
2413     // Interleave the elements in the wide vector.
2414     Value *IVec = Builder.CreateShuffleVector(
2415         WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor),
2416         "interleaved.vec");
2417 
2418     Instruction *NewStoreInstr;
2419     if (BlockInMask) {
2420       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2421       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2422       Value *ShuffledMask = Builder.CreateShuffleVector(
2423           BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF),
2424           "interleaved.mask");
2425       NewStoreInstr = Builder.CreateMaskedStore(
2426           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2427     }
2428     else
2429       NewStoreInstr =
2430           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2431 
2432     Group->addMetadata(NewStoreInstr);
2433   }
2434 }
2435 
2436 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2437                                                      VPTransformState &State,
2438                                                      VPValue *Addr,
2439                                                      VPValue *StoredValue,
2440                                                      VPValue *BlockInMask) {
2441   // Attempt to issue a wide load.
2442   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2443   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2444 
2445   assert((LI || SI) && "Invalid Load/Store instruction");
2446   assert((!SI || StoredValue) && "No stored value provided for widened store");
2447   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2448 
2449   LoopVectorizationCostModel::InstWidening Decision =
2450       Cost->getWideningDecision(Instr, VF);
2451   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2452           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2453           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2454          "CM decision is not to widen the memory instruction");
2455 
2456   Type *ScalarDataTy = getMemInstValueType(Instr);
2457   auto *DataTy = FixedVectorType::get(ScalarDataTy, VF);
2458   const Align Alignment = getLoadStoreAlignment(Instr);
2459 
2460   // Determine if the pointer operand of the access is either consecutive or
2461   // reverse consecutive.
2462   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2463   bool ConsecutiveStride =
2464       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2465   bool CreateGatherScatter =
2466       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2467 
2468   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2469   // gather/scatter. Otherwise Decision should have been to Scalarize.
2470   assert((ConsecutiveStride || CreateGatherScatter) &&
2471          "The instruction should be scalarized");
2472   (void)ConsecutiveStride;
2473 
2474   VectorParts BlockInMaskParts(UF);
2475   bool isMaskRequired = BlockInMask;
2476   if (isMaskRequired)
2477     for (unsigned Part = 0; Part < UF; ++Part)
2478       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2479 
2480   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2481     // Calculate the pointer for the specific unroll-part.
2482     GetElementPtrInst *PartPtr = nullptr;
2483 
2484     bool InBounds = false;
2485     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2486       InBounds = gep->isInBounds();
2487 
2488     if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide load/store needs to start at the last vector element.
2491       PartPtr = cast<GetElementPtrInst>(
2492           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2493       PartPtr->setIsInBounds(InBounds);
2494       PartPtr = cast<GetElementPtrInst>(
2495           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2496       PartPtr->setIsInBounds(InBounds);
2497       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2498         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2499     } else {
2500       PartPtr = cast<GetElementPtrInst>(
2501           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2502       PartPtr->setIsInBounds(InBounds);
2503     }
2504 
2505     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2506     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2507   };
2508 
2509   // Handle Stores:
2510   if (SI) {
2511     setDebugLocFromInst(Builder, SI);
2512 
2513     for (unsigned Part = 0; Part < UF; ++Part) {
2514       Instruction *NewSI = nullptr;
2515       Value *StoredVal = State.get(StoredValue, Part);
2516       if (CreateGatherScatter) {
2517         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2518         Value *VectorGep = State.get(Addr, Part);
2519         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2520                                             MaskPart);
2521       } else {
2522         if (Reverse) {
2523           // If we store to reverse consecutive memory locations, then we need
2524           // to reverse the order of elements in the stored value.
2525           StoredVal = reverseVector(StoredVal);
2526           // We don't want to update the value in the map as it might be used in
2527           // another expression. So don't call resetVectorValue(StoredVal).
2528         }
2529         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2530         if (isMaskRequired)
2531           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2532                                             BlockInMaskParts[Part]);
2533         else
2534           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2535       }
2536       addMetadata(NewSI, SI);
2537     }
2538     return;
2539   }
2540 
2541   // Handle loads.
2542   assert(LI && "Must have a load instruction");
2543   setDebugLocFromInst(Builder, LI);
2544   for (unsigned Part = 0; Part < UF; ++Part) {
2545     Value *NewLI;
2546     if (CreateGatherScatter) {
2547       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2548       Value *VectorGep = State.get(Addr, Part);
2549       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2550                                          nullptr, "wide.masked.gather");
2551       addMetadata(NewLI, LI);
2552     } else {
2553       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2554       if (isMaskRequired)
2555         NewLI = Builder.CreateMaskedLoad(
2556             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2557             "wide.masked.load");
2558       else
2559         NewLI =
2560             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2561 
2562       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2563       addMetadata(NewLI, LI);
2564       if (Reverse)
2565         NewLI = reverseVector(NewLI);
2566     }
2567     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2568   }
2569 }
2570 
2571 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2572                                                const VPIteration &Instance,
2573                                                bool IfPredicateInstr,
2574                                                VPTransformState &State) {
2575   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2576 
2577   setDebugLocFromInst(Builder, Instr);
2578 
  // Does this instruction return a value?
2580   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2581 
2582   Instruction *Cloned = Instr->clone();
2583   if (!IsVoidRetTy)
2584     Cloned->setName(Instr->getName() + ".cloned");
2585 
2586   // Replace the operands of the cloned instructions with their scalar
2587   // equivalents in the new loop.
2588   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2589     auto *NewOp = State.get(User.getOperand(op), Instance);
2590     Cloned->setOperand(op, NewOp);
2591   }
2592   addNewMetadata(Cloned, Instr);
2593 
2594   // Place the cloned scalar in the new loop.
2595   Builder.Insert(Cloned);
2596 
2597   // Add the cloned scalar to the scalar map entry.
2598   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2599 
  // If we just cloned a new assumption, add it to the assumption cache.
2601   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2602     if (II->getIntrinsicID() == Intrinsic::assume)
2603       AC->registerAssumption(II);
2604 
2605   // End if-block.
2606   if (IfPredicateInstr)
2607     PredicatedInstructions.push_back(Cloned);
2608 }
2609 
2610 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2611                                                       Value *End, Value *Step,
2612                                                       Instruction *DL) {
2613   BasicBlock *Header = L->getHeader();
2614   BasicBlock *Latch = L->getLoopLatch();
2615   // As we're just creating this loop, it's possible no latch exists
2616   // yet. If so, use the header as this will be a single block loop.
2617   if (!Latch)
2618     Latch = Header;
2619 
2620   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2621   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2622   setDebugLocFromInst(Builder, OldInst);
2623   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2624 
2625   Builder.SetInsertPoint(Latch->getTerminator());
2626   setDebugLocFromInst(Builder, OldInst);
2627 
2628   // Create i+1 and fill the PHINode.
2629   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2630   Induction->addIncoming(Start, L->getLoopPreheader());
2631   Induction->addIncoming(Next, Latch);
2632   // Create the compare.
2633   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2634   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2635 
2636   // Now we have two terminators. Remove the old one from the block.
2637   Latch->getTerminator()->eraseFromParent();
2638 
2639   return Induction;
2640 }
2641 
2642 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2643   if (TripCount)
2644     return TripCount;
2645 
2646   assert(L && "Create Trip Count for null loop.");
2647   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2648   // Find the loop boundaries.
2649   ScalarEvolution *SE = PSE.getSE();
2650   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2651   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2652          "Invalid loop count");
2653 
2654   Type *IdxTy = Legal->getWidestInductionType();
2655   assert(IdxTy && "No type for induction");
2656 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign-extended before the compare.
  // The only way we get a backedge-taken count in that case is if the
  // induction variable was signed and therefore will not overflow, so
  // truncation is legal.
2662   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2663       IdxTy->getPrimitiveSizeInBits())
2664     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2665   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2666 
2667   // Get the total trip count from the count by adding 1.
2668   const SCEV *ExitCount = SE->getAddExpr(
2669       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
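  // For example, a loop whose backedge-taken count is 7 has a trip count of 8.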
2670 
2671   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2672 
2673   // Expand the trip count and place the new instructions in the preheader.
2674   // Notice that the pre-header does not change, only the loop body.
2675   SCEVExpander Exp(*SE, DL, "induction");
2676 
2677   // Count holds the overall loop count (N).
2678   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2679                                 L->getLoopPreheader()->getTerminator());
2680 
2681   if (TripCount->getType()->isPointerTy())
2682     TripCount =
2683         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2684                                     L->getLoopPreheader()->getTerminator());
2685 
2686   return TripCount;
2687 }
2688 
2689 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2690   if (VectorTripCount)
2691     return VectorTripCount;
2692 
2693   Value *TC = getOrCreateTripCount(L);
2694   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2695 
2696   Type *Ty = TC->getType();
2697   Constant *Step = ConstantInt::get(Ty, VF * UF);
2698 
2699   // If the tail is to be folded by masking, round the number of iterations N
2700   // up to a multiple of Step instead of rounding down. This is done by first
2701   // adding Step-1 and then rounding down. Note that it's ok if this addition
2702   // overflows: the vector induction variable will eventually wrap to zero given
2703   // that it starts at zero and its Step is a power of two; the loop will then
2704   // exit, with the last early-exit vector comparison also producing all-true.
2705   if (Cost->foldTailByMasking()) {
2706     assert(isPowerOf2_32(VF * UF) &&
2707            "VF*UF must be a power of 2 when folding tail by masking");
2708     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2709   }
2710 
2711   // Now we need to generate the expression for the part of the loop that the
2712   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2713   // iterations are not required for correctness, or N - Step, otherwise. Step
2714   // is equal to the vectorization factor (number of SIMD elements) times the
2715   // unroll factor (number of SIMD instructions).
2716   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2717 
2718   // If there is a non-reversed interleaved group that may speculatively access
2719   // memory out-of-bounds, we need to ensure that there will be at least one
2720   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2721   // the trip count, we set the remainder to be equal to the step. If the step
2722   // does not evenly divide the trip count, no adjustment is necessary since
2723   // there will already be scalar iterations. Note that the minimum iterations
2724   // check ensures that N >= Step.
2725   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2726     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2727     R = Builder.CreateSelect(IsZero, Step, R);
2728   }
2729 
2730   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2731 
2732   return VectorTripCount;
2733 }
2734 
2735 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2736                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as
  // DstVTy.
2738   unsigned VF = DstVTy->getNumElements();
2739   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2741   Type *SrcElemTy = SrcVecTy->getElementType();
2742   Type *DstElemTy = DstVTy->getElementType();
2743   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2744          "Vector elements must have same size");
2745 
2746   // Do a direct cast if element types are castable.
2747   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2748     return Builder.CreateBitOrPointerCast(V, DstVTy);
2749   }
  // V cannot be cast directly to the desired vector type. This may happen when
  // V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
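  // For example (purely illustrative, assuming 64-bit pointers): casting
  // <4 x double> to a vector of pointers goes through <4 x i64>, i.e. a
  // bitcast followed by an inttoptr.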
2754   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2755          "Only one type should be a pointer type");
2756   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2757          "Only one type should be a floating point type");
2758   Type *IntTy =
2759       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2760   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2761   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2762   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2763 }
2764 
2765 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2766                                                          BasicBlock *Bypass) {
2767   Value *Count = getOrCreateTripCount(L);
2768   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
2770   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2771   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2772 
2773   // Generate code to check if the loop's trip count is less than VF * UF, or
2774   // equal to it in case a scalar epilogue is required; this implies that the
2775   // vector trip count is zero. This check also covers the case where adding one
2776   // to the backedge-taken count overflowed leading to an incorrect trip count
2777   // of zero. In this case we will also jump to the scalar loop.
2778   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2779                                           : ICmpInst::ICMP_ULT;
2780 
2781   // If tail is to be folded, vector loop takes care of all iterations.
2782   Value *CheckMinIters = Builder.getFalse();
2783   if (!Cost->foldTailByMasking())
2784     CheckMinIters = Builder.CreateICmp(
2785         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2786         "min.iters.check");
2787 
2788   // Create new preheader for vector loop.
2789   LoopVectorPreHeader =
2790       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2791                  "vector.ph");
2792 
2793   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2794                                DT->getNode(Bypass)->getIDom()) &&
2795          "TC check is expected to dominate Bypass");
2796 
2797   // Update dominator for Bypass & LoopExit.
2798   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2799   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2800 
2801   ReplaceInstWithInst(
2802       TCCheckBlock->getTerminator(),
2803       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2804   LoopBypassBlocks.push_back(TCCheckBlock);
2805 }
2806 
2807 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2808   // Reuse existing vector loop preheader for SCEV checks.
  // Note that a new preheader block is generated for the vector loop.
2810   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2811 
  // Generate the code to check the SCEV assumptions that we have made.
2813   // We want the new basic block to start at the first instruction in a
2814   // sequence of instructions that form a check.
2815   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2816                    "scev.check");
2817   Value *SCEVCheck = Exp.expandCodeForPredicate(
2818       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2819 
2820   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2821     if (C->isZero())
2822       return;
2823 
2824   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2825            (OptForSizeBasedOnProfile &&
2826             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2827          "Cannot SCEV check stride or overflow when optimizing for size");
2828 
2829   SCEVCheckBlock->setName("vector.scevcheck");
2830   // Create new preheader for vector loop.
2831   LoopVectorPreHeader =
2832       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2833                  nullptr, "vector.ph");
2834 
  // Update dominator only if this is the first RT check.
2836   if (LoopBypassBlocks.empty()) {
2837     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2838     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2839   }
2840 
2841   ReplaceInstWithInst(
2842       SCEVCheckBlock->getTerminator(),
2843       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2844   LoopBypassBlocks.push_back(SCEVCheckBlock);
2845   AddedSafetyChecks = true;
2846 }
2847 
2848 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2849   // VPlan-native path does not do any analysis for runtime checks currently.
2850   if (EnableVPlanNativePath)
2851     return;
2852 
2853   // Reuse existing vector loop preheader for runtime memory checks.
  // Note that a new preheader block is generated for the vector loop.
2855   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2856 
2857   // Generate the code that checks in runtime if arrays overlap. We put the
2858   // checks into a separate block to make the more common case of few elements
2859   // faster.
2860   auto *LAI = Legal->getLAI();
2861   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2862   if (!RtPtrChecking.Need)
2863     return;
2864   Instruction *FirstCheckInst;
2865   Instruction *MemRuntimeCheck;
2866   std::tie(FirstCheckInst, MemRuntimeCheck) =
2867       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2868                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2869   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2870                             "claimed checks are required");
2871 
2872   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2873     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2874            "Cannot emit memory checks when optimizing for size, unless forced "
2875            "to vectorize.");
2876     ORE->emit([&]() {
2877       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2878                                         L->getStartLoc(), L->getHeader())
2879              << "Code-size may be reduced by not forcing "
2880                 "vectorization, or by source-code modifications "
2881                 "eliminating the need for runtime checks "
2882                 "(e.g., adding 'restrict').";
2883     });
2884   }
2885 
2886   MemCheckBlock->setName("vector.memcheck");
2887   // Create new preheader for vector loop.
2888   LoopVectorPreHeader =
2889       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2890                  "vector.ph");
2891 
  // Update dominator only if this is the first RT check.
2893   if (LoopBypassBlocks.empty()) {
2894     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2895     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2896   }
2897 
2898   ReplaceInstWithInst(
2899       MemCheckBlock->getTerminator(),
2900       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2901   LoopBypassBlocks.push_back(MemCheckBlock);
2902   AddedSafetyChecks = true;
2903 
2904   // We currently don't use LoopVersioning for the actual loop cloning but we
2905   // still use it to add the noalias metadata.
2906   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2907                                           PSE.getSE());
2908   LVer->prepareNoAliasMetadata();
2909 }
2910 
2911 Value *InnerLoopVectorizer::emitTransformedIndex(
2912     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2913     const InductionDescriptor &ID) const {
2914 
2915   SCEVExpander Exp(*SE, DL, "induction");
2916   auto Step = ID.getStep();
2917   auto StartValue = ID.getStartValue();
2918   assert(Index->getType() == Step->getType() &&
2919          "Index type does not match StepValue type");
2920 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification would give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
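  // For reference, the transformed index of an integer IV {Start,+,Step} is
  // simply Start + Index * Step (see IK_IntInduction below); the helpers here
  // only fold away trivial zero/one operands.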
2927   auto CreateAdd = [&B](Value *X, Value *Y) {
2928     assert(X->getType() == Y->getType() && "Types don't match!");
2929     if (auto *CX = dyn_cast<ConstantInt>(X))
2930       if (CX->isZero())
2931         return Y;
2932     if (auto *CY = dyn_cast<ConstantInt>(Y))
2933       if (CY->isZero())
2934         return X;
2935     return B.CreateAdd(X, Y);
2936   };
2937 
2938   auto CreateMul = [&B](Value *X, Value *Y) {
2939     assert(X->getType() == Y->getType() && "Types don't match!");
2940     if (auto *CX = dyn_cast<ConstantInt>(X))
2941       if (CX->isOne())
2942         return Y;
2943     if (auto *CY = dyn_cast<ConstantInt>(Y))
2944       if (CY->isOne())
2945         return X;
2946     return B.CreateMul(X, Y);
2947   };
2948 
2949   // Get a suitable insert point for SCEV expansion. For blocks in the vector
2950   // loop, choose the end of the vector loop header (=LoopVectorBody), because
2951   // the DomTree is not kept up-to-date for additional blocks generated in the
2952   // vector loop. By using the header as insertion point, we guarantee that the
2953   // expanded instructions dominate all their uses.
2954   auto GetInsertPoint = [this, &B]() {
2955     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
2956     if (InsertBB != LoopVectorBody &&
2957         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
2958       return LoopVectorBody->getTerminator();
2959     return &*B.GetInsertPoint();
2960   };
2961   switch (ID.getKind()) {
2962   case InductionDescriptor::IK_IntInduction: {
2963     assert(Index->getType() == StartValue->getType() &&
2964            "Index type does not match StartValue type");
2965     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2966       return B.CreateSub(StartValue, Index);
2967     auto *Offset = CreateMul(
2968         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
2969     return CreateAdd(StartValue, Offset);
2970   }
2971   case InductionDescriptor::IK_PtrInduction: {
2972     assert(isa<SCEVConstant>(Step) &&
2973            "Expected constant step for pointer induction");
2974     return B.CreateGEP(
2975         StartValue->getType()->getPointerElementType(), StartValue,
2976         CreateMul(Index,
2977                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
2978   }
2979   case InductionDescriptor::IK_FpInduction: {
2980     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2981     auto InductionBinOp = ID.getInductionBinOp();
2982     assert(InductionBinOp &&
2983            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2984             InductionBinOp->getOpcode() == Instruction::FSub) &&
2985            "Original bin op should be defined for FP induction");
2986 
2987     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2988 
2989     // Floating point operations had to be 'fast' to enable the induction.
2990     FastMathFlags Flags;
2991     Flags.setFast();
2992 
2993     Value *MulExp = B.CreateFMul(StepValue, Index);
2994     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may have been folded to a constant.
2996       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2997 
2998     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2999                                "induction");
3000     if (isa<Instruction>(BOp))
3001       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3002 
3003     return BOp;
3004   }
3005   case InductionDescriptor::IK_NoInduction:
3006     return nullptr;
3007   }
3008   llvm_unreachable("invalid enum");
3009 }
3010 
3011 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3012   LoopScalarBody = OrigLoop->getHeader();
3013   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3014   LoopExitBlock = OrigLoop->getExitBlock();
3015   assert(LoopExitBlock && "Must have an exit block");
3016   assert(LoopVectorPreHeader && "Invalid loop structure");
3017 
3018   LoopMiddleBlock =
3019       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3020                  LI, nullptr, Twine(Prefix) + "middle.block");
3021   LoopScalarPreHeader =
3022       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3023                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
3027   LoopVectorBody =
3028       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3029                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3030 
3031   // Update dominator for loop exit.
3032   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3033 
3034   // Create and register the new vector loop.
3035   Loop *Lp = LI->AllocateLoop();
3036   Loop *ParentLoop = OrigLoop->getParentLoop();
3037 
3038   // Insert the new loop into the loop nest and register the new basic blocks
3039   // before calling any utilities such as SCEV that require valid LoopInfo.
3040   if (ParentLoop) {
3041     ParentLoop->addChildLoop(Lp);
3042   } else {
3043     LI->addTopLevelLoop(Lp);
3044   }
3045   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3046   return Lp;
3047 }
3048 
3049 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3050                                                       Value *VectorTripCount) {
3051   assert(VectorTripCount && L && "Expected valid arguments");
3052   // We are going to resume the execution of the scalar loop.
3053   // Go over all of the induction variables that we found and fix the
3054   // PHIs that are left in the scalar version of the loop.
3055   // The starting values of PHI nodes depend on the counter of the last
3056   // iteration in the vectorized loop.
3057   // If we come from a bypass edge then we need to start from the original
3058   // start value.
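  // Illustrative example: for an induction variable {5,+,3} and a vector trip
  // count of 8, the resume value when arriving from the middle block is
  // 5 + 3 * 8 = 29.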
3059   for (auto &InductionEntry : Legal->getInductionVars()) {
3060     PHINode *OrigPhi = InductionEntry.first;
3061     InductionDescriptor II = InductionEntry.second;
3062 
    // Create phi nodes to merge from the backedge-taken check block.
3064     PHINode *BCResumeVal =
3065         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3066                         LoopScalarPreHeader->getTerminator());
3067     // Copy original phi DL over to the new one.
3068     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3069     Value *&EndValue = IVEndValues[OrigPhi];
3070     if (OrigPhi == OldInduction) {
3071       // We know what the end value is.
3072       EndValue = VectorTripCount;
3073     } else {
3074       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3075       Type *StepType = II.getStep()->getType();
3076       Instruction::CastOps CastOp =
3077           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3078       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3079       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3080       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3081       EndValue->setName("ind.end");
3082     }
3083 
3084     // The new PHI merges the original incoming value, in case of a bypass,
3085     // or the value at the end of the vectorized loop.
3086     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3087 
3088     // Fix the scalar body counter (PHI node).
3089     // The old induction's phi node in the scalar body needs the truncated
3090     // value.
3091     for (BasicBlock *BB : LoopBypassBlocks)
3092       BCResumeVal->addIncoming(II.getStartValue(), BB);
3093     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3094   }
3095 }
3096 
3097 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3098                                                       MDNode *OrigLoopID) {
3099   assert(L && "Expected valid loop.");
3100 
3101   // The trip counts should be cached by now.
3102   Value *Count = getOrCreateTripCount(L);
3103   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3104 
3105   // We need the OrigLoop (scalar loop part) latch terminator to help
3106   // produce correct debug info for the middle block BB instructions.
3107   // The legality check stage guarantees that the loop will have a single
3108   // latch.
3109   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3110          "Scalar loop latch terminator isn't a branch");
3111   BranchInst *ScalarLatchBr =
3112       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3113 
3114   // Add a check in the middle block to see if we have completed
3115   // all of the iterations in the first vector loop.
3116   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3117   // If tail is to be folded, we know we don't need to run the remainder.
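  // Illustrative example: with N = 12, VF * UF = 4, and no scalar epilogue
  // required, the vector trip count is 12, so the comparison below is true and
  // the middle block branches straight to the exit block.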
3118   Value *CmpN = Builder.getTrue();
3119   if (!Cost->foldTailByMasking()) {
3120     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3121                            VectorTripCount, "cmp.n",
3122                            LoopMiddleBlock->getTerminator());
3123 
3124     // Here we use the same DebugLoc as the scalar loop latch branch instead
3125     // of the corresponding compare because they may have ended up with
3126     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3128     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3129   }
3130 
3131   BranchInst *BrInst =
3132       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3133   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3134   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3135 
3136   // Get ready to start creating new instructions into the vectorized body.
3137   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3138          "Inconsistent vector loop preheader");
3139   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3140 
3141   Optional<MDNode *> VectorizedLoopID =
3142       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3143                                       LLVMLoopVectorizeFollowupVectorized});
3144   if (VectorizedLoopID.hasValue()) {
3145     L->setLoopID(VectorizedLoopID.getValue());
3146 
3147     // Do not setAlreadyVectorized if loop attributes have been defined
3148     // explicitly.
3149     return LoopVectorPreHeader;
3150   }
3151 
3152   // Keep all loop hints from the original loop on the vector loop (we'll
3153   // replace the vectorizer-specific hints below).
3154   if (MDNode *LID = OrigLoop->getLoopID())
3155     L->setLoopID(LID);
3156 
3157   LoopVectorizeHints Hints(L, true, *ORE);
3158   Hints.setAlreadyVectorized();
3159 
3160 #ifdef EXPENSIVE_CHECKS
3161   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3162   LI->verify(*DT);
3163 #endif
3164 
3165   return LoopVectorPreHeader;
3166 }
3167 
3168 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3169   /*
3170    In this function we generate a new loop. The new loop will contain
3171    the vectorized instructions while the old loop will continue to run the
3172    scalar remainder.
3173 
3174        [ ] <-- loop iteration number check.
3175     /   |
3176    /    v
3177   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3178   |  /  |
3179   | /   v
3180   ||   [ ]     <-- vector pre header.
3181   |/    |
3182   |     v
3183   |    [  ] \
3184   |    [  ]_|   <-- vector loop.
3185   |     |
3186   |     v
3187   |   -[ ]   <--- middle-block.
3188   |  /  |
3189   | /   v
3190   -|- >[ ]     <--- new preheader.
3191    |    |
3192    |    v
3193    |   [ ] \
3194    |   [ ]_|   <-- old scalar loop to handle remainder.
3195     \   |
3196      \  v
3197       >[ ]     <-- exit block.
3198    ...
3199    */
3200 
3201   // Get the metadata of the original loop before it gets modified.
3202   MDNode *OrigLoopID = OrigLoop->getLoopID();
3203 
3204   // Create an empty vector loop, and prepare basic blocks for the runtime
3205   // checks.
3206   Loop *Lp = createVectorLoopSkeleton("");
3207 
3208   // Now, compare the new count to zero. If it is zero skip the vector loop and
3209   // jump to the scalar loop. This check also covers the case where the
3210   // backedge-taken count is uint##_max: adding one to it will overflow leading
3211   // to an incorrect trip count of zero. In this (rare) case we will also jump
3212   // to the scalar loop.
3213   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3214 
3215   // Generate the code to check any assumptions that we've made for SCEV
3216   // expressions.
3217   emitSCEVChecks(Lp, LoopScalarPreHeader);
3218 
3219   // Generate the code that checks in runtime if arrays overlap. We put the
3220   // checks into a separate block to make the more common case of few elements
3221   // faster.
3222   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3223 
3224   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
3226   // induction variables. In the code below we also support a case where we
3227   // don't have a single induction variable.
3228   //
3229   // We try to obtain an induction variable from the original loop as hard
3230   // as possible. However if we don't find one that:
3231   //   - is an integer
3232   //   - counts from zero, stepping by one
3233   //   - is the size of the widest induction variable type
3234   // then we create a new one.
3235   OldInduction = Legal->getPrimaryInduction();
3236   Type *IdxTy = Legal->getWidestInductionType();
3237   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3238   // The loop step is equal to the vectorization factor (num of SIMD elements)
3239   // times the unroll factor (num of SIMD instructions).
3240   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3241   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3242   Induction =
3243       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3244                               getDebugLocFromInstOrOperands(OldInduction));
3245 
3246   // Emit phis for the new starting index of the scalar loop.
3247   createInductionResumeValues(Lp, CountRoundDown);
3248 
3249   return completeLoopSkeleton(Lp, OrigLoopID);
3250 }
3251 
3252 // Fix up external users of the induction variable. At this point, we are
3253 // in LCSSA form, with all external PHIs that use the IV having one input value,
3254 // coming from the remainder loop. We need those PHIs to also have a correct
3255 // value for the IV when arriving directly from the middle block.
3256 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3257                                        const InductionDescriptor &II,
3258                                        Value *CountRoundDown, Value *EndValue,
3259                                        BasicBlock *MiddleBlock) {
3260   // There are two kinds of external IV usages - those that use the value
3261   // computed in the last iteration (the PHI) and those that use the penultimate
3262   // value (the value that feeds into the phi from the loop latch).
3263   // We allow both, but they, obviously, have different values.
3264 
3265   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3266 
3267   DenseMap<Value *, Value *> MissingVals;
3268 
3269   // An external user of the last iteration's value should see the value that
3270   // the remainder loop uses to initialize its own IV.
3271   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3272   for (User *U : PostInc->users()) {
3273     Instruction *UI = cast<Instruction>(U);
3274     if (!OrigLoop->contains(UI)) {
3275       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3276       MissingVals[UI] = EndValue;
3277     }
3278   }
3279 
  // An external user of the penultimate value needs to see EndValue - Step.
3281   // The simplest way to get this is to recompute it from the constituent SCEVs,
3282   // that is Start + (Step * (CRD - 1)).
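  // Illustrative example: for an IV with start 0 and step 2, and
  // CountRoundDown = 8, the escaping penultimate value is 0 + 2 * (8 - 1) = 14.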
3283   for (User *U : OrigPhi->users()) {
3284     auto *UI = cast<Instruction>(U);
3285     if (!OrigLoop->contains(UI)) {
3286       const DataLayout &DL =
3287           OrigLoop->getHeader()->getModule()->getDataLayout();
3288       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3289 
3290       IRBuilder<> B(MiddleBlock->getTerminator());
3291       Value *CountMinusOne = B.CreateSub(
3292           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3293       Value *CMO =
3294           !II.getStep()->getType()->isIntegerTy()
3295               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3296                              II.getStep()->getType())
3297               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3298       CMO->setName("cast.cmo");
3299       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3300       Escape->setName("ind.escape");
3301       MissingVals[UI] = Escape;
3302     }
3303   }
3304 
3305   for (auto &I : MissingVals) {
3306     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3308     // that is %IV2 = phi [...], [ %IV1, %latch ]
3309     // In this case, if IV1 has an external use, we need to avoid adding both
3310     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3311     // don't already have an incoming value for the middle block.
3312     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3313       PHI->addIncoming(I.second, MiddleBlock);
3314   }
3315 }
3316 
3317 namespace {
3318 
3319 struct CSEDenseMapInfo {
3320   static bool canHandle(const Instruction *I) {
3321     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3322            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3323   }
3324 
3325   static inline Instruction *getEmptyKey() {
3326     return DenseMapInfo<Instruction *>::getEmptyKey();
3327   }
3328 
3329   static inline Instruction *getTombstoneKey() {
3330     return DenseMapInfo<Instruction *>::getTombstoneKey();
3331   }
3332 
3333   static unsigned getHashValue(const Instruction *I) {
3334     assert(canHandle(I) && "Unknown instruction!");
3335     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3336                                                            I->value_op_end()));
3337   }
3338 
3339   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3340     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3341         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3342       return LHS == RHS;
3343     return LHS->isIdenticalTo(RHS);
3344   }
3345 };
3346 
3347 } // end anonymous namespace
3348 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3352   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3353   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3354     Instruction *In = &*I++;
3355 
3356     if (!CSEDenseMapInfo::canHandle(In))
3357       continue;
3358 
3359     // Check if we can replace this instruction with any of the
3360     // visited instructions.
3361     if (Instruction *V = CSEMap.lookup(In)) {
3362       In->replaceAllUsesWith(V);
3363       In->eraseFromParent();
3364       continue;
3365     }
3366 
3367     CSEMap[In] = In;
3368   }
3369 }
3370 
3371 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3372                                                        unsigned VF,
3373                                                        bool &NeedToScalarize) {
3374   Function *F = CI->getCalledFunction();
3375   Type *ScalarRetTy = CI->getType();
3376   SmallVector<Type *, 4> Tys, ScalarTys;
3377   for (auto &ArgOp : CI->arg_operands())
3378     ScalarTys.push_back(ArgOp->getType());
3379 
3380   // Estimate cost of scalarized vector call. The source operands are assumed
3381   // to be vectors, so we need to extract individual elements from there,
3382   // execute VF scalar calls, and then gather the result into the vector return
3383   // value.
3384   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3385                                                  TTI::TCK_RecipThroughput);
3386   if (VF == 1)
3387     return ScalarCallCost;
3388 
3389   // Compute corresponding vector type for return value and arguments.
3390   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3391   for (Type *ScalarTy : ScalarTys)
3392     Tys.push_back(ToVectorTy(ScalarTy, VF));
3393 
3394   // Compute costs of unpacking argument values for the scalar calls and
3395   // packing the return values to a vector.
3396   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3397 
3398   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
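  // Illustrative numbers only: if a scalar call costs 10, VF is 4, and the
  // scalarization overhead is 12, the scalarized cost is 4 * 10 + 12 = 52; a
  // vector library call costing less than that is preferred below.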
3399 
3400   // If we can't emit a vector call for this function, then the currently found
3401   // cost is the cost we need to return.
3402   NeedToScalarize = true;
3403   VFShape Shape =
3404       VFShape::get(*CI, ElementCount::getFixed(VF), false /*HasGlobalPred*/);
3405   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3406 
3407   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3408     return Cost;
3409 
3410   // If the corresponding vector cost is cheaper, return its cost.
3411   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3412                                                  TTI::TCK_RecipThroughput);
3413   if (VectorCallCost < Cost) {
3414     NeedToScalarize = false;
3415     return VectorCallCost;
3416   }
3417   return Cost;
3418 }
3419 
3420 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3421                                                             unsigned VF) {
3422   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3423   assert(ID && "Expected intrinsic call!");
3424 
3425   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3426   return TTI.getIntrinsicInstrCost(CostAttrs,
3427                                    TargetTransformInfo::TCK_RecipThroughput);
3428 }
3429 
3430 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3431   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3432   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3433   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3434 }
3435 
3436 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3437   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3438   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3439   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3440 }
3441 
3442 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3443   // For every instruction `I` in MinBWs, truncate the operands, create a
3444   // truncated version of `I` and reextend its result. InstCombine runs
3445   // later and will remove any ext/trunc pairs.
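  // For example, an i32 operation that the cost model proved only needs 8 bits
  // is rewritten to truncate its operands to <VF x i8>, perform the operation
  // in i8, and zero-extend the result back to <VF x i32>.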
3446   SmallPtrSet<Value *, 4> Erased;
3447   for (const auto &KV : Cost->getMinimalBitwidths()) {
3448     // If the value wasn't vectorized, we must maintain the original scalar
3449     // type. The absence of the value from VectorLoopValueMap indicates that it
3450     // wasn't vectorized.
3451     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3452       continue;
3453     for (unsigned Part = 0; Part < UF; ++Part) {
3454       Value *I = getOrCreateVectorValue(KV.first, Part);
3455       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3456         continue;
3457       Type *OriginalTy = I->getType();
3458       Type *ScalarTruncatedTy =
3459           IntegerType::get(OriginalTy->getContext(), KV.second);
3460       auto *TruncatedTy = FixedVectorType::get(
3461           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
3462       if (TruncatedTy == OriginalTy)
3463         continue;
3464 
3465       IRBuilder<> B(cast<Instruction>(I));
3466       auto ShrinkOperand = [&](Value *V) -> Value * {
3467         if (auto *ZI = dyn_cast<ZExtInst>(V))
3468           if (ZI->getSrcTy() == TruncatedTy)
3469             return ZI->getOperand(0);
3470         return B.CreateZExtOrTrunc(V, TruncatedTy);
3471       };
3472 
3473       // The actual instruction modification depends on the instruction type,
3474       // unfortunately.
3475       Value *NewI = nullptr;
3476       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3477         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3478                              ShrinkOperand(BO->getOperand(1)));
3479 
3480         // Any wrapping introduced by shrinking this operation shouldn't be
3481         // considered undefined behavior. So, we can't unconditionally copy
3482         // arithmetic wrapping flags to NewI.
3483         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3484       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3485         NewI =
3486             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3487                          ShrinkOperand(CI->getOperand(1)));
3488       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3489         NewI = B.CreateSelect(SI->getCondition(),
3490                               ShrinkOperand(SI->getTrueValue()),
3491                               ShrinkOperand(SI->getFalseValue()));
3492       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3493         switch (CI->getOpcode()) {
3494         default:
3495           llvm_unreachable("Unhandled cast!");
3496         case Instruction::Trunc:
3497           NewI = ShrinkOperand(CI->getOperand(0));
3498           break;
3499         case Instruction::SExt:
3500           NewI = B.CreateSExtOrTrunc(
3501               CI->getOperand(0),
3502               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3503           break;
3504         case Instruction::ZExt:
3505           NewI = B.CreateZExtOrTrunc(
3506               CI->getOperand(0),
3507               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3508           break;
3509         }
3510       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3511         auto Elements0 =
3512             cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
3513         auto *O0 = B.CreateZExtOrTrunc(
3514             SI->getOperand(0),
3515             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3516         auto Elements1 =
3517             cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
3518         auto *O1 = B.CreateZExtOrTrunc(
3519             SI->getOperand(1),
3520             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3521 
3522         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3523       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3524         // Don't do anything with the operands, just extend the result.
3525         continue;
3526       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3527         auto Elements =
3528             cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
3529         auto *O0 = B.CreateZExtOrTrunc(
3530             IE->getOperand(0),
3531             FixedVectorType::get(ScalarTruncatedTy, Elements));
3532         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3533         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3534       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3535         auto Elements =
3536             cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
3537         auto *O0 = B.CreateZExtOrTrunc(
3538             EE->getOperand(0),
3539             FixedVectorType::get(ScalarTruncatedTy, Elements));
3540         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3541       } else {
3542         // If we don't know what to do, be conservative and don't do anything.
3543         continue;
3544       }
3545 
3546       // Lastly, extend the result.
3547       NewI->takeName(cast<Instruction>(I));
3548       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3549       I->replaceAllUsesWith(Res);
3550       cast<Instruction>(I)->eraseFromParent();
3551       Erased.insert(I);
3552       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3553     }
3554   }
3555 
  // We may have created a bunch of ZExts that are now unused. Clean them up.
3557   for (const auto &KV : Cost->getMinimalBitwidths()) {
3558     // If the value wasn't vectorized, we must maintain the original scalar
3559     // type. The absence of the value from VectorLoopValueMap indicates that it
3560     // wasn't vectorized.
3561     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3562       continue;
3563     for (unsigned Part = 0; Part < UF; ++Part) {
3564       Value *I = getOrCreateVectorValue(KV.first, Part);
3565       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3566       if (Inst && Inst->use_empty()) {
3567         Value *NewI = Inst->getOperand(0);
3568         Inst->eraseFromParent();
3569         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3570       }
3571     }
3572   }
3573 }
3574 
3575 void InnerLoopVectorizer::fixVectorizedLoop() {
3576   // Insert truncates and extends for any truncated instructions as hints to
3577   // InstCombine.
3578   if (VF > 1)
3579     truncateToMinimalBitwidths();
3580 
3581   // Fix widened non-induction PHIs by setting up the PHI operands.
3582   if (OrigPHIsToFix.size()) {
3583     assert(EnableVPlanNativePath &&
3584            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3585     fixNonInductionPHIs();
3586   }
3587 
3588   // At this point every instruction in the original loop is widened to a
3589   // vector form. Now we need to fix the recurrences in the loop. These PHI
3590   // nodes are currently empty because we did not want to introduce cycles.
3591   // This is the second stage of vectorizing recurrences.
3592   fixCrossIterationPHIs();
3593 
3594   // Forget the original basic block.
3595   PSE.getSE()->forgetLoop(OrigLoop);
3596 
3597   // Fix-up external users of the induction variables.
3598   for (auto &Entry : Legal->getInductionVars())
3599     fixupIVUsers(Entry.first, Entry.second,
3600                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3601                  IVEndValues[Entry.first], LoopMiddleBlock);
3602 
3603   fixLCSSAPHIs();
3604   for (Instruction *PI : PredicatedInstructions)
3605     sinkScalarOperands(&*PI);
3606 
3607   // Remove redundant induction instructions.
3608   cse(LoopVectorBody);
3609 
3610   // Set/update profile weights for the vector and remainder loops as original
3611   // loop iterations are now distributed among them. Note that original loop
3612   // represented by LoopScalarBody becomes remainder loop after vectorization.
3613   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3619   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3620                                LI->getLoopFor(LoopVectorBody),
3621                                LI->getLoopFor(LoopScalarBody), VF * UF);
3622 }
3623 
3624 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3625   // In order to support recurrences we need to be able to vectorize Phi nodes.
3626   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3627   // stage #2: We now need to fix the recurrences by adding incoming edges to
3628   // the currently empty PHI nodes. At this point every instruction in the
3629   // original loop is widened to a vector form so we can use them to construct
3630   // the incoming edges.
3631   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3632     // Handle first-order recurrences and reductions that need to be fixed.
3633     if (Legal->isFirstOrderRecurrence(&Phi))
3634       fixFirstOrderRecurrence(&Phi);
3635     else if (Legal->isReductionVariable(&Phi))
3636       fixReduction(&Phi);
3637   }
3638 }
3639 
3640 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3641   // This is the second phase of vectorizing first-order recurrences. An
3642   // overview of the transformation is described below. Suppose we have the
3643   // following loop.
3644   //
3645   //   for (int i = 0; i < n; ++i)
3646   //     b[i] = a[i] - a[i - 1];
3647   //
3648   // There is a first-order recurrence on "a". For this loop, the shorthand
3649   // scalar IR looks like:
3650   //
3651   //   scalar.ph:
3652   //     s_init = a[-1]
3653   //     br scalar.body
3654   //
3655   //   scalar.body:
3656   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3657   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3658   //     s2 = a[i]
3659   //     b[i] = s2 - s1
3660   //     br cond, scalar.body, ...
3661   //
3662   // In this example, s1 is a recurrence because its value depends on the
3663   // previous iteration. In the first phase of vectorization, we created a
3664   // temporary value for s1. We now complete the vectorization and produce the
3665   // shorthand vector IR shown below (for VF = 4, UF = 1).
3666   //
3667   //   vector.ph:
3668   //     v_init = vector(..., ..., ..., a[-1])
3669   //     br vector.body
3670   //
3671   //   vector.body
3672   //     i = phi [0, vector.ph], [i+4, vector.body]
3673   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3674   //     v2 = a[i, i+1, i+2, i+3];
3675   //     v3 = vector(v1(3), v2(0, 1, 2))
3676   //     b[i, i+1, i+2, i+3] = v2 - v3
3677   //     br cond, vector.body, middle.block
3678   //
3679   //   middle.block:
3680   //     x = v2(3)
3681   //     br scalar.ph
3682   //
3683   //   scalar.ph:
3684   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3685   //     br scalar.body
3686   //
3687   // After the vector loop finishes executing, we extract the next value of
3688   // the recurrence (x) to use as the initial value in the scalar loop.
3689 
3690   // Get the original loop preheader and single loop latch.
3691   auto *Preheader = OrigLoop->getLoopPreheader();
3692   auto *Latch = OrigLoop->getLoopLatch();
3693 
3694   // Get the initial and previous values of the scalar recurrence.
3695   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3696   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3697 
3698   // Create a vector from the initial value.
3699   auto *VectorInit = ScalarInit;
3700   if (VF > 1) {
3701     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3702     VectorInit = Builder.CreateInsertElement(
3703         UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)),
3704         VectorInit, Builder.getInt32(VF - 1), "vector.recur.init");
3705   }
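  // For VF = 4, for example, this yields <undef, undef, undef, s_init>, i.e.,
  // the scalar initial value placed in the last lane (v_init above).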
3706 
3707   // We constructed a temporary phi node in the first phase of vectorization.
3708   // This phi node will eventually be deleted.
3709   Builder.SetInsertPoint(
3710       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3711 
3712   // Create a phi node for the new recurrence. The current value will either be
3713   // the initial value inserted into a vector or loop-varying vector value.
3714   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3715   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3716 
3717   // Get the vectorized previous value of the last part UF - 1. It appears last
3718   // among all unrolled iterations, due to the order of their construction.
3719   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3720 
3721   // Find and set the insertion point after the previous value if it is an
3722   // instruction.
3723   BasicBlock::iterator InsertPt;
3724   // Note that the previous value may have been constant-folded so it is not
3725   // guaranteed to be an instruction in the vector loop.
3726   // FIXME: Loop invariant values do not form recurrences. We should deal with
3727   //        them earlier.
3728   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3729     InsertPt = LoopVectorBody->getFirstInsertionPt();
3730   else {
3731     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3732     if (isa<PHINode>(PreviousLastPart))
3733       // If the previous value is a phi node, we should insert after all the phi
3734       // nodes in the block containing the PHI to avoid breaking basic block
3735       // verification. Note that the basic block may be different to
3736       // LoopVectorBody, in case we predicate the loop.
3737       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3738     else
3739       InsertPt = ++PreviousInst->getIterator();
3740   }
3741   Builder.SetInsertPoint(&*InsertPt);
3742 
3743   // We will construct a vector for the recurrence by combining the values for
3744   // the current and previous iterations. This is the required shuffle mask.
3745   SmallVector<int, 8> ShuffleMask(VF);
3746   ShuffleMask[0] = VF - 1;
3747   for (unsigned I = 1; I < VF; ++I)
3748     ShuffleMask[I] = I + VF - 1;
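  // For example, for VF = 4 the mask is <3, 4, 5, 6>: the last element of the
  // first shuffle operand followed by the first three elements of the second,
  // producing v3 = vector(v1(3), v2(0, 1, 2)) from the example above.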
3749 
3750   // The vector from which to take the initial value for the current iteration
3751   // (actual or unrolled). Initially, this is the vector phi node.
3752   Value *Incoming = VecPhi;
3753 
3754   // Shuffle the current and previous vector and update the vector parts.
3755   for (unsigned Part = 0; Part < UF; ++Part) {
3756     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3757     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3758     auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3759                                                          ShuffleMask)
3760                            : Incoming;
3761     PhiPart->replaceAllUsesWith(Shuffle);
3762     cast<Instruction>(PhiPart)->eraseFromParent();
3763     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3764     Incoming = PreviousPart;
3765   }
3766 
3767   // Fix the latch value of the new recurrence in the vector loop.
3768   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3769 
3770   // Extract the last vector element in the middle block. This will be the
3771   // initial value for the recurrence when jumping to the scalar loop.
3772   auto *ExtractForScalar = Incoming;
3773   if (VF > 1) {
3774     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3775     ExtractForScalar = Builder.CreateExtractElement(
3776         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3777   }
3778   // Extract the second-to-last element in the middle block if the
3779   // Phi is used outside the loop. We need to extract the phi itself
3780   // and not the last element (the phi update in the current iteration). This
3781   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3782   // when the scalar loop is not run at all.
3783   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3784   if (VF > 1)
3785     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3786         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3787   // When the loop is unrolled without being vectorized, initialize
3788   // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3789   // value of `Incoming`. This is analogous to the vectorized case above:
3790   // extracting the second-to-last element when VF > 1.
3791   else if (UF > 1)
3792     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
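  // In the example above (VF = 4, UF = 1), ExtractForScalar is v2(3), i.e. x,
  // and ExtractForPhiUsedOutsideLoop is v2(2), the value the scalar phi s1
  // holds in the last iteration executed by the vector loop.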
3793 
3794   // Fix the initial value of the original recurrence in the scalar loop.
3795   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3796   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3797   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3798     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3799     Start->addIncoming(Incoming, BB);
3800   }
3801 
3802   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3803   Phi->setName("scalar.recur");
3804 
3805   // Finally, fix users of the recurrence outside the loop. The users will need
3806   // either the last value of the scalar recurrence or the last value of the
3807   // vector recurrence we extracted in the middle block. Since the loop is in
3808   // LCSSA form, we just need to find all the phi nodes for the original scalar
3809   // recurrence in the exit block, and then add an edge for the middle block.
3810   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3811     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3812       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3813     }
3814   }
3815 }
3816 
3817 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3818   Constant *Zero = Builder.getInt32(0);
3819 
3820   // Get its reduction variable descriptor.
3821   assert(Legal->isReductionVariable(Phi) &&
3822          "Unable to find the reduction variable");
3823   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3824 
3825   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3826   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3827   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3828   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3829     RdxDesc.getMinMaxRecurrenceKind();
3830   setDebugLocFromInst(Builder, ReductionStartValue);
3831   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3832 
3833   // We need to generate a reduction vector from the incoming scalar.
3834   // To do so, we need to generate the 'identity' vector and override
3835   // one of the elements with the incoming scalar reduction. We need
3836   // to do it in the vector-loop preheader.
3837   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3838 
3839   // This is the vector-clone of the value that leaves the loop.
3840   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3841 
3842   // Find the reduction identity value: zero for addition, or and xor; one for
3843   // multiplication; -1 for and.
3844   Value *Identity;
3845   Value *VectorStart;
3846   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3847       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3848     // MinMax reductions have the start value as their identity.
3849     if (VF == 1 || IsInLoopReductionPhi) {
3850       VectorStart = Identity = ReductionStartValue;
3851     } else {
3852       VectorStart = Identity =
3853         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3854     }
3855   } else {
3856     // Handle other reduction kinds:
3857     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3858         RK, VecTy->getScalarType());
3859     if (VF == 1 || IsInLoopReductionPhi) {
3860       Identity = Iden;
3861       // This vector is the Identity vector where the first element is the
3862       // incoming scalar reduction.
3863       VectorStart = ReductionStartValue;
3864     } else {
3865       Identity = ConstantVector::getSplat(ElementCount::getFixed(VF), Iden);
3866 
3867       // This vector is the Identity vector where the first element is the
3868       // incoming scalar reduction.
3869       VectorStart =
3870         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3871     }
3872   }
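  // For example, for an integer add reduction with VF = 4 and start value %s,
  // Identity is <0, 0, 0, 0> and VectorStart is <%s, 0, 0, 0>.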
3873 
3874   // Wrap flags are in general invalid after vectorization, clear them.
3875   clearReductionWrapFlags(RdxDesc);
3876 
3877   // Fix the vector-loop phi.
3878 
3879   // Reductions do not have to start at zero. They can start with
3880   // any loop invariant values.
3881   BasicBlock *Latch = OrigLoop->getLoopLatch();
3882   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3883 
3884   for (unsigned Part = 0; Part < UF; ++Part) {
3885     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3886     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3887     // Make sure to add the reduction start value only to the
3888     // first unroll part.
3889     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3890     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3891     cast<PHINode>(VecRdxPhi)
3892       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3893   }
3894 
3895   // Before each round, move the insertion point right between
3896   // the PHIs and the values we are going to write.
3897   // This allows us to write both PHINodes and the extractelement
3898   // instructions.
3899   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3900 
3901   setDebugLocFromInst(Builder, LoopExitInst);
3902 
3903   // If the tail is folded by masking, the vector value that leaves the loop
3904   // should be a Select choosing between the vectorized LoopExitInst and the
3905   // vectorized Phi, rather than the LoopExitInst itself.
3906   if (Cost->foldTailByMasking()) {
3907     for (unsigned Part = 0; Part < UF; ++Part) {
3908       Value *VecLoopExitInst =
3909           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3910       Value *Sel = nullptr;
3911       for (User *U : VecLoopExitInst->users()) {
3912         if (isa<SelectInst>(U)) {
3913           assert(!Sel && "Reduction exit feeding two selects");
3914           Sel = U;
3915         } else
3916           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3917       }
3918       assert(Sel && "Reduction exit feeds no select");
3919       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3920     }
3921   }
3922 
3923   // If the vector reduction can be performed in a smaller type, we truncate
3924   // then extend the loop exit value to enable InstCombine to evaluate the
3925   // entire expression in the smaller type.
3926   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3927     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
3928     Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF);
3929     Builder.SetInsertPoint(
3930         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3931     VectorParts RdxParts(UF);
3932     for (unsigned Part = 0; Part < UF; ++Part) {
3933       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3934       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3935       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3936                                         : Builder.CreateZExt(Trunc, VecTy);
3937       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3938            UI != RdxParts[Part]->user_end();)
3939         if (*UI != Trunc) {
3940           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3941           RdxParts[Part] = Extnd;
3942         } else {
3943           ++UI;
3944         }
3945     }
3946     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3947     for (unsigned Part = 0; Part < UF; ++Part) {
3948       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3949       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3950     }
3951   }
3952 
3953   // Reduce all of the unrolled parts into a single vector.
3954   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3955   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3956 
3957   // The middle block terminator has already been assigned a DebugLoc here (the
3958   // OrigLoop's single latch terminator). We want the whole middle block to
3959   // appear to execute on this line because: (a) it is all compiler generated,
3960   // (b) these instructions are always executed after evaluating the latch
3961   // conditional branch, and (c) other passes may add new predecessors which
3962   // terminate on this line. This is the easiest way to ensure we don't
3963   // accidentally cause an extra step back into the loop while debugging.
3964   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3965   for (unsigned Part = 1; Part < UF; ++Part) {
3966     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3967     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3968       // Floating point operations had to be 'fast' to enable the reduction.
3969       ReducedPartRdx = addFastMathFlag(
3970           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3971                               ReducedPartRdx, "bin.rdx"),
3972           RdxDesc.getFastMathFlags());
3973     else
3974       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3975                                       RdxPart);
3976   }
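  // For example, with UF = 2 and an integer add reduction, the two unrolled
  // parts are combined here into a single vector value "bin.rdx".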
3977 
3978   // Create the reduction after the loop. Note that inloop reductions create the
3979   // target reduction in the loop using a Reduction recipe.
3980   if (VF > 1 && !IsInLoopReductionPhi) {
3981     bool NoNaN = Legal->hasFunNoNaNAttr();
3982     ReducedPartRdx =
3983         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3984     // If the reduction can be performed in a smaller type, we need to extend
3985     // the reduction to the wider type before we branch to the original loop.
3986     if (Phi->getType() != RdxDesc.getRecurrenceType())
3987       ReducedPartRdx =
3988         RdxDesc.isSigned()
3989         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3990         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3991   }
3992 
3993   // Create a phi node that merges control-flow from the backedge-taken check
3994   // block and the middle block.
3995   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3996                                         LoopScalarPreHeader->getTerminator());
3997   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3998     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3999   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4000 
4001   // Now, we need to fix the users of the reduction variable
4002   // inside and outside of the scalar remainder loop.
4003   // We know that the loop is in LCSSA form. We need to update the
4004   // PHI nodes in the exit blocks.
4005   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4006     // All PHINodes need to have a single entry edge, or two if
4007     // we already fixed them.
4008     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4009 
4010     // We found a reduction value exit-PHI. Update it with the
4011     // incoming bypass edge.
4012     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4013       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4014   } // end of the LCSSA phi scan.
4015 
4016   // Fix the scalar loop reduction variable with the incoming reduction sum
4017   // from the vector body and from the backedge value.
4018   int IncomingEdgeBlockIdx =
4019     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4020   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4021   // Pick the other block.
4022   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4023   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4024   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4025 }
4026 
4027 void InnerLoopVectorizer::clearReductionWrapFlags(
4028     RecurrenceDescriptor &RdxDesc) {
4029   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4030   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4031       RK != RecurrenceDescriptor::RK_IntegerMult)
4032     return;
4033 
4034   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4035   assert(LoopExitInstr && "null loop exit instruction");
4036   SmallVector<Instruction *, 8> Worklist;
4037   SmallPtrSet<Instruction *, 8> Visited;
4038   Worklist.push_back(LoopExitInstr);
4039   Visited.insert(LoopExitInstr);
4040 
4041   while (!Worklist.empty()) {
4042     Instruction *Cur = Worklist.pop_back_val();
4043     if (isa<OverflowingBinaryOperator>(Cur))
4044       for (unsigned Part = 0; Part < UF; ++Part) {
4045         Value *V = getOrCreateVectorValue(Cur, Part);
4046         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4047       }
4048 
4049     for (User *U : Cur->users()) {
4050       Instruction *UI = cast<Instruction>(U);
4051       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4052           Visited.insert(UI).second)
4053         Worklist.push_back(UI);
4054     }
4055   }
4056 }
4057 
4058 void InnerLoopVectorizer::fixLCSSAPHIs() {
4059   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4060     if (LCSSAPhi.getNumIncomingValues() == 1) {
4061       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4062       // Non-instruction incoming values will have only one value.
4063       unsigned LastLane = 0;
4064       if (isa<Instruction>(IncomingValue))
4065         LastLane = Cost->isUniformAfterVectorization(
4066                        cast<Instruction>(IncomingValue), VF)
4067                        ? 0
4068                        : VF - 1;
4069       // Can be a loop invariant incoming value or the last scalar value to be
4070       // extracted from the vectorized loop.
4071       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4072       Value *lastIncomingValue =
4073           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4074       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4075     }
4076   }
4077 }
4078 
4079 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4080   // The basic block and loop containing the predicated instruction.
4081   auto *PredBB = PredInst->getParent();
4082   auto *VectorLoop = LI->getLoopFor(PredBB);
4083 
4084   // Initialize a worklist with the operands of the predicated instruction.
4085   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4086 
4087   // Holds instructions that we need to analyze again. An instruction may be
4088   // reanalyzed if we don't yet know if we can sink it or not.
4089   SmallVector<Instruction *, 8> InstsToReanalyze;
4090 
4091   // Returns true if a given use occurs in the predicated block. Phi nodes use
4092   // their operands in their corresponding predecessor blocks.
4093   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4094     auto *I = cast<Instruction>(U.getUser());
4095     BasicBlock *BB = I->getParent();
4096     if (auto *Phi = dyn_cast<PHINode>(I))
4097       BB = Phi->getIncomingBlock(
4098           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4099     return BB == PredBB;
4100   };
4101 
4102   // Iteratively sink the scalarized operands of the predicated instruction
4103   // into the block we created for it. When an instruction is sunk, its
4104   // operands are then added to the worklist. The algorithm ends after one pass
4105   // through the worklist doesn't sink a single instruction.
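  // For example, a scalarized, predicated udiv is typically fed by
  // extractelement instructions created in the vector loop; once all their
  // remaining uses are in the predicated block, they can be sunk into it so
  // that they only execute when the predicate holds.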
4106   bool Changed;
4107   do {
4108     // Add the instructions that need to be reanalyzed to the worklist, and
4109     // reset the changed indicator.
4110     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4111     InstsToReanalyze.clear();
4112     Changed = false;
4113 
4114     while (!Worklist.empty()) {
4115       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4116 
4117       // We can't sink an instruction if it is a phi node, is already in the
4118       // predicated block, is not in the loop, or may have side effects.
4119       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4120           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4121         continue;
4122 
4123       // It's legal to sink the instruction if all its uses occur in the
4124       // predicated block. Otherwise, there's nothing to do yet, and we may
4125       // need to reanalyze the instruction.
4126       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4127         InstsToReanalyze.push_back(I);
4128         continue;
4129       }
4130 
4131       // Move the instruction to the beginning of the predicated block, and add
4132       // its operands to the worklist.
4133       I->moveBefore(&*PredBB->getFirstInsertionPt());
4134       Worklist.insert(I->op_begin(), I->op_end());
4135 
4136       // The sinking may have enabled other instructions to be sunk, so we will
4137       // need to iterate.
4138       Changed = true;
4139     }
4140   } while (Changed);
4141 }
4142 
4143 void InnerLoopVectorizer::fixNonInductionPHIs() {
4144   for (PHINode *OrigPhi : OrigPHIsToFix) {
4145     PHINode *NewPhi =
4146         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4147     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4148 
4149     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4150         predecessors(OrigPhi->getParent()));
4151     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4152         predecessors(NewPhi->getParent()));
4153     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4154            "Scalar and Vector BB should have the same number of predecessors");
4155 
4156     // The insertion point in Builder may be invalidated by the time we get
4157     // here. Force the Builder insertion point to something valid so that we do
4158     // not run into issues during insertion point restore in
4159     // getOrCreateVectorValue calls below.
4160     Builder.SetInsertPoint(NewPhi);
4161 
4162     // The predecessor order is preserved and we can rely on mapping between
4163     // scalar and vector block predecessors.
4164     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4165       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4166 
4167       // When looking up the new scalar/vector values to fix up, use incoming
4168       // values from original phi.
4169       Value *ScIncV =
4170           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4171 
4172       // Scalar incoming value may need a broadcast
4173       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4174       NewPhi->addIncoming(NewIncV, NewPredBB);
4175     }
4176   }
4177 }
4178 
4179 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4180                                    unsigned UF, unsigned VF,
4181                                    bool IsPtrLoopInvariant,
4182                                    SmallBitVector &IsIndexLoopInvariant,
4183                                    VPTransformState &State) {
4184   // Construct a vector GEP by widening the operands of the scalar GEP as
4185   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4186   // results in a vector of pointers when at least one operand of the GEP
4187   // is vector-typed. Thus, to keep the representation compact, we only use
4188   // vector-typed operands for loop-varying values.
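  // For example, a GEP with a loop-invariant base pointer and a loop-varying
  // index is widened into a GEP with the scalar base and a vector index,
  // yielding a vector of pointers.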
4189 
4190   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4191     // If we are vectorizing, but the GEP has only loop-invariant operands,
4192     // the GEP we build (by only using vector-typed operands for
4193     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4194     // produce a vector of pointers, we need to either arbitrarily pick an
4195     // operand to broadcast, or broadcast a clone of the original GEP.
4196     // Here, we broadcast a clone of the original.
4197     //
4198     // TODO: If at some point we decide to scalarize instructions having
4199     //       loop-invariant operands, this special case will no longer be
4200     //       required. We would add the scalarization decision to
4201     //       collectLoopScalars() and teach getVectorValue() to broadcast
4202     //       the lane-zero scalar value.
4203     auto *Clone = Builder.Insert(GEP->clone());
4204     for (unsigned Part = 0; Part < UF; ++Part) {
4205       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4206       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4207       addMetadata(EntryPart, GEP);
4208     }
4209   } else {
4210     // If the GEP has at least one loop-varying operand, we are sure to
4211     // produce a vector of pointers. But if we are only unrolling, we want
4212     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4213     // produce with the code below will be scalar (if VF == 1) or vector
4214     // (otherwise). Note that for the unroll-only case, we still maintain
4215     // values in the vector mapping (VectorLoopValueMap), as we do for other
4216     // instructions.
4217     for (unsigned Part = 0; Part < UF; ++Part) {
4218       // The pointer operand of the new GEP. If it's loop-invariant, we
4219       // won't broadcast it.
4220       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4221                                      : State.get(Operands.getOperand(0), Part);
4222 
4223       // Collect all the indices for the new GEP. If any index is
4224       // loop-invariant, we won't broadcast it.
4225       SmallVector<Value *, 4> Indices;
4226       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4227         VPValue *Operand = Operands.getOperand(I);
4228         if (IsIndexLoopInvariant[I - 1])
4229           Indices.push_back(State.get(Operand, {0, 0}));
4230         else
4231           Indices.push_back(State.get(Operand, Part));
4232       }
4233 
4234       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4235       // but it should be a vector, otherwise.
4236       auto *NewGEP =
4237           GEP->isInBounds()
4238               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4239                                           Indices)
4240               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4241       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4242              "NewGEP is not a pointer vector");
4243       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4244       addMetadata(NewGEP, GEP);
4245     }
4246   }
4247 }
4248 
4249 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4250                                               unsigned VF) {
4251   PHINode *P = cast<PHINode>(PN);
4252   if (EnableVPlanNativePath) {
4253     // Currently we enter here in the VPlan-native path for non-induction
4254     // PHIs where all control flow is uniform. We simply widen these PHIs.
4255     // Create a vector phi with no operands - the vector phi operands will be
4256     // set at the end of vector code generation.
4257     Type *VecTy =
4258         (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
4259     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4260     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4261     OrigPHIsToFix.push_back(P);
4262 
4263     return;
4264   }
4265 
4266   assert(PN->getParent() == OrigLoop->getHeader() &&
4267          "Non-header phis should have been handled elsewhere");
4268 
4269   // In order to support recurrences we need to be able to vectorize Phi nodes.
4270   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4271   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4272   // this value when we vectorize all of the instructions that use the PHI.
4273   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4274     for (unsigned Part = 0; Part < UF; ++Part) {
4275       // This is phase one of vectorizing PHIs.
4276       bool ScalarPHI = (VF == 1) || Cost->isInLoopReduction(cast<PHINode>(PN));
4277       Type *VecTy =
4278           ScalarPHI ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
4279       Value *EntryPart = PHINode::Create(
4280           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4281       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4282     }
4283     return;
4284   }
4285 
4286   setDebugLocFromInst(Builder, P);
4287 
4288   // This PHINode must be an induction variable.
4289   // Make sure that we know about it.
4290   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4291 
4292   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4293   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4294 
4295   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4296   // which can be found from the original scalar operations.
4297   switch (II.getKind()) {
4298   case InductionDescriptor::IK_NoInduction:
4299     llvm_unreachable("Unknown induction");
4300   case InductionDescriptor::IK_IntInduction:
4301   case InductionDescriptor::IK_FpInduction:
4302     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4303   case InductionDescriptor::IK_PtrInduction: {
4304     // Handle the pointer induction variable case.
4305     assert(P->getType()->isPointerTy() && "Unexpected type.");
4306 
4307     if (Cost->isScalarAfterVectorization(P, VF)) {
4308       // This is the normalized GEP that starts counting at zero.
4309       Value *PtrInd =
4310           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4311       // Determine the number of scalars we need to generate for each unroll
4312       // iteration. If the instruction is uniform, we only need to generate the
4313       // first lane. Otherwise, we generate all VF values.
4314       unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4315       for (unsigned Part = 0; Part < UF; ++Part) {
4316         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4317           Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4318           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4319           Value *SclrGep =
4320               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4321           SclrGep->setName("next.gep");
4322           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4323         }
4324       }
4325       return;
4326     }
4327     assert(isa<SCEVConstant>(II.getStep()) &&
4328            "Induction step not a SCEV constant!");
4329     Type *PhiType = II.getStep()->getType();
4330 
4331     // Build a pointer phi
4332     Value *ScalarStartValue = II.getStartValue();
4333     Type *ScStValueType = ScalarStartValue->getType();
4334     PHINode *NewPointerPhi =
4335         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4336     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4337 
4338     // A pointer induction, performed by using a gep
4339     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4340     Instruction *InductionLoc = LoopLatch->getTerminator();
4341     const SCEV *ScalarStep = II.getStep();
4342     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4343     Value *ScalarStepValue =
4344         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4345     Value *InductionGEP = GetElementPtrInst::Create(
4346         ScStValueType->getPointerElementType(), NewPointerPhi,
4347         Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)),
4348         "ptr.ind", InductionLoc);
4349     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4350 
4351     // Create UF many actual address geps that use the pointer
4352     // phi as base and a vectorized version of the step value
4353     // (<step*0, ..., step*N>) as offset.
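    // For example, with VF = 4 and UF = 2, part 0 uses offsets
    // <0, 1, 2, 3> * step and part 1 uses offsets <4, 5, 6, 7> * step, while
    // the pointer phi itself advances by 8 * step each vector iteration.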
4354     for (unsigned Part = 0; Part < UF; ++Part) {
4355       SmallVector<Constant *, 8> Indices;
4356       // Create a vector of consecutive numbers from zero to VF.
4357       for (unsigned i = 0; i < VF; ++i)
4358         Indices.push_back(ConstantInt::get(PhiType, i + Part * VF));
4359       Constant *StartOffset = ConstantVector::get(Indices);
4360 
4361       Value *GEP = Builder.CreateGEP(
4362           ScStValueType->getPointerElementType(), NewPointerPhi,
4363           Builder.CreateMul(StartOffset,
4364                             Builder.CreateVectorSplat(VF, ScalarStepValue),
4365                             "vector.gep"));
4366       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4367     }
4368   }
4369   }
4370 }
4371 
4372 /// A helper function for checking whether an integer division-related
4373 /// instruction may divide by zero (in which case it must be predicated if
4374 /// executed conditionally in the scalar code).
4375 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4376 /// Non-zero divisors that are not compile-time constants will not be
4377 /// converted into multiplication, so we will still end up scalarizing
4378 /// the division, but can do so without predication.
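/// For example, 'udiv %x, 7' can never divide by zero, whereas 'udiv %x, %n'
/// with a non-constant %n is conservatively assumed to potentially do so.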
4379 static bool mayDivideByZero(Instruction &I) {
4380   assert((I.getOpcode() == Instruction::UDiv ||
4381           I.getOpcode() == Instruction::SDiv ||
4382           I.getOpcode() == Instruction::URem ||
4383           I.getOpcode() == Instruction::SRem) &&
4384          "Unexpected instruction");
4385   Value *Divisor = I.getOperand(1);
4386   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4387   return !CInt || CInt->isZero();
4388 }
4389 
4390 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4391                                            VPTransformState &State) {
4392   switch (I.getOpcode()) {
4393   case Instruction::Call:
4394   case Instruction::Br:
4395   case Instruction::PHI:
4396   case Instruction::GetElementPtr:
4397   case Instruction::Select:
4398     llvm_unreachable("This instruction is handled by a different recipe.");
4399   case Instruction::UDiv:
4400   case Instruction::SDiv:
4401   case Instruction::SRem:
4402   case Instruction::URem:
4403   case Instruction::Add:
4404   case Instruction::FAdd:
4405   case Instruction::Sub:
4406   case Instruction::FSub:
4407   case Instruction::FNeg:
4408   case Instruction::Mul:
4409   case Instruction::FMul:
4410   case Instruction::FDiv:
4411   case Instruction::FRem:
4412   case Instruction::Shl:
4413   case Instruction::LShr:
4414   case Instruction::AShr:
4415   case Instruction::And:
4416   case Instruction::Or:
4417   case Instruction::Xor: {
4418     // Just widen unops and binops.
4419     setDebugLocFromInst(Builder, &I);
4420 
4421     for (unsigned Part = 0; Part < UF; ++Part) {
4422       SmallVector<Value *, 2> Ops;
4423       for (VPValue *VPOp : User.operands())
4424         Ops.push_back(State.get(VPOp, Part));
4425 
4426       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4427 
4428       if (auto *VecOp = dyn_cast<Instruction>(V))
4429         VecOp->copyIRFlags(&I);
4430 
4431       // Use this vector value for all users of the original instruction.
4432       VectorLoopValueMap.setVectorValue(&I, Part, V);
4433       addMetadata(V, &I);
4434     }
4435 
4436     break;
4437   }
4438   case Instruction::ICmp:
4439   case Instruction::FCmp: {
4440     // Widen compares. Generate vector compares.
4441     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4442     auto *Cmp = cast<CmpInst>(&I);
4443     setDebugLocFromInst(Builder, Cmp);
4444     for (unsigned Part = 0; Part < UF; ++Part) {
4445       Value *A = State.get(User.getOperand(0), Part);
4446       Value *B = State.get(User.getOperand(1), Part);
4447       Value *C = nullptr;
4448       if (FCmp) {
4449         // Propagate fast math flags.
4450         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4451         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4452         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4453       } else {
4454         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4455       }
4456       VectorLoopValueMap.setVectorValue(&I, Part, C);
4457       addMetadata(C, &I);
4458     }
4459 
4460     break;
4461   }
4462 
4463   case Instruction::ZExt:
4464   case Instruction::SExt:
4465   case Instruction::FPToUI:
4466   case Instruction::FPToSI:
4467   case Instruction::FPExt:
4468   case Instruction::PtrToInt:
4469   case Instruction::IntToPtr:
4470   case Instruction::SIToFP:
4471   case Instruction::UIToFP:
4472   case Instruction::Trunc:
4473   case Instruction::FPTrunc:
4474   case Instruction::BitCast: {
4475     auto *CI = cast<CastInst>(&I);
4476     setDebugLocFromInst(Builder, CI);
4477 
4478     // Vectorize casts.
4479     Type *DestTy =
4480         (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF);
4481 
4482     for (unsigned Part = 0; Part < UF; ++Part) {
4483       Value *A = State.get(User.getOperand(0), Part);
4484       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4485       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4486       addMetadata(Cast, &I);
4487     }
4488     break;
4489   }
4490   default:
4491     // This instruction is not vectorized by simple widening.
4492     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4493     llvm_unreachable("Unhandled instruction!");
4494   } // end of switch.
4495 }
4496 
4497 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4498                                                VPTransformState &State) {
4499   assert(!isa<DbgInfoIntrinsic>(I) &&
4500          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4501   setDebugLocFromInst(Builder, &I);
4502 
4503   Module *M = I.getParent()->getParent()->getParent();
4504   auto *CI = cast<CallInst>(&I);
4505 
4506   SmallVector<Type *, 4> Tys;
4507   for (Value *ArgOperand : CI->arg_operands())
4508     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4509 
4510   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4511 
4512   // This flag shows whether we use an intrinsic or a plain call for the
4513   // vectorized version of the instruction, i.e., whether it is beneficial to
4514   // perform an intrinsic call rather than a library call.
4515   bool NeedToScalarize = false;
4516   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4517   bool UseVectorIntrinsic =
4518       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4519   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4520          "Instruction should be scalarized elsewhere.");
4521 
4522   for (unsigned Part = 0; Part < UF; ++Part) {
4523     SmallVector<Value *, 4> Args;
4524     for (auto &I : enumerate(ArgOperands.operands())) {
4525       // Some intrinsics have a scalar argument - don't replace it with a
4526       // vector.
4527       Value *Arg;
4528       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4529         Arg = State.get(I.value(), Part);
4530       else
4531         Arg = State.get(I.value(), {0, 0});
4532       Args.push_back(Arg);
4533     }
4534 
4535     Function *VectorF;
4536     if (UseVectorIntrinsic) {
4537       // Use vector version of the intrinsic.
4538       Type *TysForDecl[] = {CI->getType()};
4539       if (VF > 1)
4540         TysForDecl[0] =
4541             FixedVectorType::get(CI->getType()->getScalarType(), VF);
4542       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4543       assert(VectorF && "Can't retrieve vector intrinsic.");
4544     } else {
4545       // Use vector version of the function call.
4546       const VFShape Shape = VFShape::get(*CI, ElementCount::getFixed(VF),
4547                                          false /*HasGlobalPred*/);
4548 #ifndef NDEBUG
4549       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4550              "Can't create vector function.");
4551 #endif
4552       VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4553     }
4554     SmallVector<OperandBundleDef, 1> OpBundles;
4555     CI->getOperandBundlesAsDefs(OpBundles);
4556     CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4557 
4558     if (isa<FPMathOperator>(V))
4559       V->copyFastMathFlags(CI);
4560 
4561     VectorLoopValueMap.setVectorValue(&I, Part, V);
4562     addMetadata(V, &I);
4563   }
4564 }
4565 
4566 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4567                                                  VPUser &Operands,
4568                                                  bool InvariantCond,
4569                                                  VPTransformState &State) {
4570   setDebugLocFromInst(Builder, &I);
4571 
4572   // The condition can be loop invariant but still defined inside the
4573   // loop. This means that we can't just use the original 'cond' value.
4574   // We have to take the 'vectorized' value and pick the first lane.
4575   // Instcombine will make this a no-op.
4576   auto *InvarCond =
4577       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4578 
4579   for (unsigned Part = 0; Part < UF; ++Part) {
4580     Value *Cond =
4581         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4582     Value *Op0 = State.get(Operands.getOperand(1), Part);
4583     Value *Op1 = State.get(Operands.getOperand(2), Part);
4584     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4585     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4586     addMetadata(Sel, &I);
4587   }
4588 }
4589 
4590 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4591   // We should not collect Scalars more than once per VF. Right now, this
4592   // function is called from collectUniformsAndScalars(), which already does
4593   // this check. Collecting Scalars for VF=1 does not make any sense.
4594   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4595          "This function should not be visited twice for the same VF");
4596 
4597   SmallSetVector<Instruction *, 8> Worklist;
4598 
4599   // These sets are used to seed the analysis with pointers used by memory
4600   // accesses that will remain scalar.
4601   SmallSetVector<Instruction *, 8> ScalarPtrs;
4602   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4603   auto *Latch = TheLoop->getLoopLatch();
4604 
4605   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4606   // The pointer operands of loads and stores will be scalar as long as the
4607   // memory access is not a gather or scatter operation. The value operand of a
4608   // store will remain scalar if the store is scalarized.
4609   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4610     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4611     assert(WideningDecision != CM_Unknown &&
4612            "Widening decision should be ready at this moment");
4613     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4614       if (Ptr == Store->getValueOperand())
4615         return WideningDecision == CM_Scalarize;
4616     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4617            "Ptr is neither a value or pointer operand");
4618     return WideningDecision != CM_GatherScatter;
4619   };
4620 
4621   // A helper that returns true if the given value is a bitcast or
4622   // getelementptr instruction contained in the loop.
4623   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4624     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4625             isa<GetElementPtrInst>(V)) &&
4626            !TheLoop->isLoopInvariant(V);
4627   };
4628 
4629   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4630     if (!isa<PHINode>(Ptr) ||
4631         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4632       return false;
4633     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4634     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4635       return false;
4636     return isScalarUse(MemAccess, Ptr);
4637   };
4638 
4639   // A helper that evaluates a memory access's use of a pointer. If the
4640   // pointer is the pointer induction of a loop, it is inserted into the
4641   // Worklist. If the use will be a scalar use, and the pointer is only used
4642   // by memory accesses, we place the pointer in ScalarPtrs. Otherwise, the
4643   // pointer is placed in PossibleNonScalarPtrs.
4644   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4645     if (isScalarPtrInduction(MemAccess, Ptr)) {
4646       Worklist.insert(cast<Instruction>(Ptr));
4647       Instruction *Update = cast<Instruction>(
4648           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4649       Worklist.insert(Update);
4650       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4651                         << "\n");
4652       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4653                         << "\n");
4654       return;
4655     }
4656     // We only care about bitcast and getelementptr instructions contained in
4657     // the loop.
4658     if (!isLoopVaryingBitCastOrGEP(Ptr))
4659       return;
4660 
4661     // If the pointer has already been identified as scalar (e.g., if it was
4662     // also identified as uniform), there's nothing to do.
4663     auto *I = cast<Instruction>(Ptr);
4664     if (Worklist.count(I))
4665       return;
4666 
4667     // If the use of the pointer will be a scalar use, and all users of the
4668     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4669     // place the pointer in PossibleNonScalarPtrs.
4670     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4671           return isa<LoadInst>(U) || isa<StoreInst>(U);
4672         }))
4673       ScalarPtrs.insert(I);
4674     else
4675       PossibleNonScalarPtrs.insert(I);
4676   };
4677 
4678   // We seed the scalars analysis with two classes of instructions: (1)
4679   // instructions marked uniform-after-vectorization and (2) bitcast,
4680   // getelementptr and (pointer) phi instructions used by memory accesses
4681   // requiring a scalar use.
4682   //
4683   // (1) Add to the worklist all instructions that have been identified as
4684   // uniform-after-vectorization.
4685   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4686 
4687   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4688   // memory accesses requiring a scalar use. The pointer operands of loads and
4689   // stores will be scalar as long as the memory access is not a gather or
4690   // scatter operation. The value operand of a store will remain scalar if the
4691   // store is scalarized.
4692   for (auto *BB : TheLoop->blocks())
4693     for (auto &I : *BB) {
4694       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4695         evaluatePtrUse(Load, Load->getPointerOperand());
4696       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4697         evaluatePtrUse(Store, Store->getPointerOperand());
4698         evaluatePtrUse(Store, Store->getValueOperand());
4699       }
4700     }
4701   for (auto *I : ScalarPtrs)
4702     if (!PossibleNonScalarPtrs.count(I)) {
4703       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4704       Worklist.insert(I);
4705     }
4706 
4707   // Insert the forced scalars.
4708   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4709   // induction variable when the PHI user is scalarized.
4710   auto ForcedScalar = ForcedScalars.find(VF);
4711   if (ForcedScalar != ForcedScalars.end())
4712     for (auto *I : ForcedScalar->second)
4713       Worklist.insert(I);
4714 
4715   // Expand the worklist by looking through any bitcasts and getelementptr
4716   // instructions we've already identified as scalar. This is similar to the
4717   // expansion step in collectLoopUniforms(); however, here we're only
4718   // expanding to include additional bitcasts and getelementptr instructions.
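  // For example, if a getelementptr already identified as scalar has a
  // loop-varying bitcast as its pointer operand, and every user of that
  // bitcast is either outside the loop, already in the worklist, or a memory
  // access using it as a scalar pointer, the bitcast is added as well.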
4719   unsigned Idx = 0;
4720   while (Idx != Worklist.size()) {
4721     Instruction *Dst = Worklist[Idx++];
4722     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4723       continue;
4724     auto *Src = cast<Instruction>(Dst->getOperand(0));
4725     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4726           auto *J = cast<Instruction>(U);
4727           return !TheLoop->contains(J) || Worklist.count(J) ||
4728                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4729                   isScalarUse(J, Src));
4730         })) {
4731       Worklist.insert(Src);
4732       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4733     }
4734   }
4735 
4736   // An induction variable will remain scalar if all users of the induction
4737   // variable and induction variable update remain scalar.
4738   for (auto &Induction : Legal->getInductionVars()) {
4739     auto *Ind = Induction.first;
4740     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4741 
4742     // If tail-folding is applied, the primary induction variable will be used
4743     // to feed a vector compare.
4744     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4745       continue;
4746 
4747     // Determine if all users of the induction variable are scalar after
4748     // vectorization.
4749     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4750       auto *I = cast<Instruction>(U);
4751       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4752     });
4753     if (!ScalarInd)
4754       continue;
4755 
4756     // Determine if all users of the induction variable update instruction are
4757     // scalar after vectorization.
4758     auto ScalarIndUpdate =
4759         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4760           auto *I = cast<Instruction>(U);
4761           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4762         });
4763     if (!ScalarIndUpdate)
4764       continue;
4765 
4766     // The induction variable and its update instruction will remain scalar.
4767     Worklist.insert(Ind);
4768     Worklist.insert(IndUpdate);
4769     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4770     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4771                       << "\n");
4772   }
4773 
4774   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4775 }
4776 
4777 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4778   if (!blockNeedsPredication(I->getParent()))
4779     return false;
4780   switch(I->getOpcode()) {
4781   default:
4782     break;
4783   case Instruction::Load:
4784   case Instruction::Store: {
4785     if (!Legal->isMaskRequired(I))
4786       return false;
4787     auto *Ptr = getLoadStorePointerOperand(I);
4788     auto *Ty = getMemInstValueType(I);
4789     // We have already decided how to vectorize this instruction, get that
4790     // result.
4791     if (VF > 1) {
4792       InstWidening WideningDecision = getWideningDecision(I, VF);
4793       assert(WideningDecision != CM_Unknown &&
4794              "Widening decision should be ready at this moment");
4795       return WideningDecision == CM_Scalarize;
4796     }
4797     const Align Alignment = getLoadStoreAlignment(I);
4798     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4799                                 isLegalMaskedGather(Ty, Alignment))
4800                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4801                                 isLegalMaskedScatter(Ty, Alignment));
4802   }
4803   case Instruction::UDiv:
4804   case Instruction::SDiv:
4805   case Instruction::SRem:
4806   case Instruction::URem:
4807     return mayDivideByZero(*I);
4808   }
4809   return false;
4810 }
4811 
4812 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4813                                                                unsigned VF) {
4814   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4815   assert(getWideningDecision(I, VF) == CM_Unknown &&
4816          "Decision should not be set yet.");
4817   auto *Group = getInterleavedAccessGroup(I);
4818   assert(Group && "Must have a group.");
4819 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
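  // For example, a type whose allocation size is padded beyond its bit size
  // (x86_fp80 is one likely case) would be irregular here.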
4822   auto &DL = I->getModule()->getDataLayout();
4823   auto *ScalarTy = getMemInstValueType(I);
4824   if (hasIrregularType(ScalarTy, DL, VF))
4825     return false;
4826 
4827   // Check if masking is required.
4828   // A Group may need masking for one of two reasons: it resides in a block that
4829   // needs predication, or it was decided to use masking to deal with gaps.
4830   bool PredicatedAccessRequiresMasking =
4831       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4832   bool AccessWithGapsRequiresMasking =
4833       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4834   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4835     return true;
4836 
4837   // If masked interleaving is required, we expect that the user/target had
4838   // enabled it, because otherwise it either wouldn't have been created or
4839   // it should have been invalidated by the CostModel.
4840   assert(useMaskedInterleavedAccesses(TTI) &&
4841          "Masked interleave-groups for predicated accesses are not enabled.");
4842 
4843   auto *Ty = getMemInstValueType(I);
4844   const Align Alignment = getLoadStoreAlignment(I);
4845   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4846                           : TTI.isLegalMaskedStore(Ty, Alignment);
4847 }
4848 
4849 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4850                                                                unsigned VF) {
4851   // Get and ensure we have a valid memory instruction.
4852   LoadInst *LI = dyn_cast<LoadInst>(I);
4853   StoreInst *SI = dyn_cast<StoreInst>(I);
4854   assert((LI || SI) && "Invalid memory instruction");
4855 
4856   auto *Ptr = getLoadStorePointerOperand(I);
4857 
4858   // In order to be widened, the pointer should be consecutive, first of all.
4859   if (!Legal->isConsecutivePtr(Ptr))
4860     return false;
4861 
4862   // If the instruction is a store located in a predicated block, it will be
4863   // scalarized.
4864   if (isScalarWithPredication(I))
4865     return false;
4866 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
4869   auto &DL = I->getModule()->getDataLayout();
4870   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4871   if (hasIrregularType(ScalarTy, DL, VF))
4872     return false;
4873 
4874   return true;
4875 }
4876 
4877 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4878   // We should not collect Uniforms more than once per VF. Right now,
4879   // this function is called from collectUniformsAndScalars(), which
4880   // already does this check. Collecting Uniforms for VF=1 does not make any
4881   // sense.
4882 
4883   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4884          "This function should not be visited twice for the same VF");
4885 
  // Create the entry for this VF up front. Even if we find no uniform values,
  // the presence of the entry means we won't analyze this VF again;
  // Uniforms.count(VF) will return 1.
4888   Uniforms[VF].clear();
4889 
4890   // We now know that the loop is vectorizable!
4891   // Collect instructions inside the loop that will remain uniform after
4892   // vectorization.
4893 
4894   // Global values, params and instructions outside of current loop are out of
4895   // scope.
4896   auto isOutOfScope = [&](Value *V) -> bool {
4897     Instruction *I = dyn_cast<Instruction>(V);
4898     return (!I || !TheLoop->contains(I));
4899   };
4900 
4901   SetVector<Instruction *> Worklist;
4902   BasicBlock *Latch = TheLoop->getLoopLatch();
4903 
4904   // Instructions that are scalar with predication must not be considered
4905   // uniform after vectorization, because that would create an erroneous
4906   // replicating region where only a single instance out of VF should be formed.
4907   // TODO: optimize such seldom cases if found important, see PR40816.
4908   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4909     if (isScalarWithPredication(I, VF)) {
4910       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4911                         << *I << "\n");
4912       return;
4913     }
4914     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4915     Worklist.insert(I);
4916   };
4917 
4918   // Start with the conditional branch. If the branch condition is an
4919   // instruction contained in the loop that is only used by the branch, it is
4920   // uniform.
4921   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4922   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4923     addToWorklistIfAllowed(Cmp);
4924 
4925   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4926   // are pointers that are treated like consecutive pointers during
4927   // vectorization. The pointer operands of interleaved accesses are an
4928   // example.
4929   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4930 
4931   // Holds pointer operands of instructions that are possibly non-uniform.
4932   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4933 
4934   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4935     InstWidening WideningDecision = getWideningDecision(I, VF);
4936     assert(WideningDecision != CM_Unknown &&
4937            "Widening decision should be ready at this moment");
4938 
4939     return (WideningDecision == CM_Widen ||
4940             WideningDecision == CM_Widen_Reverse ||
4941             WideningDecision == CM_Interleave);
4942   };
4943   // Iterate over the instructions in the loop, and collect all
4944   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4945   // that a consecutive-like pointer operand will be scalarized, we collect it
4946   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4947   // getelementptr instruction can be used by both vectorized and scalarized
4948   // memory instructions. For example, if a loop loads and stores from the same
4949   // location, but the store is conditional, the store will be scalarized, and
4950   // the getelementptr won't remain uniform.
4951   for (auto *BB : TheLoop->blocks())
4952     for (auto &I : *BB) {
4953       // If there's no pointer operand, there's nothing to do.
4954       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4955       if (!Ptr)
4956         continue;
4957 
4958       // True if all users of Ptr are memory accesses that have Ptr as their
4959       // pointer operand.
4960       auto UsersAreMemAccesses =
4961           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4962             return getLoadStorePointerOperand(U) == Ptr;
4963           });
4964 
4965       // Ensure the memory instruction will not be scalarized or used by
4966       // gather/scatter, making its pointer operand non-uniform. If the pointer
4967       // operand is used by any instruction other than a memory access, we
4968       // conservatively assume the pointer operand may be non-uniform.
4969       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4970         PossibleNonUniformPtrs.insert(Ptr);
4971 
4972       // If the memory instruction will be vectorized and its pointer operand
4973       // is consecutive-like, or interleaving - the pointer operand should
4974       // remain uniform.
4975       else
4976         ConsecutiveLikePtrs.insert(Ptr);
4977     }
4978 
4979   // Add to the Worklist all consecutive and consecutive-like pointers that
4980   // aren't also identified as possibly non-uniform.
4981   for (auto *V : ConsecutiveLikePtrs)
4982     if (!PossibleNonUniformPtrs.count(V))
4983       addToWorklistIfAllowed(V);
4984 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
4988   unsigned idx = 0;
4989   while (idx != Worklist.size()) {
4990     Instruction *I = Worklist[idx++];
4991 
4992     for (auto OV : I->operand_values()) {
4993       // isOutOfScope operands cannot be uniform instructions.
4994       if (isOutOfScope(OV))
4995         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4998       auto *OP = dyn_cast<PHINode>(OV);
4999       if (OP && Legal->isFirstOrderRecurrence(OP))
5000         continue;
5001       // If all the users of the operand are uniform, then add the
5002       // operand into the uniform worklist.
5003       auto *OI = cast<Instruction>(OV);
5004       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5005             auto *J = cast<Instruction>(U);
5006             return Worklist.count(J) ||
5007                    (OI == getLoadStorePointerOperand(J) &&
5008                     isUniformDecision(J, VF));
5009           }))
5010         addToWorklistIfAllowed(OI);
5011     }
5012   }
5013 
5014   // Returns true if Ptr is the pointer operand of a memory access instruction
5015   // I, and I is known to not require scalarization.
5016   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5017     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5018   };
5019 
5020   // For an instruction to be added into Worklist above, all its users inside
5021   // the loop should also be in Worklist. However, this condition cannot be
5022   // true for phi nodes that form a cyclic dependence. We must process phi
5023   // nodes separately. An induction variable will remain uniform if all users
5024   // of the induction variable and induction variable update remain uniform.
5025   // The code below handles both pointer and non-pointer induction variables.
5026   for (auto &Induction : Legal->getInductionVars()) {
5027     auto *Ind = Induction.first;
5028     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5029 
5030     // Determine if all users of the induction variable are uniform after
5031     // vectorization.
5032     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5033       auto *I = cast<Instruction>(U);
5034       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5035              isVectorizedMemAccessUse(I, Ind);
5036     });
5037     if (!UniformInd)
5038       continue;
5039 
5040     // Determine if all users of the induction variable update instruction are
5041     // uniform after vectorization.
5042     auto UniformIndUpdate =
5043         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5044           auto *I = cast<Instruction>(U);
5045           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5046                  isVectorizedMemAccessUse(I, IndUpdate);
5047         });
5048     if (!UniformIndUpdate)
5049       continue;
5050 
5051     // The induction variable and its update instruction will remain uniform.
5052     addToWorklistIfAllowed(Ind);
5053     addToWorklistIfAllowed(IndUpdate);
5054   }
5055 
5056   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5057 }
5058 
5059 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5060   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5061 
5062   if (Legal->getRuntimePointerChecking()->Need) {
5063     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5064         "runtime pointer checks needed. Enable vectorization of this "
5065         "loop with '#pragma clang loop vectorize(enable)' when "
5066         "compiling with -Os/-Oz",
5067         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5068     return true;
5069   }
5070 
5071   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5072     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5073         "runtime SCEV checks needed. Enable vectorization of this "
5074         "loop with '#pragma clang loop vectorize(enable)' when "
5075         "compiling with -Os/-Oz",
5076         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5077     return true;
5078   }
5079 
5080   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5081   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5082     reportVectorizationFailure("Runtime stride check for small trip count",
5083         "runtime stride == 1 checks needed. Enable vectorization of "
5084         "this loop without such check by compiling with -Os/-Oz",
5085         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5086     return true;
5087   }
5088 
5089   return false;
5090 }
5091 
5092 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
5093                                                             unsigned UserIC) {
5094   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
5097     reportVectorizationFailure(
5098         "Not inserting runtime ptr check for divergent target",
5099         "runtime pointer checks needed. Not enabled for divergent target",
5100         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5101     return None;
5102   }
5103 
5104   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5105   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5106   if (TC == 1) {
5107     reportVectorizationFailure("Single iteration (non) loop",
5108         "loop trip count is one, irrelevant for vectorization",
5109         "SingleIterationLoop", ORE, TheLoop);
5110     return None;
5111   }
5112 
5113   switch (ScalarEpilogueStatus) {
5114   case CM_ScalarEpilogueAllowed:
5115     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5116   case CM_ScalarEpilogueNotNeededUsePredicate:
5117     LLVM_DEBUG(
5118         dbgs() << "LV: vector predicate hint/switch found.\n"
5119                << "LV: Not allowing scalar epilogue, creating predicated "
5120                << "vector loop.\n");
5121     break;
5122   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5123     // fallthrough as a special case of OptForSize
5124   case CM_ScalarEpilogueNotAllowedOptSize:
5125     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5126       LLVM_DEBUG(
5127           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5128     else
5129       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5130                         << "count.\n");
5131 
5132     // Bail if runtime checks are required, which are not good when optimising
5133     // for size.
5134     if (runtimeChecksRequired())
5135       return None;
5136     break;
5137   }
5138 
  // Now try to fold the tail by masking.
5140 
5141   // Invalidate interleave groups that require an epilogue if we can't mask
5142   // the interleave-group.
5143   if (!useMaskedInterleavedAccesses(TTI)) {
5144     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5145            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5148     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5149   }
5150 
5151   unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5152   assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5153   unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
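  // For illustration (hypothetical numbers): with TC = 64, MaxVF = 8 and a
  // user interleave count of 2, 64 % 16 == 0, so no tail remains and MaxVF is
  // returned directly below.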
5154   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5155     // Accept MaxVF if we do not have a tail.
5156     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5157     return MaxVF;
5158   }
5159 
5160   // If we don't know the precise trip count, or if the trip count that we
5161   // found modulo the vectorization factor is not zero, try to fold the tail
5162   // by masking.
5163   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
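  // For illustration (hypothetical numbers): with TC = 100 and MaxVF = 8, the
  // remaining 4 iterations would be executed under a mask in the final vector
  // iteration instead of in a scalar epilogue.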
5164   if (Legal->prepareToFoldTailByMasking()) {
5165     FoldTailByMasking = true;
5166     return MaxVF;
5167   }
5168 
5169   if (TC == 0) {
5170     reportVectorizationFailure(
5171         "Unable to calculate the loop count due to complex control flow",
5172         "unable to calculate the loop count due to complex control flow",
5173         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5174     return None;
5175   }
5176 
5177   reportVectorizationFailure(
5178       "Cannot optimize for size and vectorize at the same time.",
5179       "cannot optimize for size and vectorize at the same time. "
5180       "Enable vectorization of this loop with '#pragma clang loop "
5181       "vectorize(enable)' when compiling with -Os/-Oz",
5182       "NoTailLoopWithOptForSize", ORE, TheLoop);
5183   return None;
5184 }
5185 
5186 unsigned
5187 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5188   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5189   unsigned SmallestType, WidestType;
5190   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5191   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5192 
5193   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
5197   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5198 
5199   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5200 
5201   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that neither WidestRegister nor WidestType need be a power of 2.
5203   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
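  // For example, with a 256-bit widest register and a widest type of 32 bits,
  // MaxVectorSize is PowerOf2Floor(256 / 32) = 8.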
5204 
5205   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5206                     << " / " << WidestType << " bits.\n");
5207   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5208                     << WidestRegister << " bits.\n");
5209 
5210   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5211                                  " into one vector!");
5212   if (MaxVectorSize == 0) {
5213     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5214     MaxVectorSize = 1;
5215     return MaxVectorSize;
5216   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5217              isPowerOf2_32(ConstTripCount)) {
5218     // We need to clamp the VF to be the ConstTripCount. There is no point in
5219     // choosing a higher viable VF as done in the loop below.
5220     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5221                       << ConstTripCount << "\n");
5222     MaxVectorSize = ConstTripCount;
5223     return MaxVectorSize;
5224   }
5225 
5226   unsigned MaxVF = MaxVectorSize;
5227   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5228       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5229     // Collect all viable vectorization factors larger than the default MaxVF
5230     // (i.e. MaxVectorSize).
5231     SmallVector<unsigned, 8> VFs;
5232     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5233     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5234       VFs.push_back(VS);
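    // For illustration (hypothetical target): with a 256-bit register, an i8
    // smallest type and a default MaxVectorSize of 8, the candidate VFs
    // collected here would be 16 and 32.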
5235 
5236     // For each VF calculate its register usage.
5237     auto RUs = calculateRegisterUsage(VFs);
5238 
5239     // Select the largest VF which doesn't require more registers than existing
5240     // ones.
5241     for (int i = RUs.size() - 1; i >= 0; --i) {
5242       bool Selected = true;
5243       for (auto& pair : RUs[i].MaxLocalUsers) {
5244         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5245         if (pair.second > TargetNumRegisters)
5246           Selected = false;
5247       }
5248       if (Selected) {
5249         MaxVF = VFs[i];
5250         break;
5251       }
5252     }
5253     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5254       if (MaxVF < MinVF) {
5255         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5256                           << ") with target's minimum: " << MinVF << '\n');
5257         MaxVF = MinVF;
5258       }
5259     }
5260   }
5261   return MaxVF;
5262 }
5263 
5264 VectorizationFactor
5265 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5266   float Cost = expectedCost(1).first;
5267   const float ScalarCost = Cost;
5268   unsigned Width = 1;
5269   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5270 
5271   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5272   if (ForceVectorization && MaxVF > 1) {
5273     // Ignore scalar width, because the user explicitly wants vectorization.
5274     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5275     // evaluation.
5276     Cost = std::numeric_limits<float>::max();
5277   }
5278 
5279   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
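    // For illustration (hypothetical costs): if the scalar loop costs 10 and
    // the VF = 4 body costs 24, the per-lane cost is 24 / 4 = 6, which beats
    // the scalar cost of 10, so VF = 4 is preferred unless another VF is
    // cheaper still.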
5283     VectorizationCostTy C = expectedCost(i);
5284     float VectorCost = C.first / (float)i;
5285     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5286                       << " costs: " << (int)VectorCost << ".\n");
5287     if (!C.second && !ForceVectorization) {
5288       LLVM_DEBUG(
5289           dbgs() << "LV: Not considering vector loop of width " << i
5290                  << " because it will not generate any vector instructions.\n");
5291       continue;
5292     }
5293     if (VectorCost < Cost) {
5294       Cost = VectorCost;
5295       Width = i;
5296     }
5297   }
5298 
5299   if (!EnableCondStoresVectorization && NumPredStores) {
5300     reportVectorizationFailure("There are conditional stores.",
5301         "store that is conditionally executed prevents vectorization",
5302         "ConditionalStore", ORE, TheLoop);
5303     Width = 1;
5304     Cost = ScalarCost;
5305   }
5306 
5307   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5308              << "LV: Vectorization seems to be not beneficial, "
5309              << "but was forced by a user.\n");
5310   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5311   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5312   return Factor;
5313 }
5314 
5315 std::pair<unsigned, unsigned>
5316 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5317   unsigned MinWidth = -1U;
5318   unsigned MaxWidth = 8;
5319   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5320 
5321   // For each block.
5322   for (BasicBlock *BB : TheLoop->blocks()) {
5323     // For each instruction in the loop.
5324     for (Instruction &I : BB->instructionsWithoutDebug()) {
5325       Type *T = I.getType();
5326 
5327       // Skip ignored values.
5328       if (ValuesToIgnore.count(&I))
5329         continue;
5330 
5331       // Only examine Loads, Stores and PHINodes.
5332       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5333         continue;
5334 
5335       // Examine PHI nodes that are reduction variables. Update the type to
5336       // account for the recurrence type.
5337       if (auto *PN = dyn_cast<PHINode>(&I)) {
5338         if (!Legal->isReductionVariable(PN))
5339           continue;
5340         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5341         T = RdxDesc.getRecurrenceType();
5342       }
5343 
5344       // Examine the stored values.
5345       if (auto *ST = dyn_cast<StoreInst>(&I))
5346         T = ST->getValueOperand()->getType();
5347 
5348       // Ignore loaded pointer types and stored pointer types that are not
5349       // vectorizable.
5350       //
5351       // FIXME: The check here attempts to predict whether a load or store will
5352       //        be vectorized. We only know this for certain after a VF has
5353       //        been selected. Here, we assume that if an access can be
5354       //        vectorized, it will be. We should also look at extending this
5355       //        optimization to non-pointer types.
5356       //
5357       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5358           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5359         continue;
5360 
5361       MinWidth = std::min(MinWidth,
5362                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5363       MaxWidth = std::max(MaxWidth,
5364                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5365     }
5366   }
5367 
5368   return {MinWidth, MaxWidth};
5369 }
5370 
5371 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5372                                                            unsigned LoopCost) {
5373   // -- The interleave heuristics --
5374   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5375   // There are many micro-architectural considerations that we can't predict
5376   // at this level. For example, frontend pressure (on decode or fetch) due to
5377   // code size, or the number and capabilities of the execution ports.
5378   //
5379   // We use the following heuristics to select the interleave count:
5380   // 1. If the code has reductions, then we interleave to break the cross
5381   // iteration dependency.
5382   // 2. If the loop is really small, then we interleave to reduce the loop
5383   // overhead.
5384   // 3. We don't interleave if we think that we will spill registers to memory
5385   // due to the increased register pressure.
5386 
5387   if (!isScalarEpilogueAllowed())
5388     return 1;
5389 
  // The maximum safe dependence distance was already used to limit the VF;
  // do not interleave in that case.
5391   if (Legal->getMaxSafeDepDistBytes() != -1U)
5392     return 1;
5393 
5394   // Do not interleave loops with a relatively small known or estimated trip
5395   // count.
5396   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5397   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5398     return 1;
5399 
5400   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // These values are used as divisors below, so make sure each is at least 1
  // (i.e. assume at least one instruction uses at least one register).
5403   for (auto& pair : R.MaxLocalUsers) {
5404     pair.second = std::max(pair.second, 1U);
5405   }
5406 
5407   // We calculate the interleave count using the following formula.
5408   // Subtract the number of loop invariants from the number of available
5409   // registers. These registers are used by all of the interleaved instances.
5410   // Next, divide the remaining registers by the number of registers that is
5411   // required by the loop, in order to estimate how many parallel instances
5412   // fit without causing spills. All of this is rounded down if necessary to be
5413   // a power of two. We want power of two interleave count to simplify any
5414   // addressing operations or alignment considerations.
5415   // We also want power of two interleave counts to ensure that the induction
5416   // variable of the vector loop wraps to zero, when tail is folded by masking;
5417   // this currently happens when OptForSize, in which case IC is set to 1 above.
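  // For illustration (hypothetical numbers): with 16 registers in a class,
  // 2 of them holding loop invariants and a maximum local usage of 6, the
  // base estimate is PowerOf2Floor((16 - 2) / 6) = 2 interleaved instances.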
5418   unsigned IC = UINT_MAX;
5419 
5420   for (auto& pair : R.MaxLocalUsers) {
5421     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5422     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5423                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5425     if (VF == 1) {
5426       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5427         TargetNumRegisters = ForceTargetNumScalarRegs;
5428     } else {
5429       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5430         TargetNumRegisters = ForceTargetNumVectorRegs;
5431     }
5432     unsigned MaxLocalUsers = pair.second;
5433     unsigned LoopInvariantRegs = 0;
5434     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5435       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5436 
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
5438     // Don't count the induction variable as interleaved.
5439     if (EnableIndVarRegisterHeur) {
5440       TmpIC =
5441           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5442                         std::max(1U, (MaxLocalUsers - 1)));
5443     }
5444 
5445     IC = std::min(IC, TmpIC);
5446   }
5447 
5448   // Clamp the interleave ranges to reasonable counts.
5449   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5450 
5451   // Check if the user has overridden the max.
5452   if (VF == 1) {
5453     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5454       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5455   } else {
5456     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5457       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5458   }
5459 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count so it does not exceed the trip count divided by VF.
5462   if (BestKnownTC) {
5463     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5464   }
5465 
5466   // If we did not calculate the cost for VF (because the user selected the VF)
5467   // then we calculate the cost of VF here.
5468   if (LoopCost == 0)
5469     LoopCost = expectedCost(VF).first;
5470 
5471   assert(LoopCost && "Non-zero loop cost expected");
5472 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5475   if (IC > MaxInterleaveCount)
5476     IC = MaxInterleaveCount;
5477   else if (IC < 1)
5478     IC = 1;
5479 
5480   // Interleave if we vectorized this loop and there is a reduction that could
5481   // benefit from interleaving.
5482   if (VF > 1 && !Legal->getReductionVars().empty()) {
5483     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5484     return IC;
5485   }
5486 
5487   // Note that if we've already vectorized the loop we will have done the
5488   // runtime check and so interleaving won't require further checks.
5489   bool InterleavingRequiresRuntimePointerCheck =
5490       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5491 
5492   // We want to interleave small loops in order to reduce the loop overhead and
5493   // potentially expose ILP opportunities.
5494   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5495   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5496     // We assume that the cost overhead is 1 and we use the cost model
5497     // to estimate the cost of the loop and interleave until the cost of the
5498     // loop overhead is about 5% of the cost of the loop.
5499     unsigned SmallIC =
5500         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5501 
5502     // Interleave until store/load ports (estimated by max interleave count) are
5503     // saturated.
5504     unsigned NumStores = Legal->getNumStores();
5505     unsigned NumLoads = Legal->getNumLoads();
5506     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5507     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
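    // For illustration (hypothetical numbers): with IC = 8, 2 stores and 4
    // loads, StoresIC = 4 and LoadsIC = 2, so interleaving by 4 would be
    // enough to saturate the store ports under this rough model.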
5508 
5509     // If we have a scalar reduction (vector reductions are already dealt with
5510     // by this point), we can increase the critical path length if the loop
5511     // we're interleaving is inside another loop. Limit, by default to 2, so the
5512     // critical path only gets increased by one reduction operation.
5513     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5514       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5515       SmallIC = std::min(SmallIC, F);
5516       StoresIC = std::min(StoresIC, F);
5517       LoadsIC = std::min(LoadsIC, F);
5518     }
5519 
5520     if (EnableLoadStoreRuntimeInterleave &&
5521         std::max(StoresIC, LoadsIC) > SmallIC) {
5522       LLVM_DEBUG(
5523           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5524       return std::max(StoresIC, LoadsIC);
5525     }
5526 
5527     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5528     return SmallIC;
5529   }
5530 
5531   // Interleave if this is a large loop (small loops are already dealt with by
5532   // this point) that could benefit from interleaving.
5533   bool HasReductions = !Legal->getReductionVars().empty();
5534   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5535     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5536     return IC;
5537   }
5538 
5539   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5540   return 1;
5541 }
5542 
5543 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5544 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5545   // This function calculates the register usage by measuring the highest number
5546   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5548   // assign a number to each instruction. We use RPO to ensure that defs are
5549   // met before their users. We assume that each instruction that has in-loop
5550   // users starts an interval. We record every time that an in-loop value is
5551   // used, so we have a list of the first and last occurrences of each
5552   // instruction. Next, we transpose this data structure into a multi map that
5553   // holds the list of intervals that *end* at a specific location. This multi
5554   // map allows us to perform a linear search. We scan the instructions linearly
5555   // and record each time that a new interval starts, by placing it in a set.
5556   // If we find this value in the multi-map then we remove it from the set.
5557   // The max register usage is the maximum size of the set.
5558   // We also search for instructions that are defined outside the loop, but are
5559   // used inside the loop. We need this number separately from the max-interval
5560   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
5562   LoopBlocksDFS DFS(TheLoop);
5563   DFS.perform(LI);
5564 
5565   RegisterUsage RU;
5566 
5567   // Each 'key' in the map opens a new interval. The values
5568   // of the map are the index of the 'last seen' usage of the
5569   // instruction that is the key.
5570   using IntervalMap = DenseMap<Instruction *, unsigned>;
5571 
5572   // Maps instruction to its index.
5573   SmallVector<Instruction *, 64> IdxToInstr;
5574   // Marks the end of each interval.
5575   IntervalMap EndPoint;
  // Saves the set of instructions that are used within the loop.
5577   SmallPtrSet<Instruction *, 8> Ends;
5578   // Saves the list of values that are used in the loop but are
5579   // defined outside the loop, such as arguments and constants.
5580   SmallPtrSet<Value *, 8> LoopInvariants;
5581 
5582   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5583     for (Instruction &I : BB->instructionsWithoutDebug()) {
5584       IdxToInstr.push_back(&I);
5585 
5586       // Save the end location of each USE.
5587       for (Value *U : I.operands()) {
5588         auto *Instr = dyn_cast<Instruction>(U);
5589 
5590         // Ignore non-instruction values such as arguments, constants, etc.
5591         if (!Instr)
5592           continue;
5593 
5594         // If this instruction is outside the loop then record it and continue.
5595         if (!TheLoop->contains(Instr)) {
5596           LoopInvariants.insert(Instr);
5597           continue;
5598         }
5599 
5600         // Overwrite previous end points.
5601         EndPoint[Instr] = IdxToInstr.size();
5602         Ends.insert(Instr);
5603       }
5604     }
5605   }
5606 
5607   // Saves the list of intervals that end with the index in 'key'.
5608   using InstrList = SmallVector<Instruction *, 2>;
5609   DenseMap<unsigned, InstrList> TransposeEnds;
5610 
5611   // Transpose the EndPoints to a list of values that end at each index.
5612   for (auto &Interval : EndPoint)
5613     TransposeEnds[Interval.second].push_back(Interval.first);
5614 
5615   SmallPtrSet<Instruction *, 8> OpenIntervals;
5616 
5617   // Get the size of the widest register.
5618   unsigned MaxSafeDepDist = -1U;
5619   if (Legal->getMaxSafeDepDistBytes() != -1U)
5620     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5621   unsigned WidestRegister =
5622       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5623   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5624 
5625   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5626   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5627 
5628   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5629 
5630   // A lambda that gets the register usage for the given type and VF.
5631   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5632     if (Ty->isTokenTy())
5633       return 0U;
5634     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5635     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5636   };
5637 
5638   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5639     Instruction *I = IdxToInstr[i];
5640 
5641     // Remove all of the instructions that end at this location.
5642     InstrList &List = TransposeEnds[i];
5643     for (Instruction *ToRemove : List)
5644       OpenIntervals.erase(ToRemove);
5645 
5646     // Ignore instructions that are never used within the loop.
5647     if (!Ends.count(I))
5648       continue;
5649 
5650     // Skip ignored values.
5651     if (ValuesToIgnore.count(I))
5652       continue;
5653 
5654     // For each VF find the maximum usage of registers.
5655     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5656       // Count the number of live intervals.
5657       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5658 
5659       if (VFs[j] == 1) {
5660         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
5662           if (RegUsage.find(ClassID) == RegUsage.end())
5663             RegUsage[ClassID] = 1;
5664           else
5665             RegUsage[ClassID] += 1;
5666         }
5667       } else {
5668         collectUniformsAndScalars(VFs[j]);
5669         for (auto Inst : OpenIntervals) {
5670           // Skip ignored values for VF > 1.
5671           if (VecValuesToIgnore.count(Inst))
5672             continue;
5673           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
5675             if (RegUsage.find(ClassID) == RegUsage.end())
5676               RegUsage[ClassID] = 1;
5677             else
5678               RegUsage[ClassID] += 1;
5679           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
5681             if (RegUsage.find(ClassID) == RegUsage.end())
5682               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5683             else
5684               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5685           }
5686         }
5687       }
5688 
5689       for (auto& pair : RegUsage) {
5690         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
5692         else
5693           MaxUsages[j][pair.first] = pair.second;
5694       }
5695     }
5696 
5697     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5698                       << OpenIntervals.size() << '\n');
5699 
5700     // Add the current instruction to the list of open intervals.
5701     OpenIntervals.insert(I);
5702   }
5703 
5704   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5705     SmallMapVector<unsigned, unsigned, 4> Invariant;
5706 
5707     for (auto Inst : LoopInvariants) {
5708       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5710       if (Invariant.find(ClassID) == Invariant.end())
5711         Invariant[ClassID] = Usage;
5712       else
5713         Invariant[ClassID] += Usage;
5714     }
5715 
5716     LLVM_DEBUG({
5717       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5718       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5719              << " item\n";
5720       for (const auto &pair : MaxUsages[i]) {
5721         dbgs() << "LV(REG): RegisterClass: "
5722                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5723                << " registers\n";
5724       }
5725       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5726              << " item\n";
5727       for (const auto &pair : Invariant) {
5728         dbgs() << "LV(REG): RegisterClass: "
5729                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5730                << " registers\n";
5731       }
5732     });
5733 
5734     RU.LoopInvariantRegs = Invariant;
5735     RU.MaxLocalUsers = MaxUsages[i];
5736     RUs[i] = RU;
5737   }
5738 
5739   return RUs;
5740 }
5741 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5743   // TODO: Cost model for emulated masked load/store is completely
5744   // broken. This hack guides the cost model to use an artificially
5745   // high enough value to practically disable vectorization with such
5746   // operations, except where previously deployed legality hack allowed
5747   // using very low cost values. This is to avoid regressions coming simply
5748   // from moving "masked load/store" check from legality to cost model.
5749   // Masked Load/Gather emulation was previously never allowed.
5750   // Limited number of Masked Store/Scatter emulation was allowed.
5751   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5752   return isa<LoadInst>(I) ||
5753          (isa<StoreInst>(I) &&
5754           NumPredStores > NumberOfStoresToPredicate);
5755 }
5756 
5757 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5758   // If we aren't vectorizing the loop, or if we've already collected the
5759   // instructions to scalarize, there's nothing to do. Collection may already
5760   // have occurred if we have a user-selected VF and are now computing the
5761   // expected cost for interleaving.
5762   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5763     return;
5764 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5766   // not profitable to scalarize any instructions, the presence of VF in the
5767   // map will indicate that we've analyzed it already.
5768   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5769 
5770   // Find all the instructions that are scalar with predication in the loop and
5771   // determine if it would be better to not if-convert the blocks they are in.
5772   // If so, we also record the instructions to scalarize.
5773   for (BasicBlock *BB : TheLoop->blocks()) {
5774     if (!blockNeedsPredication(BB))
5775       continue;
5776     for (Instruction &I : *BB)
5777       if (isScalarWithPredication(&I)) {
5778         ScalarCostsTy ScalarCosts;
5779         // Do not apply discount logic if hacked cost is needed
5780         // for emulated masked memrefs.
5781         if (!useEmulatedMaskMemRefHack(&I) &&
5782             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5783           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5784         // Remember that BB will remain after vectorization.
5785         PredicatedBBsAfterVectorization.insert(BB);
5786       }
5787   }
5788 }
5789 
5790 int LoopVectorizationCostModel::computePredInstDiscount(
5791     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5792     unsigned VF) {
5793   assert(!isUniformAfterVectorization(PredInst, VF) &&
5794          "Instruction marked uniform-after-vectorization will be predicated");
5795 
5796   // Initialize the discount to zero, meaning that the scalar version and the
5797   // vector version cost the same.
5798   int Discount = 0;
5799 
5800   // Holds instructions to analyze. The instructions we visit are mapped in
5801   // ScalarCosts. Those instructions are the ones that would be scalarized if
5802   // we find that the scalar version costs less.
5803   SmallVector<Instruction *, 8> Worklist;
5804 
5805   // Returns true if the given instruction can be scalarized.
5806   auto canBeScalarized = [&](Instruction *I) -> bool {
5807     // We only attempt to scalarize instructions forming a single-use chain
5808     // from the original predicated block that would otherwise be vectorized.
5809     // Although not strictly necessary, we give up on instructions we know will
5810     // already be scalar to avoid traversing chains that are unlikely to be
5811     // beneficial.
5812     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5813         isScalarAfterVectorization(I, VF))
5814       return false;
5815 
5816     // If the instruction is scalar with predication, it will be analyzed
5817     // separately. We ignore it within the context of PredInst.
5818     if (isScalarWithPredication(I))
5819       return false;
5820 
5821     // If any of the instruction's operands are uniform after vectorization,
5822     // the instruction cannot be scalarized. This prevents, for example, a
5823     // masked load from being scalarized.
5824     //
5825     // We assume we will only emit a value for lane zero of an instruction
5826     // marked uniform after vectorization, rather than VF identical values.
5827     // Thus, if we scalarize an instruction that uses a uniform, we would
5828     // create uses of values corresponding to the lanes we aren't emitting code
5829     // for. This behavior can be changed by allowing getScalarValue to clone
5830     // the lane zero values for uniforms rather than asserting.
5831     for (Use &U : I->operands())
5832       if (auto *J = dyn_cast<Instruction>(U.get()))
5833         if (isUniformAfterVectorization(J, VF))
5834           return false;
5835 
5836     // Otherwise, we can scalarize the instruction.
5837     return true;
5838   };
5839 
5840   // Compute the expected cost discount from scalarizing the entire expression
5841   // feeding the predicated instruction. We currently only consider expressions
5842   // that are single-use instruction chains.
5843   Worklist.push_back(PredInst);
5844   while (!Worklist.empty()) {
5845     Instruction *I = Worklist.pop_back_val();
5846 
5847     // If we've already analyzed the instruction, there's nothing to do.
5848     if (ScalarCosts.find(I) != ScalarCosts.end())
5849       continue;
5850 
5851     // Compute the cost of the vector instruction. Note that this cost already
5852     // includes the scalarization overhead of the predicated instruction.
5853     unsigned VectorCost = getInstructionCost(I, VF).first;
5854 
5855     // Compute the cost of the scalarized instruction. This cost is the cost of
5856     // the instruction as if it wasn't if-converted and instead remained in the
5857     // predicated block. We will scale this cost by block probability after
5858     // computing the scalarization overhead.
5859     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5860 
5861     // Compute the scalarization overhead of needed insertelement instructions
5862     // and phi nodes.
5863     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5864       ScalarCost += TTI.getScalarizationOverhead(
5865           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5866           APInt::getAllOnesValue(VF), true, false);
5867       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI,
5868                                             TTI::TCK_RecipThroughput);
5869     }
5870 
5871     // Compute the scalarization overhead of needed extractelement
5872     // instructions. For each of the instruction's operands, if the operand can
5873     // be scalarized, add it to the worklist; otherwise, account for the
5874     // overhead.
5875     for (Use &U : I->operands())
5876       if (auto *J = dyn_cast<Instruction>(U.get())) {
5877         assert(VectorType::isValidElementType(J->getType()) &&
5878                "Instruction has non-scalar type");
5879         if (canBeScalarized(J))
5880           Worklist.push_back(J);
5881         else if (needsExtract(J, VF))
5882           ScalarCost += TTI.getScalarizationOverhead(
5883               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5884               APInt::getAllOnesValue(VF), false, true);
5885       }
5886 
5887     // Scale the total scalar cost by block probability.
5888     ScalarCost /= getReciprocalPredBlockProb();
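    // For illustration: assuming the usual 50% block-probability estimate
    // (a reciprocal of 2), a raw scalar cost of 8 for the chain is scaled
    // down to 4 before being compared against the vector cost.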
5889 
5890     // Compute the discount. A non-negative discount means the vector version
5891     // of the instruction costs more, and scalarizing would be beneficial.
5892     Discount += VectorCost - ScalarCost;
5893     ScalarCosts[I] = ScalarCost;
5894   }
5895 
5896   return Discount;
5897 }
5898 
5899 LoopVectorizationCostModel::VectorizationCostTy
5900 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5901   VectorizationCostTy Cost;
5902 
5903   // For each block.
5904   for (BasicBlock *BB : TheLoop->blocks()) {
5905     VectorizationCostTy BlockCost;
5906 
5907     // For each instruction in the old loop.
5908     for (Instruction &I : BB->instructionsWithoutDebug()) {
5909       // Skip ignored values.
5910       if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I)))
5911         continue;
5912 
5913       VectorizationCostTy C = getInstructionCost(&I, VF);
5914 
5915       // Check if we should override the cost.
5916       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5917         C.first = ForceTargetInstructionCost;
5918 
5919       BlockCost.first += C.first;
5920       BlockCost.second |= C.second;
5921       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5922                         << " for VF " << VF << " For instruction: " << I
5923                         << '\n');
5924     }
5925 
5926     // If we are vectorizing a predicated block, it will have been
5927     // if-converted. This means that the block's instructions (aside from
5928     // stores and instructions that may divide by zero) will now be
5929     // unconditionally executed. For the scalar case, we may not always execute
5930     // the predicated block. Thus, scale the block's cost by the probability of
5931     // executing it.
5932     if (VF == 1 && blockNeedsPredication(BB))
5933       BlockCost.first /= getReciprocalPredBlockProb();
5934 
5935     Cost.first += BlockCost.first;
5936     Cost.second |= BlockCost.second;
5937   }
5938 
5939   return Cost;
5940 }
5941 
5942 /// Gets Address Access SCEV after verifying that the access pattern
5943 /// is loop invariant except the induction variable dependence.
5944 ///
5945 /// This SCEV can be sent to the Target in order to estimate the address
5946 /// calculation cost.
5947 static const SCEV *getAddressAccessSCEV(
5948               Value *Ptr,
5949               LoopVectorizationLegality *Legal,
5950               PredicatedScalarEvolution &PSE,
5951               const Loop *TheLoop) {
5952 
5953   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5954   if (!Gep)
5955     return nullptr;
5956 
5957   // We are looking for a gep with all loop invariant indices except for one
5958   // which should be an induction variable.
5959   auto SE = PSE.getSE();
5960   unsigned NumOperands = Gep->getNumOperands();
5961   for (unsigned i = 1; i < NumOperands; ++i) {
5962     Value *Opd = Gep->getOperand(i);
5963     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5964         !Legal->isInductionVariable(Opd))
5965       return nullptr;
5966   }
5967 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5969   return PSE.getSCEV(Ptr);
5970 }
5971 
5972 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5973   return Legal->hasStride(I->getOperand(0)) ||
5974          Legal->hasStride(I->getOperand(1));
5975 }
5976 
5977 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5978                                                                  unsigned VF) {
5979   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5980   Type *ValTy = getMemInstValueType(I);
5981   auto SE = PSE.getSE();
5982 
5983   unsigned AS = getLoadStoreAddressSpace(I);
5984   Value *Ptr = getLoadStorePointerOperand(I);
5985   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5986 
  // Figure out whether the access is strided and get the stride value
  // if it is known at compile time.
5989   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5990 
5991   // Get the cost of the scalar memory instruction and address computation.
5992   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5993 
5994   // Don't pass *I here, since it is scalar but will actually be part of a
5995   // vectorized loop where the user of it is a vectorized instruction.
5996   const Align Alignment = getLoadStoreAlignment(I);
5997   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5998                                    Alignment, AS,
5999                                    TTI::TCK_RecipThroughput);
6000 
6001   // Get the overhead of the extractelement and insertelement instructions
6002   // we might create due to scalarization.
6003   Cost += getScalarizationOverhead(I, VF);
6004 
6005   // If we have a predicated store, it may not be executed for each vector
6006   // lane. Scale the cost by the probability of executing the predicated
6007   // block.
6008   if (isPredicatedInst(I)) {
6009     Cost /= getReciprocalPredBlockProb();
6010 
6011     if (useEmulatedMaskMemRefHack(I))
6012       // Artificially setting to a high enough value to practically disable
6013       // vectorization with such operations.
6014       Cost = 3000000;
6015   }
6016 
6017   return Cost;
6018 }
6019 
6020 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6021                                                              unsigned VF) {
6022   Type *ValTy = getMemInstValueType(I);
6023   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6024   Value *Ptr = getLoadStorePointerOperand(I);
6025   unsigned AS = getLoadStoreAddressSpace(I);
6026   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6027   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6028 
6029   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6030          "Stride should be 1 or -1 for consecutive memory access");
6031   const Align Alignment = getLoadStoreAlignment(I);
6032   unsigned Cost = 0;
6033   if (Legal->isMaskRequired(I))
6034     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6035                                       CostKind);
6036   else
6037     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6038                                 CostKind, I);
6039 
6040   bool Reverse = ConsecutiveStride < 0;
6041   if (Reverse)
6042     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6043   return Cost;
6044 }
6045 
6046 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6047                                                          unsigned VF) {
6048   Type *ValTy = getMemInstValueType(I);
6049   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6050   const Align Alignment = getLoadStoreAlignment(I);
6051   unsigned AS = getLoadStoreAddressSpace(I);
6052   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
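  // A uniform-address load is modeled as a single scalar load plus a
  // broadcast of the loaded value; a uniform-address store is a single
  // scalar store, preceded by an extract of the last lane when the stored
  // value is not loop-invariant.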
6053   if (isa<LoadInst>(I)) {
6054     return TTI.getAddressComputationCost(ValTy) +
6055            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6056                                CostKind) +
6057            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6058   }
6059   StoreInst *SI = cast<StoreInst>(I);
6060 
6061   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6062   return TTI.getAddressComputationCost(ValTy) +
6063          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6064                              CostKind) +
6065          (isLoopInvariantStoreValue
6066               ? 0
6067               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6068                                        VF - 1));
6069 }
6070 
6071 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6072                                                           unsigned VF) {
6073   Type *ValTy = getMemInstValueType(I);
6074   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6075   const Align Alignment = getLoadStoreAlignment(I);
6076   const Value *Ptr = getLoadStorePointerOperand(I);
6077 
6078   return TTI.getAddressComputationCost(VectorTy) +
6079          TTI.getGatherScatterOpCost(
6080              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6081              TargetTransformInfo::TCK_RecipThroughput, I);
6082 }
6083 
6084 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6085                                                             unsigned VF) {
6086   Type *ValTy = getMemInstValueType(I);
6087   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6088   unsigned AS = getLoadStoreAddressSpace(I);
6089 
6090   auto Group = getInterleavedAccessGroup(I);
6091   assert(Group && "Fail to get an interleaved access group.");
6092 
6093   unsigned InterleaveFactor = Group->getFactor();
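  // The widened vector spans all members of the group: VF elements for each
  // of the InterleaveFactor members.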
6094   auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor);
6095 
6096   // Holds the indices of existing members in an interleaved load group.
6097   // An interleaved store group doesn't need this as it doesn't allow gaps.
6098   SmallVector<unsigned, 4> Indices;
6099   if (isa<LoadInst>(I)) {
6100     for (unsigned i = 0; i < InterleaveFactor; i++)
6101       if (Group->getMember(i))
6102         Indices.push_back(i);
6103   }
6104 
6105   // Calculate the cost of the whole interleaved group.
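  // If the group has gaps that would normally require a scalar epilogue but
  // such an epilogue is not allowed, the gaps must be masked instead.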
6106   bool UseMaskForGaps =
6107       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6108   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6109       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6110       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6111 
6112   if (Group->isReverse()) {
6113     // TODO: Add support for reversed masked interleaved access.
6114     assert(!Legal->isMaskRequired(I) &&
6115            "Reverse masked interleaved access not supported.");
6116     Cost += Group->getNumMembers() *
6117             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6118   }
6119   return Cost;
6120 }
6121 
6122 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6123                                                               unsigned VF) {
  // Calculate the scalar cost only. The vectorization cost should already be
  // available at this point.
6126   if (VF == 1) {
6127     Type *ValTy = getMemInstValueType(I);
6128     const Align Alignment = getLoadStoreAlignment(I);
6129     unsigned AS = getLoadStoreAddressSpace(I);
6130 
6131     return TTI.getAddressComputationCost(ValTy) +
6132            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6133                                TTI::TCK_RecipThroughput, I);
6134   }
6135   return getWideningCost(I, VF);
6136 }
6137 
6138 LoopVectorizationCostModel::VectorizationCostTy
6139 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
6140   // If we know that this instruction will remain uniform, check the cost of
6141   // the scalar version.
6142   if (isUniformAfterVectorization(I, VF))
6143     VF = 1;
6144 
6145   if (VF > 1 && isProfitableToScalarize(I, VF))
6146     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6147 
6148   // Forced scalars do not have any scalarization overhead.
6149   auto ForcedScalar = ForcedScalars.find(VF);
6150   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
6151     auto InstSet = ForcedScalar->second;
6152     if (InstSet.count(I))
6153       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
6154   }
6155 
6156   Type *VectorTy;
6157   unsigned C = getInstructionCost(I, VF, VectorTy);
6158 
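  // The type is considered vectorized (not scalarized) if the target
  // legalizes the widened value into fewer than VF parts.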
6159   bool TypeNotScalarized =
6160       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
6161   return VectorizationCostTy(C, TypeNotScalarized);
6162 }
6163 
6164 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6165                                                               unsigned VF) {
6166 
6167   if (VF == 1)
6168     return 0;
6169 
6170   unsigned Cost = 0;
6171   Type *RetTy = ToVectorTy(I->getType(), VF);
6172   if (!RetTy->isVoidTy() &&
6173       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6174     Cost += TTI.getScalarizationOverhead(
6175         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false);
6176 
6177   // Some targets keep addresses scalar.
6178   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6179     return Cost;
6180 
6181   // Some targets support efficient element stores.
6182   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6183     return Cost;
6184 
6185   // Collect operands to consider.
6186   CallInst *CI = dyn_cast<CallInst>(I);
6187   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6188 
6189   // Skip operands that do not require extraction/scalarization and do not incur
6190   // any overhead.
6191   return Cost + TTI.getOperandsScalarizationOverhead(
6192                     filterExtractingOperands(Ops, VF), VF);
6193 }
6194 
6195 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6196   if (VF == 1)
6197     return;
6198   NumPredStores = 0;
6199   for (BasicBlock *BB : TheLoop->blocks()) {
6200     // For each instruction in the old loop.
6201     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6203       if (!Ptr)
6204         continue;
6205 
6206       // TODO: We should generate better code and update the cost model for
6207       // predicated uniform stores. Today they are treated as any other
6208       // predicated store (see added test cases in
6209       // invariant-store-vectorization.ll).
6210       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6211         NumPredStores++;
6212 
6213       if (Legal->isUniform(Ptr) &&
6214           // Conditional loads and stores should be scalarized and predicated.
6215           // isScalarWithPredication cannot be used here since masked
6216           // gather/scatters are not considered scalar with predication.
6217           !Legal->blockNeedsPredication(I.getParent())) {
6218         // TODO: Avoid replicating loads and stores instead of
6219         // relying on instcombine to remove them.
6220         // Load: Scalar load + broadcast
6221         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6222         unsigned Cost = getUniformMemOpCost(&I, VF);
6223         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6224         continue;
6225       }
6226 
6227       // We assume that widening is the best solution when possible.
6228       if (memoryInstructionCanBeWidened(&I, VF)) {
6229         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6230         int ConsecutiveStride =
6231                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6232         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6233                "Expected consecutive stride.");
6234         InstWidening Decision =
6235             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6236         setWideningDecision(&I, VF, Decision, Cost);
6237         continue;
6238       }
6239 
6240       // Choose between Interleaving, Gather/Scatter or Scalarization.
6241       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6242       unsigned NumAccesses = 1;
6243       if (isAccessInterleaved(&I)) {
6244         auto Group = getInterleavedAccessGroup(&I);
6245         assert(Group && "Fail to get an interleaved access group.");
6246 
6247         // Make one decision for the whole group.
6248         if (getWideningDecision(&I, VF) != CM_Unknown)
6249           continue;
6250 
6251         NumAccesses = Group->getNumMembers();
6252         if (interleavedAccessCanBeWidened(&I, VF))
6253           InterleaveCost = getInterleaveGroupCost(&I, VF);
6254       }
6255 
6256       unsigned GatherScatterCost =
6257           isLegalGatherOrScatter(&I)
6258               ? getGatherScatterCost(&I, VF) * NumAccesses
6259               : std::numeric_limits<unsigned>::max();
6260 
6261       unsigned ScalarizationCost =
6262           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6263 
      // Choose the better solution for the current VF, record this decision,
      // and use it during vectorization.
6266       unsigned Cost;
6267       InstWidening Decision;
6268       if (InterleaveCost <= GatherScatterCost &&
6269           InterleaveCost < ScalarizationCost) {
6270         Decision = CM_Interleave;
6271         Cost = InterleaveCost;
6272       } else if (GatherScatterCost < ScalarizationCost) {
6273         Decision = CM_GatherScatter;
6274         Cost = GatherScatterCost;
6275       } else {
6276         Decision = CM_Scalarize;
6277         Cost = ScalarizationCost;
6278       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The cost is computed for the whole group
      // but will actually be assigned to a single instruction.
6282       if (auto Group = getInterleavedAccessGroup(&I))
6283         setWideningDecision(Group, VF, Decision, Cost);
6284       else
6285         setWideningDecision(&I, VF, Decision, Cost);
6286     }
6287   }
6288 
6289   // Make sure that any load of address and any other address computation
6290   // remains scalar unless there is gather/scatter support. This avoids
6291   // inevitable extracts into address registers, and also has the benefit of
6292   // activating LSR more, since that pass can't optimize vectorized
6293   // addresses.
6294   if (TTI.prefersVectorizedAddressing())
6295     return;
6296 
6297   // Start with all scalar pointer uses.
6298   SmallPtrSet<Instruction *, 8> AddrDefs;
6299   for (BasicBlock *BB : TheLoop->blocks())
6300     for (Instruction &I : *BB) {
6301       Instruction *PtrDef =
6302         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6303       if (PtrDef && TheLoop->contains(PtrDef) &&
6304           getWideningDecision(&I, VF) != CM_GatherScatter)
6305         AddrDefs.insert(PtrDef);
6306     }
6307 
6308   // Add all instructions used to generate the addresses.
6309   SmallVector<Instruction *, 4> Worklist;
6310   for (auto *I : AddrDefs)
6311     Worklist.push_back(I);
6312   while (!Worklist.empty()) {
6313     Instruction *I = Worklist.pop_back_val();
6314     for (auto &Op : I->operands())
6315       if (auto *InstOp = dyn_cast<Instruction>(Op))
6316         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6317             AddrDefs.insert(InstOp).second)
6318           Worklist.push_back(InstOp);
6319   }
6320 
6321   for (auto *I : AddrDefs) {
6322     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded value is involved in an address computation, it is instead
      // changed here when we know this is the case.
6327       InstWidening Decision = getWideningDecision(I, VF);
6328       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6329         // Scalarize a widened load of address.
6330         setWideningDecision(I, VF, CM_Scalarize,
6331                             (VF * getMemoryInstructionCost(I, 1)));
6332       else if (auto Group = getInterleavedAccessGroup(I)) {
6333         // Scalarize an interleave group of address loads.
6334         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6335           if (Instruction *Member = Group->getMember(I))
6336             setWideningDecision(Member, VF, CM_Scalarize,
6337                                 (VF * getMemoryInstructionCost(Member, 1)));
6338         }
6339       }
6340     } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
6343       ForcedScalars[VF].insert(I);
6344   }
6345 }
6346 
6347 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6348                                                         unsigned VF,
6349                                                         Type *&VectorTy) {
6350   Type *RetTy = I->getType();
6351   if (canTruncateToMinimalBitwidth(I, VF))
6352     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6353   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6354   auto SE = PSE.getSE();
6355   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6356 
6357   // TODO: We need to estimate the cost of intrinsic calls.
6358   switch (I->getOpcode()) {
6359   case Instruction::GetElementPtr:
6360     // We mark this instruction as zero-cost because the cost of GEPs in
6361     // vectorized code depends on whether the corresponding memory instruction
6362     // is scalarized or not. Therefore, we handle GEPs with the memory
6363     // instruction cost.
6364     return 0;
6365   case Instruction::Br: {
6366     // In cases of scalarized and predicated instructions, there will be VF
6367     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6369     bool ScalarPredicatedBB = false;
6370     BranchInst *BI = cast<BranchInst>(I);
6371     if (VF > 1 && BI->isConditional() &&
6372         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6373          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6374       ScalarPredicatedBB = true;
6375 
6376     if (ScalarPredicatedBB) {
6377       // Return cost for branches around scalarized and predicated blocks.
6378       auto *Vec_i1Ty =
6379           FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6380       return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
6381                                            false, true) +
6382               (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF));
6383     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6384       // The back-edge branch will remain, as will all scalar branches.
6385       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6386     else
6387       // This branch will be eliminated by if-conversion.
6388       return 0;
6389     // Note: We currently assume zero cost for an unconditional branch inside
6390     // a predicated block since it will become a fall-through, although we
6391     // may decide in the future to call TTI for all branches.
6392   }
6393   case Instruction::PHI: {
6394     auto *Phi = cast<PHINode>(I);
6395 
6396     // First-order recurrences are replaced by vector shuffles inside the loop.
6397     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6398     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6399       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6400                                 cast<VectorType>(VectorTy), VF - 1,
6401                                 FixedVectorType::get(RetTy, 1));
6402 
6403     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6404     // converted into select instructions. We require N - 1 selects per phi
6405     // node, where N is the number of incoming values.
6406     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6407       return (Phi->getNumIncomingValues() - 1) *
6408              TTI.getCmpSelInstrCost(
6409                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6410                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6411                  CostKind);
6412 
6413     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6414   }
6415   case Instruction::UDiv:
6416   case Instruction::SDiv:
6417   case Instruction::URem:
6418   case Instruction::SRem:
6419     // If we have a predicated instruction, it may not be executed for each
6420     // vector lane. Get the scalarization cost and scale this amount by the
6421     // probability of executing the predicated block. If the instruction is not
6422     // predicated, we fall through to the next case.
6423     if (VF > 1 && isScalarWithPredication(I)) {
6424       unsigned Cost = 0;
6425 
6426       // These instructions have a non-void type, so account for the phi nodes
6427       // that we will create. This cost is likely to be zero. The phi node
6428       // cost, if any, should be scaled by the block probability because it
6429       // models a copy at the end of each predicated block.
6430       Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6431 
6432       // The cost of the non-predicated instruction.
6433       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6434 
6435       // The cost of insertelement and extractelement instructions needed for
6436       // scalarization.
6437       Cost += getScalarizationOverhead(I, VF);
6438 
6439       // Scale the cost by the probability of executing the predicated blocks.
6440       // This assumes the predicated block for each vector lane is equally
6441       // likely.
6442       return Cost / getReciprocalPredBlockProb();
6443     }
6444     LLVM_FALLTHROUGH;
6445   case Instruction::Add:
6446   case Instruction::FAdd:
6447   case Instruction::Sub:
6448   case Instruction::FSub:
6449   case Instruction::Mul:
6450   case Instruction::FMul:
6451   case Instruction::FDiv:
6452   case Instruction::FRem:
6453   case Instruction::Shl:
6454   case Instruction::LShr:
6455   case Instruction::AShr:
6456   case Instruction::And:
6457   case Instruction::Or:
6458   case Instruction::Xor: {
    // Since we replace the stride by 1, the multiplication should go away.
6460     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6461       return 0;
6462     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6464     Value *Op2 = I->getOperand(1);
6465     TargetTransformInfo::OperandValueProperties Op2VP;
6466     TargetTransformInfo::OperandValueKind Op2VK =
6467         TTI.getOperandInfo(Op2, Op2VP);
6468     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6469       Op2VK = TargetTransformInfo::OK_UniformValue;
6470 
6471     SmallVector<const Value *, 4> Operands(I->operand_values());
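    // If the instruction remains scalar after vectorization, it is replicated
    // VF times; otherwise a single vector instruction is emitted.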
6472     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6473     return N * TTI.getArithmeticInstrCost(
6474                    I->getOpcode(), VectorTy, CostKind,
6475                    TargetTransformInfo::OK_AnyValue,
6476                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6477   }
6478   case Instruction::FNeg: {
6479     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6480     return N * TTI.getArithmeticInstrCost(
6481                    I->getOpcode(), VectorTy, CostKind,
6482                    TargetTransformInfo::OK_AnyValue,
6483                    TargetTransformInfo::OK_AnyValue,
6484                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6485                    I->getOperand(0), I);
6486   }
6487   case Instruction::Select: {
6488     SelectInst *SI = cast<SelectInst>(I);
6489     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6490     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6491     Type *CondTy = SI->getCondition()->getType();
6492     if (!ScalarCond)
6493       CondTy = FixedVectorType::get(CondTy, VF);
6494 
6495     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6496                                   CostKind, I);
6497   }
6498   case Instruction::ICmp:
6499   case Instruction::FCmp: {
6500     Type *ValTy = I->getOperand(0)->getType();
6501     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6502     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6503       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6504     VectorTy = ToVectorTy(ValTy, VF);
6505     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6506                                   I);
6507   }
6508   case Instruction::Store:
6509   case Instruction::Load: {
6510     unsigned Width = VF;
6511     if (Width > 1) {
6512       InstWidening Decision = getWideningDecision(I, Width);
6513       assert(Decision != CM_Unknown &&
6514              "CM decision should be taken at this point");
6515       if (Decision == CM_Scalarize)
6516         Width = 1;
6517     }
6518     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6519     return getMemoryInstructionCost(I, VF);
6520   }
6521   case Instruction::ZExt:
6522   case Instruction::SExt:
6523   case Instruction::FPToUI:
6524   case Instruction::FPToSI:
6525   case Instruction::FPExt:
6526   case Instruction::PtrToInt:
6527   case Instruction::IntToPtr:
6528   case Instruction::SIToFP:
6529   case Instruction::UIToFP:
6530   case Instruction::Trunc:
6531   case Instruction::FPTrunc:
6532   case Instruction::BitCast: {
6533     // Computes the CastContextHint from a Load/Store instruction.
6534     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6535       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6536              "Expected a load or a store!");
6537 
6538       if (VF == 1 || !TheLoop->contains(I))
6539         return TTI::CastContextHint::Normal;
6540 
6541       switch (getWideningDecision(I, VF)) {
6542       case LoopVectorizationCostModel::CM_GatherScatter:
6543         return TTI::CastContextHint::GatherScatter;
6544       case LoopVectorizationCostModel::CM_Interleave:
6545         return TTI::CastContextHint::Interleave;
6546       case LoopVectorizationCostModel::CM_Scalarize:
6547       case LoopVectorizationCostModel::CM_Widen:
6548         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6549                                         : TTI::CastContextHint::Normal;
6550       case LoopVectorizationCostModel::CM_Widen_Reverse:
6551         return TTI::CastContextHint::Reversed;
6552       case LoopVectorizationCostModel::CM_Unknown:
6553         llvm_unreachable("Instr did not go through cost modelling?");
6554       }
6555 
6556       llvm_unreachable("Unhandled case!");
6557     };
6558 
6559     unsigned Opcode = I->getOpcode();
6560     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6561     // For Trunc, the context is the only user, which must be a StoreInst.
6562     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6563       if (I->hasOneUse())
6564         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6565           CCH = ComputeCCH(Store);
6566     }
6567     // For Z/Sext, the context is the operand, which must be a LoadInst.
6568     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6569              Opcode == Instruction::FPExt) {
6570       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6571         CCH = ComputeCCH(Load);
6572     }
6573 
6574     // We optimize the truncation of induction variables having constant
6575     // integer steps. The cost of these truncations is the same as the scalar
6576     // operation.
6577     if (isOptimizableIVTruncate(I, VF)) {
6578       auto *Trunc = cast<TruncInst>(I);
6579       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6580                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6581     }
6582 
6583     Type *SrcScalarTy = I->getOperand(0)->getType();
6584     Type *SrcVecTy =
6585         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6586     if (canTruncateToMinimalBitwidth(I, VF)) {
6587       // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
6589       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6590       //
6591       // Calculate the modified src and dest types.
6592       Type *MinVecTy = VectorTy;
6593       if (Opcode == Instruction::Trunc) {
6594         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6595         VectorTy =
6596             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6597       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6598         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6599         VectorTy =
6600             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6601       }
6602     }
6603 
6604     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6605     return N *
6606            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6607   }
6608   case Instruction::Call: {
6609     bool NeedToScalarize;
6610     CallInst *CI = cast<CallInst>(I);
6611     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6612     if (getVectorIntrinsicIDForCall(CI, TLI))
6613       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6614     return CallCost;
6615   }
6616   default:
6617     // The cost of executing VF copies of the scalar instruction. This opcode
6618     // is unknown. Assume that it is the same as 'mul'.
6619     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
6620                                            CostKind) +
6621            getScalarizationOverhead(I, VF);
6622   } // end of switch.
6623 }
6624 
6625 char LoopVectorize::ID = 0;
6626 
6627 static const char lv_name[] = "Loop Vectorization";
6628 
6629 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6630 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6631 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6632 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6633 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6634 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6635 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6636 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6637 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6638 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6639 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6640 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6641 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6642 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6643 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6644 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6645 
6646 namespace llvm {
6647 
6648 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6649 
6650 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6651                               bool VectorizeOnlyWhenForced) {
6652   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6653 }
6654 
6655 } // end namespace llvm
6656 
6657 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6658   // Check if the pointer operand of a load or store instruction is
6659   // consecutive.
6660   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6661     return Legal->isConsecutivePtr(Ptr);
6662   return false;
6663 }
6664 
6665 void LoopVectorizationCostModel::collectValuesToIgnore() {
6666   // Ignore ephemeral values.
6667   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6668 
6669   // Ignore type-promoting instructions we identified during reduction
6670   // detection.
6671   for (auto &Reduction : Legal->getReductionVars()) {
6672     RecurrenceDescriptor &RedDes = Reduction.second;
6673     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6674     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6675   }
6676   // Ignore type-casting instructions we identified during induction
6677   // detection.
6678   for (auto &Induction : Legal->getInductionVars()) {
6679     InductionDescriptor &IndDes = Induction.second;
6680     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6681     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6682   }
6683 }
6684 
6685 void LoopVectorizationCostModel::collectInLoopReductions() {
6686   // For the moment, without predicated reduction instructions, we do not
6687   // support inloop reductions whilst folding the tail, and hence in those cases
6688   // all reductions are currently out of the loop.
6689   if (!PreferInLoopReductions || foldTailByMasking())
6690     return;
6691 
6692   for (auto &Reduction : Legal->getReductionVars()) {
6693     PHINode *Phi = Reduction.first;
6694     RecurrenceDescriptor &RdxDesc = Reduction.second;
6695 
6696     // We don't collect reductions that are type promoted (yet).
6697     if (RdxDesc.getRecurrenceType() != Phi->getType())
6698       continue;
6699 
6700     // Check that we can correctly put the reductions into the loop, by
6701     // finding the chain of operations that leads from the phi to the loop
6702     // exit value.
6703     SmallVector<Instruction *, 4> ReductionOperations =
6704         RdxDesc.getReductionOpChain(Phi, TheLoop);
6705     bool InLoop = !ReductionOperations.empty();
6706     if (InLoop)
6707       InLoopReductionChains[Phi] = ReductionOperations;
6708     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6709                       << " reduction for phi: " << *Phi << "\n");
6710   }
6711 }
6712 
6713 // TODO: we could return a pair of values that specify the max VF and
6714 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6716 // doesn't have a cost model that can choose which plan to execute if
6717 // more than one is generated.
6718 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6719                                  LoopVectorizationCostModel &CM) {
6720   unsigned WidestType;
6721   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6722   return WidestVectorRegBits / WidestType;
6723 }
6724 
6725 VectorizationFactor
6726 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6727   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
6730   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6731   // the vectorization pipeline.
6732   if (!OrigLoop->empty()) {
6733     // If the user doesn't provide a vectorization factor, determine a
6734     // reasonable one.
6735     if (!UserVF) {
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector */), CM);
6737       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6738 
6739       // Make sure we have a VF > 1 for stress testing.
6740       if (VPlanBuildStressTest && VF < 2) {
6741         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6742                           << "overriding computed VF.\n");
6743         VF = 4;
6744       }
6745     }
6746     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6747     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6748     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6749                       << " to build VPlans.\n");
6750     buildVPlans(VF, VF);
6751 
6752     // For VPlan build stress testing, we bail out after VPlan construction.
6753     if (VPlanBuildStressTest)
6754       return VectorizationFactor::Disabled();
6755 
6756     return {VF, 0};
6757   }
6758 
6759   LLVM_DEBUG(
6760       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6761                 "VPlan-native path.\n");
6762   return VectorizationFactor::Disabled();
6763 }
6764 
6765 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
6766                                                              unsigned UserIC) {
6767   assert(OrigLoop->empty() && "Inner loop expected.");
6768   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6770     return None;
6771 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
6773   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6774       !useMaskedInterleavedAccesses(*TTI)) {
6775     LLVM_DEBUG(
6776         dbgs()
6777         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6778            "which requires masked-interleaved support.\n");
6779     if (CM.InterleaveInfo.invalidateGroups())
6780       // Invalidating interleave groups also requires invalidating all decisions
6781       // based on them, which includes widening decisions and uniform and scalar
6782       // values.
6783       CM.invalidateCostModelingDecisions();
6784   }
6785 
6786   if (UserVF) {
6787     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6788     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6789     // Collect the instructions (and their associated costs) that will be more
6790     // profitable to scalarize.
6791     CM.selectUserVectorizationFactor(UserVF);
6792     CM.collectInLoopReductions();
6793     buildVPlansWithVPRecipes(UserVF, UserVF);
6794     LLVM_DEBUG(printPlans(dbgs()));
6795     return {{UserVF, 0}};
6796   }
6797 
6798   unsigned MaxVF = MaybeMaxVF.getValue();
6799   assert(MaxVF != 0 && "MaxVF is zero.");
6800 
6801   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6802     // Collect Uniform and Scalar instructions after vectorization with VF.
6803     CM.collectUniformsAndScalars(VF);
6804 
6805     // Collect the instructions (and their associated costs) that will be more
6806     // profitable to scalarize.
6807     if (VF > 1)
6808       CM.collectInstsToScalarize(VF);
6809   }
6810 
6811   CM.collectInLoopReductions();
6812 
6813   buildVPlansWithVPRecipes(1, MaxVF);
6814   LLVM_DEBUG(printPlans(dbgs()));
6815   if (MaxVF == 1)
6816     return VectorizationFactor::Disabled();
6817 
6818   // Select the optimal vectorization factor.
6819   return CM.selectVectorizationFactor(MaxVF);
6820 }
6821 
6822 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6823   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6824                     << '\n');
6825   BestVF = VF;
6826   BestUF = UF;
6827 
6828   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6829     return !Plan->hasVF(VF);
6830   });
  assert(VPlans.size() == 1 && "Best VF does not have exactly one VPlan.");
6832 }
6833 
6834 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6835                                            DominatorTree *DT) {
6836   // Perform the actual loop transformation.
6837 
6838   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6839   VPCallbackILV CallbackILV(ILV);
6840 
6841   VPTransformState State{BestVF, BestUF,      LI,
6842                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6843                          &ILV,   CallbackILV};
6844   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6845   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6846   State.CanonicalIV = ILV.Induction;
6847 
6848   //===------------------------------------------------===//
6849   //
  // Notice: any optimization or new instruction that goes
6851   // into the code below should also be implemented in
6852   // the cost-model.
6853   //
6854   //===------------------------------------------------===//
6855 
6856   // 2. Copy and widen instructions from the old loop into the new loop.
6857   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6858   VPlans.front()->execute(&State);
6859 
6860   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6861   //    predication, updating analyses.
6862   ILV.fixVectorizedLoop();
6863 }
6864 
6865 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6866     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6867   BasicBlock *Latch = OrigLoop->getLoopLatch();
6868 
6869   // We create new control-flow for the vectorized loop, so the original
6870   // condition will be dead after vectorization if it's only used by the
6871   // branch.
6872   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6873   if (Cmp && Cmp->hasOneUse())
6874     DeadInstructions.insert(Cmp);
6875 
6876   // We create new "steps" for induction variable updates to which the original
6877   // induction variables map. An original update instruction will be dead if
6878   // all its users except the induction variable are dead.
6879   for (auto &Induction : Legal->getInductionVars()) {
6880     PHINode *Ind = Induction.first;
6881     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6882     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6883           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
6884         }))
6885       DeadInstructions.insert(IndUpdate);
6886 
6887     // We record as "Dead" also the type-casting instructions we had identified
6888     // during induction analysis. We don't need any handling for them in the
6889     // vectorized loop because we have proven that, under a proper runtime
6890     // test guarding the vectorized loop, the value of the phi, and the casted
6891     // value of the phi, are the same. The last instruction in this casting chain
6892     // will get its scalar/vector/widened def from the scalar/vector/widened def
6893     // of the respective phi node. Any other casts in the induction def-use chain
6894     // have no other uses outside the phi update chain, and will be ignored.
6895     InductionDescriptor &IndDes = Induction.second;
6896     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6897     DeadInstructions.insert(Casts.begin(), Casts.end());
6898   }
6899 }
6900 
6901 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6902 
6903 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6904 
6905 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6906                                         Instruction::BinaryOps BinOp) {
6907   // When unrolling and the VF is 1, we only need to add a simple scalar.
6908   Type *Ty = Val->getType();
6909   assert(!Ty->isVectorTy() && "Val must be a scalar");
6910 
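  // The scalar step for part StartIdx is Val op (StartIdx * Step): BinOp is
  // used for floating point values and a plain add for integers.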
6911   if (Ty->isFloatingPointTy()) {
6912     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6913 
6914     // Floating point operations had to be 'fast' to enable the unrolling.
6915     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6916     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6917   }
6918   Constant *C = ConstantInt::get(Ty, StartIdx);
6919   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6920 }
6921 
6922 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6923   SmallVector<Metadata *, 4> MDs;
6924   // Reserve first location for self reference to the LoopID metadata node.
6925   MDs.push_back(nullptr);
6926   bool IsUnrollMetadata = false;
6927   MDNode *LoopID = L->getLoopID();
6928   if (LoopID) {
6929     // First find existing loop unrolling disable metadata.
6930     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6931       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6932       if (MD) {
6933         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        IsUnrollMetadata |=
            S && S->getString().startswith("llvm.loop.unroll.disable");
6936       }
6937       MDs.push_back(LoopID->getOperand(i));
6938     }
6939   }
6940 
6941   if (!IsUnrollMetadata) {
6942     // Add runtime unroll disable metadata.
6943     LLVMContext &Context = L->getHeader()->getContext();
6944     SmallVector<Metadata *, 1> DisableOperands;
6945     DisableOperands.push_back(
6946         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6947     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6948     MDs.push_back(DisableNode);
6949     MDNode *NewLoopID = MDNode::get(Context, MDs);
6950     // Set operand 0 to refer to the loop id itself.
6951     NewLoopID->replaceOperandWith(0, NewLoopID);
6952     L->setLoopID(NewLoopID);
6953   }
6954 }
6955 
6956 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6957     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6958   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6959   bool PredicateAtRangeStart = Predicate(Range.Start);
6960 
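  // Clamp Range.End so that every VF in [Range.Start, Range.End) yields the
  // same decision: stop at the first VF whose decision differs from the one
  // taken at Range.Start.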
6961   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6962     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6963       Range.End = TmpVF;
6964       break;
6965     }
6966 
6967   return PredicateAtRangeStart;
6968 }
6969 
6970 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6971 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6972 /// of VF's starting at a given VF and extending it as much as possible. Each
6973 /// vectorization decision can potentially shorten this sub-range during
6974 /// buildVPlan().
6975 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6976   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6977     VFRange SubRange = {VF, MaxVF + 1};
6978     VPlans.push_back(buildVPlan(SubRange));
6979     VF = SubRange.End;
6980   }
6981 }
6982 
6983 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6984                                          VPlanPtr &Plan) {
6985   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6986 
6987   // Look for cached value.
6988   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6989   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6990   if (ECEntryIt != EdgeMaskCache.end())
6991     return ECEntryIt->second;
6992 
6993   VPValue *SrcMask = createBlockInMask(Src, Plan);
6994 
6995   // The terminator has to be a branch inst!
6996   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6997   assert(BI && "Unexpected terminator found");
6998 
6999   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7000     return EdgeMaskCache[Edge] = SrcMask;
7001 
7002   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
7003   assert(EdgeMask && "No Edge Mask found for condition");
7004 
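  // If Dst is reached on the false edge, the edge mask is the negation of the
  // branch condition.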
7005   if (BI->getSuccessor(0) != Dst)
7006     EdgeMask = Builder.createNot(EdgeMask);
7007 
7008   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7009     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7010 
7011   return EdgeMaskCache[Edge] = EdgeMask;
7012 }
7013 
7014 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7015   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7016 
7017   // Look for cached value.
7018   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7019   if (BCEntryIt != BlockMaskCache.end())
7020     return BCEntryIt->second;
7021 
7022   // All-one mask is modelled as no-mask following the convention for masked
7023   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7024   VPValue *BlockMask = nullptr;
7025 
7026   if (OrigLoop->getHeader() == BB) {
7027     if (!CM.blockNeedsPredication(BB))
7028       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7029 
7030     // Introduce the early-exit compare IV <= BTC to form header block mask.
7031     // This is used instead of IV < TC because TC may wrap, unlike BTC.
7032     // Start by constructing the desired canonical IV.
7033     VPValue *IV = nullptr;
7034     if (Legal->getPrimaryInduction())
7035       IV = Plan->getVPValue(Legal->getPrimaryInduction());
7036     else {
7037       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7038       Builder.getInsertBlock()->appendRecipe(IVRecipe);
7039       IV = IVRecipe->getVPValue();
7040     }
7041     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7042     bool TailFolded = !CM.isScalarEpilogueAllowed();
7043     if (TailFolded && CM.TTI.emitGetActiveLaneMask())
7044       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
7045     else
7046       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7047     return BlockMaskCache[BB] = BlockMask;
7048   }
7049 
7050   // This is the block mask. We OR all incoming edges.
7051   for (auto *Predecessor : predecessors(BB)) {
7052     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7053     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7054       return BlockMaskCache[BB] = EdgeMask;
7055 
7056     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7057       BlockMask = EdgeMask;
7058       continue;
7059     }
7060 
7061     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7062   }
7063 
7064   return BlockMaskCache[BB] = BlockMask;
7065 }
7066 
7067 VPWidenMemoryInstructionRecipe *
7068 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7069                                   VPlanPtr &Plan) {
7070   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7071          "Must be called with either a load or store");
7072 
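  // A load or store is widened (as a consecutive, interleaved, or
  // gather/scatter access) unless the cost model decided to scalarize it for
  // this VF.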
7073   auto willWiden = [&](unsigned VF) -> bool {
7074     if (VF == 1)
7075       return false;
7076     LoopVectorizationCostModel::InstWidening Decision =
7077         CM.getWideningDecision(I, VF);
7078     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7079            "CM decision should be taken at this point.");
7080     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7081       return true;
7082     if (CM.isScalarAfterVectorization(I, VF) ||
7083         CM.isProfitableToScalarize(I, VF))
7084       return false;
7085     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7086   };
7087 
7088   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7089     return nullptr;
7090 
7091   VPValue *Mask = nullptr;
7092   if (Legal->isMaskRequired(I))
7093     Mask = createBlockInMask(I->getParent(), Plan);
7094 
7095   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7096   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7097     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7098 
7099   StoreInst *Store = cast<StoreInst>(I);
7100   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7101   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7102 }
7103 
7104 VPWidenIntOrFpInductionRecipe *
7105 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7106   // Check if this is an integer or fp induction. If so, build the recipe that
7107   // produces its scalar and vector values.
7108   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7109   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7110       II.getKind() == InductionDescriptor::IK_FpInduction)
7111     return new VPWidenIntOrFpInductionRecipe(Phi);
7112 
7113   return nullptr;
7114 }
7115 
7116 VPWidenIntOrFpInductionRecipe *
7117 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7118                                                 VFRange &Range) const {
7119   // Optimize the special case where the source is a constant integer
7120   // induction variable. Notice that we can only optimize the 'trunc' case
7121   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7122   // (c) other casts depend on pointer size.
7123 
7124   // Determine whether \p K is a truncation based on an induction variable that
7125   // can be optimized.
7126   auto isOptimizableIVTruncate =
7127       [&](Instruction *K) -> std::function<bool(unsigned)> {
7128     return
7129         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
7130   };
7131 
7132   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7133           isOptimizableIVTruncate(I), Range))
7134     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7135                                              I);
7136   return nullptr;
7137 }
7138 
7139 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7140   // We know that all PHIs in non-header blocks are converted into selects, so
7141   // we don't have to worry about the insertion order and we can just use the
7142   // builder. At this point we generate the predication tree. There may be
7143   // duplications since this is a simple recursive scan, but future
7144   // optimizations will clean it up.
7145 
7146   SmallVector<VPValue *, 2> Operands;
7147   unsigned NumIncoming = Phi->getNumIncomingValues();
7148   for (unsigned In = 0; In < NumIncoming; In++) {
7149     VPValue *EdgeMask =
7150       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7151     assert((EdgeMask || NumIncoming == 1) &&
7152            "Multiple predecessors with one having a full mask");
7153     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7154     if (EdgeMask)
7155       Operands.push_back(EdgeMask);
7156   }
7157   return new VPBlendRecipe(Phi, Operands);
7158 }
7159 
7160 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7161                                                    VPlan &Plan) const {
7162 
7163   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7164       [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); },
7165       Range);
7166 
7167   if (IsPredicated)
7168     return nullptr;
7169 
7170   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7171   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7172              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7173     return nullptr;
7174 
7175   auto willWiden = [&](unsigned VF) -> bool {
7176     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag indicates whether we use an intrinsic or an ordinary call for
    // the vectorized version of the instruction, i.e. whether the intrinsic
    // call is more beneficial than a library call.
7181     bool NeedToScalarize = false;
7182     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7183     bool UseVectorIntrinsic =
7184         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7185     return UseVectorIntrinsic || !NeedToScalarize;
7186   };
7187 
7188   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7189     return nullptr;
7190 
7191   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7192 }
7193 
7194 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7195   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7196          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
7199   auto WillScalarize = [this, I](unsigned VF) -> bool {
7200     return CM.isScalarAfterVectorization(I, VF) ||
7201            CM.isProfitableToScalarize(I, VF) ||
7202            CM.isScalarWithPredication(I, VF);
7203   };
7204   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7205                                                              Range);
7206 }
7207 
7208 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7209   auto IsVectorizableOpcode = [](unsigned Opcode) {
7210     switch (Opcode) {
7211     case Instruction::Add:
7212     case Instruction::And:
7213     case Instruction::AShr:
7214     case Instruction::BitCast:
7215     case Instruction::FAdd:
7216     case Instruction::FCmp:
7217     case Instruction::FDiv:
7218     case Instruction::FMul:
7219     case Instruction::FNeg:
7220     case Instruction::FPExt:
7221     case Instruction::FPToSI:
7222     case Instruction::FPToUI:
7223     case Instruction::FPTrunc:
7224     case Instruction::FRem:
7225     case Instruction::FSub:
7226     case Instruction::ICmp:
7227     case Instruction::IntToPtr:
7228     case Instruction::LShr:
7229     case Instruction::Mul:
7230     case Instruction::Or:
7231     case Instruction::PtrToInt:
7232     case Instruction::SDiv:
7233     case Instruction::Select:
7234     case Instruction::SExt:
7235     case Instruction::Shl:
7236     case Instruction::SIToFP:
7237     case Instruction::SRem:
7238     case Instruction::Sub:
7239     case Instruction::Trunc:
7240     case Instruction::UDiv:
7241     case Instruction::UIToFP:
7242     case Instruction::URem:
7243     case Instruction::Xor:
7244     case Instruction::ZExt:
7245       return true;
7246     }
7247     return false;
7248   };
7249 
7250   if (!IsVectorizableOpcode(I->getOpcode()))
7251     return nullptr;
7252 
7253   // Success: widen this instruction.
7254   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7255 }
7256 
7257 VPBasicBlock *VPRecipeBuilder::handleReplication(
7258     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7259     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7260     VPlanPtr &Plan) {
7261   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7262       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7263       Range);
7264 
7265   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7266       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7267 
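  // Replicate the instruction per lane (or only once if it is uniform); if it
  // is predicated, it is additionally wrapped in an if-then region below.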
7268   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7269                                        IsUniform, IsPredicated);
7270   setRecipe(I, Recipe);
7271 
7272   // Find if I uses a predicated instruction. If so, it will use its scalar
7273   // value. Avoid hoisting the insert-element which packs the scalar value into
7274   // a vector value, as that happens iff all users use the vector value.
7275   for (auto &Op : I->operands())
7276     if (auto *PredInst = dyn_cast<Instruction>(Op))
7277       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7278         PredInst2Recipe[PredInst]->setAlsoPack(false);
7279 
  // Finalize the recipe for Instr; handle the non-predicated case first.
7281   if (!IsPredicated) {
7282     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7283     VPBB->appendRecipe(Recipe);
7284     return VPBB;
7285   }
7286   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7287   assert(VPBB->getSuccessors().empty() &&
7288          "VPBB has successors when handling predicated replication.");
7289   // Record predicated instructions for above packing optimizations.
7290   PredInst2Recipe[I] = Recipe;
7291   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7292   VPBlockUtils::insertBlockAfter(Region, VPBB);
7293   auto *RegSucc = new VPBasicBlock();
7294   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7295   return RegSucc;
7296 }
7297 
7298 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7299                                                       VPRecipeBase *PredRecipe,
7300                                                       VPlanPtr &Plan) {
7301   // Instructions marked for predication are replicated and placed under an
7302   // if-then construct to prevent side-effects.
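  //
  // Schematically, the region built below has the following shape (block
  // names are illustrative; the real names are derived from the instruction's
  // opcode):
  //
  //   pred.<opcode>.entry:     ; Entry, holding the VPBranchOnMaskRecipe
  //     branch-on-mask <BlockInMask>
  //   pred.<opcode>.if:        ; Pred, holding the replicated PredRecipe
  //     <replicated Instr>
  //   pred.<opcode>.continue:  ; Exit, holding the optional VPPredInstPHIRecipe
  //     <phi of Instr's value, if Instr produces one>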
7303 
7304   // Generate recipes to compute the block mask for this region.
7305   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7306 
7307   // Build the triangular if-then region.
7308   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7309   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7310   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7311   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7312   auto *PHIRecipe =
7313       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7314   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7315   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7316   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7317 
7318   // Note: first set Entry as region entry and then connect successors starting
7319   // from it in order, to propagate the "parent" of each VPBasicBlock.
7320   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7321   VPBlockUtils::connectBlocks(Pred, Exit);
7322 
7323   return Region;
7324 }
7325 
7326 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7327                                                       VFRange &Range,
7328                                                       VPlanPtr &Plan) {
7329   // First, check for specific widening recipes that deal with calls, memory
7330   // operations, inductions and Phi nodes.
7331   if (auto *CI = dyn_cast<CallInst>(Instr))
7332     return tryToWidenCall(CI, Range, *Plan);
7333 
7334   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7335     return tryToWidenMemory(Instr, Range, Plan);
7336 
7337   VPRecipeBase *Recipe;
7338   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7339     if (Phi->getParent() != OrigLoop->getHeader())
7340       return tryToBlend(Phi, Plan);
7341     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7342       return Recipe;
7343     return new VPWidenPHIRecipe(Phi);
7344   }
7345 
7346   if (isa<TruncInst>(Instr) &&
7347       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7348     return Recipe;
7349 
7350   if (!shouldWiden(Instr, Range))
7351     return nullptr;
7352 
7353   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7354     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7355                                 OrigLoop);
7356 
7357   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7358     bool InvariantCond =
7359         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7360     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7361                                    InvariantCond);
7362   }
7363 
7364   return tryToWiden(Instr, *Plan);
7365 }
7366 
7367 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7368                                                         unsigned MaxVF) {
7369   assert(OrigLoop->empty() && "Inner loop expected.");
7370 
7371   // Collect conditions feeding internal conditional branches; they need to be
7372   // represented in VPlan for it to model masking.
7373   SmallPtrSet<Value *, 1> NeedDef;
7374 
7375   auto *Latch = OrigLoop->getLoopLatch();
7376   for (BasicBlock *BB : OrigLoop->blocks()) {
7377     if (BB == Latch)
7378       continue;
7379     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7380     if (Branch && Branch->isConditional())
7381       NeedDef.insert(Branch->getCondition());
7382   }
7383 
  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for it to model early-exit
  // masking.
7386   // Also, both the Phi and the live-out instruction of each reduction are
7387   // required in order to introduce a select between them in VPlan.
7388   if (CM.foldTailByMasking()) {
7389     if (Legal->getPrimaryInduction())
7390       NeedDef.insert(Legal->getPrimaryInduction());
7391     for (auto &Reduction : Legal->getReductionVars()) {
7392       NeedDef.insert(Reduction.first);
7393       NeedDef.insert(Reduction.second.getLoopExitInstr());
7394     }
7395   }
7396 
7397   // Collect instructions from the original loop that will become trivially dead
7398   // in the vectorized loop. We don't need to vectorize these instructions. For
7399   // example, original induction update instructions can become dead because we
7400   // separately emit induction "steps" when generating code for the new loop.
7401   // Similarly, we create a new latch condition when setting up the structure
7402   // of the new loop, so the old one can become dead.
7403   SmallPtrSet<Instruction *, 4> DeadInstructions;
7404   collectTriviallyDeadInstructions(DeadInstructions);
7405 
7406   // Add assume instructions we need to drop to DeadInstructions, to prevent
7407   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7409   // control flow is preserved, we should keep them.
7410   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7411   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7412 
7413   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7414   // Dead instructions do not need sinking. Remove them from SinkAfter.
7415   for (Instruction *I : DeadInstructions)
7416     SinkAfter.erase(I);
7417 
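  // Split [MinVF, MaxVF] into sub-ranges, each covered by a single VPlan.
  // buildVPlanWithVPRecipes may clamp SubRange.End so that all VFs within a
  // sub-range share the same widening decisions; e.g. (hypothetically)
  // MinVF = 2 and MaxVF = 8 may yield one plan for {2} and another for {4,8}.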
7418   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7419     VFRange SubRange = {VF, MaxVF + 1};
7420     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7421                                              DeadInstructions, SinkAfter));
7422     VF = SubRange.End;
7423   }
7424 }
7425 
7426 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7427     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7428     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7429     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7430 
7431   // Hold a mapping from predicated instructions to their recipes, in order to
7432   // fix their AlsoPack behavior if a user is determined to replicate and use a
7433   // scalar instead of vector value.
7434   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7435 
7436   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7437 
7438   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7439 
7440   // ---------------------------------------------------------------------------
7441   // Pre-construction: record ingredients whose recipes we'll need to further
7442   // process after constructing the initial VPlan.
7443   // ---------------------------------------------------------------------------
7444 
7445   // Mark instructions we'll need to sink later and their targets as
7446   // ingredients whose recipe we'll need to record.
7447   for (auto &Entry : SinkAfter) {
7448     RecipeBuilder.recordRecipeOf(Entry.first);
7449     RecipeBuilder.recordRecipeOf(Entry.second);
7450   }
7451   for (auto &Reduction : CM.getInLoopReductionChains()) {
7452     PHINode *Phi = Reduction.first;
7453     RecurrenceDescriptor::RecurrenceKind Kind =
7454         Legal->getReductionVars()[Phi].getRecurrenceKind();
7455     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7456 
7457     RecipeBuilder.recordRecipeOf(Phi);
7458     for (auto &R : ReductionOperations) {
7459       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
7461       // need to record the ICmp recipe, so it can be removed later.
7462       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7463           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7464         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7465       }
7466     }
7467   }
7468 
7469   // For each interleave group which is relevant for this (possibly trimmed)
7470   // Range, add it to the set of groups to be later applied to the VPlan and add
7471   // placeholders for its members' Recipes which we'll be replacing with a
7472   // single VPInterleaveRecipe.
7473   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7474     auto applyIG = [IG, this](unsigned VF) -> bool {
7475       return (VF >= 2 && // Query is illegal for VF == 1
7476               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7477                   LoopVectorizationCostModel::CM_Interleave);
7478     };
7479     if (!getDecisionAndClampRange(applyIG, Range))
7480       continue;
7481     InterleaveGroups.insert(IG);
7482     for (unsigned i = 0; i < IG->getFactor(); i++)
7483       if (Instruction *Member = IG->getMember(i))
7484         RecipeBuilder.recordRecipeOf(Member);
  }
7486 
7487   // ---------------------------------------------------------------------------
7488   // Build initial VPlan: Scan the body of the loop in a topological order to
7489   // visit each basic block after having visited its predecessor basic blocks.
7490   // ---------------------------------------------------------------------------
7491 
7492   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7493   auto Plan = std::make_unique<VPlan>();
7494   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7495   Plan->setEntry(VPBB);
7496 
7497   // Represent values that will have defs inside VPlan.
7498   for (Value *V : NeedDef)
7499     Plan->addVPValue(V);
7500 
7501   // Scan the body of the loop in a topological order to visit each basic block
7502   // after having visited its predecessor basic blocks.
7503   LoopBlocksDFS DFS(OrigLoop);
7504   DFS.perform(LI);
7505 
7506   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7507     // Relevant instructions from basic block BB will be grouped into VPRecipe
7508     // ingredients and fill a new VPBasicBlock.
7509     unsigned VPBBsForBB = 0;
7510     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7511     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7512     VPBB = FirstVPBBForBB;
7513     Builder.setInsertPoint(VPBB);
7514 
7515     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7517     for (Instruction &I : BB->instructionsWithoutDebug()) {
7518       Instruction *Instr = &I;
7519 
7520       // First filter out irrelevant instructions, to ensure no recipes are
7521       // built for them.
7522       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7523         continue;
7524 
7525       if (auto Recipe =
7526               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7527         RecipeBuilder.setRecipe(Instr, Recipe);
7528         VPBB->appendRecipe(Recipe);
7529         continue;
7530       }
7531 
      // Otherwise, if all widening options failed, the instruction is to be
7533       // replicated. This may create a successor for VPBB.
7534       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7535           Instr, Range, VPBB, PredInst2Recipe, Plan);
7536       if (NextVPBB != VPBB) {
7537         VPBB = NextVPBB;
7538         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7539                                     : "");
7540       }
7541     }
7542   }
7543 
  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic blocks with no recipes.
7547   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7548   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7549   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7550   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7551   delete PreEntry;
7552 
7553   // ---------------------------------------------------------------------------
7554   // Transform initial VPlan: Apply previously taken decisions, in order, to
7555   // bring the VPlan to its final state.
7556   // ---------------------------------------------------------------------------
7557 
7558   // Apply Sink-After legal constraints.
7559   for (auto &Entry : SinkAfter) {
7560     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7561     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7562     Sink->moveAfter(Target);
7563   }
7564 
7565   // Interleave memory: for each Interleave Group we marked earlier as relevant
7566   // for this VPlan, replace the Recipes widening its memory instructions with a
7567   // single VPInterleaveRecipe at its insertion point.
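  // For example (schematically), a group of two interleaved loads A[2*i] and
  // A[2*i+1] has its two widened-load recipes replaced by a single
  // VPInterleaveRecipe, which later emits one wide load followed by shuffles
  // that de-interleave the members.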
7568   for (auto IG : InterleaveGroups) {
7569     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7570         RecipeBuilder.getRecipe(IG->getInsertPos()));
7571     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7572         ->insertBefore(Recipe);
7573 
7574     for (unsigned i = 0; i < IG->getFactor(); ++i)
7575       if (Instruction *Member = IG->getMember(i)) {
7576         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7577       }
7578   }
7579 
  // Adjust the recipes for any in-loop reductions.
7581   if (Range.Start > 1)
7582     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
7583 
7584   // Finally, if tail is folded by masking, introduce selects between the phi
7585   // and the live-out instruction of each reduction, at the end of the latch.
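  // E.g. (schematically), for a reduction with header phi %rdx.phi and
  // live-out value %rdx.next, this appends at the end of the latch:
  //   select <header block-in-mask>, %rdx.next, %rdx.phi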
7586   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7587     Builder.setInsertPoint(VPBB);
7588     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7589     for (auto &Reduction : Legal->getReductionVars()) {
7590       assert(!CM.isInLoopReduction(Reduction.first) &&
7591              "Didn't expect inloop tail folded reduction yet!");
7592       VPValue *Phi = Plan->getVPValue(Reduction.first);
7593       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7594       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7595     }
7596   }
7597 
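  // Name the plan after the VFs it covers; e.g. for Range = [2, 8) the name
  // below becomes "Initial VPlan for VF={2,4},UF>=1".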
7598   std::string PlanName;
7599   raw_string_ostream RSO(PlanName);
7600   unsigned VF = Range.Start;
7601   Plan->addVF(VF);
7602   RSO << "Initial VPlan for VF={" << VF;
7603   for (VF *= 2; VF < Range.End; VF *= 2) {
7604     Plan->addVF(VF);
7605     RSO << "," << VF;
7606   }
7607   RSO << "},UF>=1";
7608   RSO.flush();
7609   Plan->setName(PlanName);
7610 
7611   return Plan;
7612 }
7613 
7614 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
7616   // transformations before even evaluating whether vectorization is profitable.
7617   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7618   // the vectorization pipeline.
7619   assert(!OrigLoop->empty());
7620   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7621 
7622   // Create new empty VPlan
7623   auto Plan = std::make_unique<VPlan>();
7624 
7625   // Build hierarchical CFG
7626   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7627   HCFGBuilder.buildHierarchicalCFG();
7628 
7629   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7630     Plan->addVF(VF);
7631 
7632   if (EnableVPlanPredication) {
7633     VPlanPredicator VPP(*Plan);
7634     VPP.predicate();
7635 
7636     // Avoid running transformation to recipes until masked code generation in
7637     // VPlan-native path is in place.
7638     return Plan;
7639   }
7640 
7641   SmallPtrSet<Instruction *, 1> DeadInstructions;
7642   VPlanTransforms::VPInstructionsToVPRecipes(
7643       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7644   return Plan;
7645 }
7646 
// Adjust the recipes for any in-loop reductions. The chain of instructions
// leading from the loop exit instruction to the phi needs to be converted to
7649 // reductions, with one operand being vector and the other being the scalar
7650 // reduction chain.
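// For example (schematically), for an in-loop integer add reduction
//   %phi = phi ...
//   %sum = add %phi, %x
// the VPWidenRecipe created for %sum is replaced by a VPReductionRecipe whose
// chain operand is %phi and whose vector operand is %x. For min/max
// reductions the select's recipe is replaced instead, and the recipe of the
// icmp/fcmp feeding it is erased as well.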
7651 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7652     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7653   for (auto &Reduction : CM.getInLoopReductionChains()) {
7654     PHINode *Phi = Reduction.first;
7655     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7656     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7657 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
7660     // which of the two operands will remain scalar and which will be reduced.
7661     // For minmax the chain will be the select instructions.
7662     Instruction *Chain = Phi;
7663     for (Instruction *R : ReductionOperations) {
7664       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
7665       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
7666 
7667       VPValue *ChainOp = Plan->getVPValue(Chain);
7668       unsigned FirstOpId;
7669       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7670           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7671         assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC &&
7672                "Expected to replace a VPWidenSelectSC");
7673         FirstOpId = 1;
7674       } else {
7675         assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7676                "Expected to replace a VPWidenSC");
7677         FirstOpId = 0;
7678       }
7679       unsigned VecOpId =
7680           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
7681       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
7682 
7683       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
7684           &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI);
7685       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
7686       WidenRecipe->eraseFromParent();
7687 
7688       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7689           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7690         VPRecipeBase *CompareRecipe =
7691             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
7692         assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7693                "Expected to replace a VPWidenSC");
7694         CompareRecipe->eraseFromParent();
7695       }
7696       Chain = R;
7697     }
7698   }
7699 }
7700 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7705 
7706 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7707     Value *V, const VPIteration &Instance) {
7708   return ILV.getOrCreateScalarValue(V, Instance);
7709 }
7710 
7711 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7712                                VPSlotTracker &SlotTracker) const {
7713   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7714   IG->getInsertPos()->printAsOperand(O, false);
7715   O << ", ";
7716   getAddr()->printAsOperand(O, SlotTracker);
7717   VPValue *Mask = getMask();
7718   if (Mask) {
7719     O << ", ";
7720     Mask->printAsOperand(O, SlotTracker);
7721   }
7722   for (unsigned i = 0; i < IG->getFactor(); ++i)
7723     if (Instruction *I = IG->getMember(i))
7724       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7725 }
7726 
7727 void VPWidenCallRecipe::execute(VPTransformState &State) {
7728   State.ILV->widenCallInstruction(Ingredient, User, State);
7729 }
7730 
7731 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7732   State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
7733 }
7734 
7735 void VPWidenRecipe::execute(VPTransformState &State) {
7736   State.ILV->widenInstruction(Ingredient, User, State);
7737 }
7738 
7739 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7740   State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
7741                       IsIndexLoopInvariant, State);
7742 }
7743 
7744 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7745   assert(!State.Instance && "Int or FP induction being replicated.");
7746   State.ILV->widenIntOrFpInduction(IV, Trunc);
7747 }
7748 
7749 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7750   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7751 }
7752 
7753 void VPBlendRecipe::execute(VPTransformState &State) {
7754   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7755   // We know that all PHIs in non-header blocks are converted into
7756   // selects, so we don't have to worry about the insertion order and we
7757   // can just use the builder.
7758   // At this point we generate the predication tree. There may be
7759   // duplications since this is a simple recursive scan, but future
7760   // optimizations will clean it up.
7761 
7762   unsigned NumIncoming = getNumIncomingValues();
7763 
7764   // Generate a sequence of selects of the form:
7765   // SELECT(Mask3, In3,
7766   //        SELECT(Mask2, In2,
7767   //               SELECT(Mask1, In1,
7768   //                      In0)))
7769   // Note that Mask0 is never used: lanes for which no path reaches this phi and
7770   // are essentially undef are taken from In0.
7771   InnerLoopVectorizer::VectorParts Entry(State.UF);
7772   for (unsigned In = 0; In < NumIncoming; ++In) {
7773     for (unsigned Part = 0; Part < State.UF; ++Part) {
7774       // We might have single edge PHIs (blocks) - use an identity
7775       // 'select' for the first PHI operand.
7776       Value *In0 = State.get(getIncomingValue(In), Part);
7777       if (In == 0)
7778         Entry[Part] = In0; // Initialize with the first incoming value.
7779       else {
7780         // Select between the current value and the previous incoming edge
7781         // based on the incoming mask.
7782         Value *Cond = State.get(getMask(In), Part);
7783         Entry[Part] =
7784             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7785       }
7786     }
7787   }
7788   for (unsigned Part = 0; Part < State.UF; ++Part)
7789     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7790 }
7791 
7792 void VPInterleaveRecipe::execute(VPTransformState &State) {
7793   assert(!State.Instance && "Interleave group being replicated.");
7794   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7795 }
7796 
7797 void VPReductionRecipe::execute(VPTransformState &State) {
7798   assert(!State.Instance && "Reduction being replicated.");
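  // Schematically, for each unrolled part this emits (e.g. for an integer add
  // reduction):
  //   %red  = <target reduction of the part's vector operand>
  //   %next = add %red, <previous value in the scalar reduction chain>
  // For min/max recurrences the final combine is a min/max operation instead
  // of the reduction's binary opcode.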
7799   for (unsigned Part = 0; Part < State.UF; ++Part) {
7800     unsigned Kind = RdxDesc->getRecurrenceKind();
7801     Value *NewVecOp = State.get(VecOp, Part);
7802     Value *NewRed =
7803         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
7804     Value *PrevInChain = State.get(ChainOp, Part);
7805     Value *NextInChain;
7806     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7807         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7808       NextInChain =
7809           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
7810                          NewRed, PrevInChain);
7811     } else {
7812       NextInChain = State.Builder.CreateBinOp(
7813           (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain);
7814     }
7815     State.ValueMap.setVectorValue(I, Part, NextInChain);
7816   }
7817 }
7818 
7819 void VPReplicateRecipe::execute(VPTransformState &State) {
7820   if (State.Instance) { // Generate a single instance.
7821     State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
7822                                     IsPredicated, State);
7823     // Insert scalar instance packing it into a vector.
7824     if (AlsoPack && State.VF > 1) {
7825       // If we're constructing lane 0, initialize to start from undef.
7826       if (State.Instance->Lane == 0) {
7827         Value *Undef = UndefValue::get(
7828             FixedVectorType::get(Ingredient->getType(), State.VF));
7829         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7830       }
7831       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7832     }
7833     return;
7834   }
7835 
7836   // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for each
7838   // of the UF parts.
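  // E.g. with VF = 4 and UF = 2, a non-uniform instruction gets 8 scalar
  // copies (one per part and lane), while a uniform one gets only 2 (lane 0
  // of each part).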
7839   unsigned EndLane = IsUniform ? 1 : State.VF;
7840   for (unsigned Part = 0; Part < State.UF; ++Part)
7841     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7842       State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
7843                                       IsPredicated, State);
7844 }
7845 
7846 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7847   assert(State.Instance && "Branch on Mask works only on single instance.");
7848 
7849   unsigned Part = State.Instance->Part;
7850   unsigned Lane = State.Instance->Lane;
7851 
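  // Schematically, for the current (Part, Lane) instance this replaces the
  // block's temporary terminator
  //   unreachable
  // with
  //   %c = extractelement <mask for Part>, i32 Lane  ; or 'true' if no mask
  //   br i1 %c, ...                                  ; successors set later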
7852   Value *ConditionBit = nullptr;
7853   VPValue *BlockInMask = getMask();
7854   if (BlockInMask) {
7855     ConditionBit = State.get(BlockInMask, Part);
7856     if (ConditionBit->getType()->isVectorTy())
7857       ConditionBit = State.Builder.CreateExtractElement(
7858           ConditionBit, State.Builder.getInt32(Lane));
7859   } else // Block in mask is all-one.
7860     ConditionBit = State.Builder.getTrue();
7861 
7862   // Replace the temporary unreachable terminator with a new conditional branch,
7863   // whose two destinations will be set later when they are created.
7864   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7865   assert(isa<UnreachableInst>(CurrentTerminator) &&
7866          "Expected to replace unreachable terminator with conditional branch.");
7867   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7868   CondBr->setSuccessor(0, nullptr);
7869   ReplaceInstWithInst(CurrentTerminator, CondBr);
7870 }
7871 
7872 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7873   assert(State.Instance && "Predicated instruction PHI works per instance.");
7874   Instruction *ScalarPredInst = cast<Instruction>(
7875       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7876   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7877   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7878   assert(PredicatingBB && "Predicated block has no single predecessor.");
7879 
7880   // By current pack/unpack logic we need to generate only a single phi node: if
7881   // a vector value for the predicated instruction exists at this point it means
7882   // the instruction has vector users only, and a phi for the vector value is
7883   // needed. In this case the recipe of the predicated instruction is marked to
7884   // also do that packing, thereby "hoisting" the insert-element sequence.
7885   // Otherwise, a phi node for the scalar value is needed.
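  // Schematically, in the vector-value case the phi generated below is:
  //   %vphi = phi [ <unmodified vector>,            <predicating block> ],
  //               [ <vector with inserted element>, <predicated block> ]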
7886   unsigned Part = State.Instance->Part;
7887   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7888     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7889     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7890     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7891     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7892     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7893     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7894   } else {
7895     Type *PredInstType = PredInst->getType();
7896     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7897     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7898     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7899     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7900   }
7901 }
7902 
7903 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7904   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7905   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7906                                         getMask());
7907 }
7908 
// Determine how to lower the scalar epilogue, which depends on 1) whether we
// are optimizing for minimum code size, 2) predication compiler options,
// 3) loop hints forcing predication, and 4) a TTI hook that analyzes whether
// the loop is suitable for predication.
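// For example, if PreferPredicateOverEpilog is explicitly set to false on the
// command line, that disables predication regardless of the loop hint and the
// TTI hook (cases 3 and 4), but it does not override the code-size check (1).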
7913 static ScalarEpilogueLowering getScalarEpilogueLowering(
7914     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7915     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7916     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7917     LoopVectorizationLegality &LVL) {
7918   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7919   // don't look at hints or options, and don't request a scalar epilogue.
7920   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
7921   // LoopAccessInfo (due to code dependency and not being able to reliably get
7922   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
7923   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
7924   // versioning when the vectorization is forced, unlike hasOptSize. So revert
7925   // back to the old way and vectorize with versioning when forced. See D81345.)
7926   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7927                                                       PGSOQueryType::IRPass) &&
7928                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
7929     return CM_ScalarEpilogueNotAllowedOptSize;
7930 
7931   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7932                               !PreferPredicateOverEpilog;
7933 
7934   // 2) Next, if disabling predication is requested on the command line, honour
7935   // this and request a scalar epilogue.
7936   if (PredicateOptDisabled)
7937     return CM_ScalarEpilogueAllowed;
7938 
  // 3) and 4) check if enabling predication is requested on the command line
  // or with a loop hint, or if the TTI hook indicates this is profitable; if
  // so, request predication.
7942   if (PreferPredicateOverEpilog ||
7943       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7944       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7945                                         LVL.getLAI()) &&
7946        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7947     return CM_ScalarEpilogueNotNeededUsePredicate;
7948 
7949   return CM_ScalarEpilogueAllowed;
7950 }
7951 
7952 // Process the loop in the VPlan-native vectorization path. This path builds
7953 // VPlan upfront in the vectorization pipeline, which allows to apply
7954 // VPlan-to-VPlan transformations from the very beginning without modifying the
7955 // input LLVM IR.
7956 static bool processLoopInVPlanNativePath(
7957     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7958     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7959     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7960     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7961     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7962 
7963   if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
7964     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
7965     return false;
7966   }
7967   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7968   Function *F = L->getHeader()->getParent();
7969   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7970 
7971   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7972       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7973 
7974   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7975                                 &Hints, IAI);
7976   // Use the planner for outer loop vectorization.
7977   // TODO: CM is not used at this point inside the planner. Turn CM into an
7978   // optional argument if we don't need it in the future.
7979   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7980 
7981   // Get user vectorization factor.
7982   const unsigned UserVF = Hints.getWidth();
7983 
7984   // Plan how to best vectorize, return the best VF and its cost.
7985   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7986 
7987   // If we are stress testing VPlan builds, do not attempt to generate vector
7988   // code. Masked vector code generation support will follow soon.
7989   // Also, do not attempt to vectorize if no vector code will be produced.
7990   if (VPlanBuildStressTest || EnableVPlanPredication ||
7991       VectorizationFactor::Disabled() == VF)
7992     return false;
7993 
7994   LVP.setBestPlan(VF.Width, 1);
7995 
7996   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7997                          &CM, BFI, PSI);
7998   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7999                     << L->getHeader()->getParent()->getName() << "\"\n");
8000   LVP.executePlan(LB, DT);
8001 
8002   // Mark the loop as already vectorized to avoid vectorizing again.
8003   Hints.setAlreadyVectorized();
8004 
8005   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8006   return true;
8007 }
8008 
8009 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8010     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8011                                !EnableLoopInterleaving),
8012       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8013                               !EnableLoopVectorization) {}
8014 
8015 bool LoopVectorizePass::processLoop(Loop *L) {
8016   assert((EnableVPlanNativePath || L->empty()) &&
8017          "VPlan-native path is not enabled. Only process inner loops.");
8018 
8019 #ifndef NDEBUG
8020   const std::string DebugLocStr = getDebugLocString(L);
8021 #endif /* NDEBUG */
8022 
8023   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8024                     << L->getHeader()->getParent()->getName() << "\" from "
8025                     << DebugLocStr << "\n");
8026 
8027   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8028 
8029   LLVM_DEBUG(
8030       dbgs() << "LV: Loop hints:"
8031              << " force="
8032              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8033                      ? "disabled"
8034                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8035                             ? "enabled"
8036                             : "?"))
8037              << " width=" << Hints.getWidth()
8038              << " unroll=" << Hints.getInterleave() << "\n");
8039 
8040   // Function containing loop
8041   Function *F = L->getHeader()->getParent();
8042 
8043   // Looking at the diagnostic output is the only way to determine if a loop
8044   // was vectorized (other than looking at the IR or machine code), so it
8045   // is important to generate an optimization remark for each loop. Most of
8046   // these messages are generated as OptimizationRemarkAnalysis. Remarks
8047   // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
8049   // benefit from vectorization, respectively.
8050 
8051   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8052     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8053     return false;
8054   }
8055 
8056   PredicatedScalarEvolution PSE(*SE, *L);
8057 
8058   // Check if it is legal to vectorize the loop.
8059   LoopVectorizationRequirements Requirements(*ORE);
8060   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8061                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8062   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8063     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8064     Hints.emitRemarkWithHints();
8065     return false;
8066   }
8067 
8068   // Check the function attributes and profiles to find out if this function
8069   // should be optimized for size.
8070   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8071       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8072 
8073   // Entrance to the VPlan-native vectorization path. Outer loops are processed
8074   // here. They may require CFG and instruction level transformations before
8075   // even evaluating whether vectorization is profitable. Since we cannot modify
8076   // the incoming IR, we need to build VPlan upfront in the vectorization
8077   // pipeline.
8078   if (!L->empty())
8079     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
8080                                         ORE, BFI, PSI, Hints);
8081 
8082   assert(L->empty() && "Inner loop expected.");
8083 
8084   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8085   // count by optimizing for size, to minimize overheads.
8086   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
8087   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
8088     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8089                       << "This loop is worth vectorizing only if no scalar "
8090                       << "iteration overheads are incurred.");
8091     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8092       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8093     else {
8094       LLVM_DEBUG(dbgs() << "\n");
8095       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
8096     }
8097   }
8098 
8099   // Check the function attributes to see if implicit floats are allowed.
8100   // FIXME: This check doesn't seem possibly correct -- what if the loop is
8101   // an integer loop and the vector instructions selected are purely integer
8102   // vector instructions?
8103   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
8104     reportVectorizationFailure(
8105         "Can't vectorize when the NoImplicitFloat attribute is used",
8106         "loop not vectorized due to NoImplicitFloat attribute",
8107         "NoImplicitFloat", ORE, L);
8108     Hints.emitRemarkWithHints();
8109     return false;
8110   }
8111 
8112   // Check if the target supports potentially unsafe FP vectorization.
8113   // FIXME: Add a check for the type of safety issue (denormal, signaling)
8114   // for the target we're vectorizing for, to make sure none of the
8115   // additional fp-math flags can help.
8116   if (Hints.isPotentiallyUnsafe() &&
8117       TTI->isFPVectorizationPotentiallyUnsafe()) {
8118     reportVectorizationFailure(
8119         "Potentially unsafe FP op prevents vectorization",
8120         "loop not vectorized due to unsafe FP support.",
8121         "UnsafeFP", ORE, L);
8122     Hints.emitRemarkWithHints();
8123     return false;
8124   }
8125 
8126   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
8127   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
8128 
8129   // If an override option has been passed in for interleaved accesses, use it.
8130   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
8131     UseInterleaved = EnableInterleavedMemAccesses;
8132 
8133   // Analyze interleaved memory accesses.
8134   if (UseInterleaved) {
8135     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
8136   }
8137 
8138   // Use the cost model.
8139   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
8140                                 F, &Hints, IAI);
8141   CM.collectValuesToIgnore();
8142 
8143   // Use the planner for vectorization.
8144   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
8145 
8146   // Get user vectorization factor and interleave count.
8147   unsigned UserVF = Hints.getWidth();
8148   unsigned UserIC = Hints.getInterleave();
8149 
8150   // Plan how to best vectorize, return the best VF and its cost.
8151   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
8152 
8153   VectorizationFactor VF = VectorizationFactor::Disabled();
8154   unsigned IC = 1;
8155 
8156   if (MaybeVF) {
8157     VF = *MaybeVF;
8158     // Select the interleave count.
8159     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
8160   }
8161 
8162   // Identify the diagnostic messages that should be produced.
8163   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8164   bool VectorizeLoop = true, InterleaveLoop = true;
8165   if (Requirements.doesNotMeet(F, L, Hints)) {
8166     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
8167                          "requirements.\n");
8168     Hints.emitRemarkWithHints();
8169     return false;
8170   }
8171 
8172   if (VF.Width == 1) {
8173     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8174     VecDiagMsg = std::make_pair(
8175         "VectorizationNotBeneficial",
8176         "the cost-model indicates that vectorization is not beneficial");
8177     VectorizeLoop = false;
8178   }
8179 
8180   if (!MaybeVF && UserIC > 1) {
8181     // Tell the user interleaving was avoided up-front, despite being explicitly
8182     // requested.
8183     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
8184                          "interleaving should be avoided up front\n");
8185     IntDiagMsg = std::make_pair(
8186         "InterleavingAvoided",
8187         "Ignoring UserIC, because interleaving was avoided up front");
8188     InterleaveLoop = false;
8189   } else if (IC == 1 && UserIC <= 1) {
8190     // Tell the user interleaving is not beneficial.
8191     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8192     IntDiagMsg = std::make_pair(
8193         "InterleavingNotBeneficial",
8194         "the cost-model indicates that interleaving is not beneficial");
8195     InterleaveLoop = false;
8196     if (UserIC == 1) {
8197       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8198       IntDiagMsg.second +=
8199           " and is explicitly disabled or interleave count is set to 1";
8200     }
8201   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
8203     LLVM_DEBUG(
8204         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
8205     IntDiagMsg = std::make_pair(
8206         "InterleavingBeneficialButDisabled",
8207         "the cost-model indicates that interleaving is beneficial "
8208         "but is explicitly disabled or interleave count is set to 1");
8209     InterleaveLoop = false;
8210   }
8211 
8212   // Override IC if user provided an interleave count.
8213   IC = UserIC > 0 ? UserIC : IC;
8214 
8215   // Emit diagnostic messages, if any.
8216   const char *VAPassName = Hints.vectorizeAnalysisPassName();
8217   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
8219     ORE->emit([&]() {
8220       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
8221                                       L->getStartLoc(), L->getHeader())
8222              << VecDiagMsg.second;
8223     });
8224     ORE->emit([&]() {
8225       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8226                                       L->getStartLoc(), L->getHeader())
8227              << IntDiagMsg.second;
8228     });
8229     return false;
8230   } else if (!VectorizeLoop && InterleaveLoop) {
8231     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8232     ORE->emit([&]() {
8233       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
8234                                         L->getStartLoc(), L->getHeader())
8235              << VecDiagMsg.second;
8236     });
8237   } else if (VectorizeLoop && !InterleaveLoop) {
8238     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8239                       << ") in " << DebugLocStr << '\n');
8240     ORE->emit([&]() {
8241       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8242                                         L->getStartLoc(), L->getHeader())
8243              << IntDiagMsg.second;
8244     });
8245   } else if (VectorizeLoop && InterleaveLoop) {
8246     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8247                       << ") in " << DebugLocStr << '\n');
8248     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8249   }
8250 
8251   LVP.setBestPlan(VF.Width, IC);
8252 
8253   using namespace ore;
8254   bool DisableRuntimeUnroll = false;
8255   MDNode *OrigLoopID = L->getLoopID();
8256 
8257   if (!VectorizeLoop) {
8258     assert(IC > 1 && "interleave count should not be 1 or 0");
8259     // If we decided that it is not legal to vectorize the loop, then
8260     // interleave it.
8261     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
8262                                BFI, PSI);
8263     LVP.executePlan(Unroller, DT);
8264 
8265     ORE->emit([&]() {
8266       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8267                                 L->getHeader())
8268              << "interleaved loop (interleaved count: "
8269              << NV("InterleaveCount", IC) << ")";
8270     });
8271   } else {
8272     // If we decided that it is *legal* to vectorize the loop, then do it.
8273     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
8274                            &LVL, &CM, BFI, PSI);
8275     LVP.executePlan(LB, DT);
8276     ++LoopsVectorized;
8277 
8278     // Add metadata to disable runtime unrolling a scalar loop when there are
8279     // no runtime checks about strides and memory. A scalar loop that is
8280     // rarely used is not worth unrolling.
8281     if (!LB.areSafetyChecksAdded())
8282       DisableRuntimeUnroll = true;
8283 
8284     // Report the vectorization decision.
8285     ORE->emit([&]() {
8286       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
8287                                 L->getHeader())
8288              << "vectorized loop (vectorization width: "
8289              << NV("VectorizationFactor", VF.Width)
8290              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
8291     });
8292   }
8293 
8294   Optional<MDNode *> RemainderLoopID =
8295       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
8296                                       LLVMLoopVectorizeFollowupEpilogue});
8297   if (RemainderLoopID.hasValue()) {
8298     L->setLoopID(RemainderLoopID.getValue());
8299   } else {
8300     if (DisableRuntimeUnroll)
8301       AddRuntimeUnrollDisableMetaData(L);
8302 
8303     // Mark the loop as already vectorized to avoid vectorizing again.
8304     Hints.setAlreadyVectorized();
8305   }
8306 
8307   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8308   return true;
8309 }
8310 
8311 LoopVectorizeResult LoopVectorizePass::runImpl(
8312     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8313     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8314     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
8315     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8316     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
8317   SE = &SE_;
8318   LI = &LI_;
8319   TTI = &TTI_;
8320   DT = &DT_;
8321   BFI = &BFI_;
8322   TLI = TLI_;
8323   AA = &AA_;
8324   AC = &AC_;
8325   GetLAA = &GetLAA_;
8326   DB = &DB_;
8327   ORE = &ORE_;
8328   PSI = PSI_;
8329 
8330   // Don't attempt if
8331   // 1. the target claims to have no vector registers, and
8332   // 2. interleaving won't help ILP.
8333   //
8334   // The second condition is necessary because, even if the target has no
8335   // vector registers, loop vectorization may still enable scalar
8336   // interleaving.
8337   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8338       TTI->getMaxInterleaveFactor(1) < 2)
8339     return LoopVectorizeResult(false, false);
8340 
8341   bool Changed = false, CFGChanged = false;
8342 
8343   // The vectorizer requires loops to be in simplified form.
8344   // Since simplification may add new inner loops, it has to run before the
8345   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
8347   // vectorized.
8348   for (auto &L : *LI)
8349     Changed |= CFGChanged |=
8350         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8351 
8352   // Build up a worklist of inner-loops to vectorize. This is necessary as
8353   // the act of vectorizing or partially unrolling a loop creates new loops
8354   // and can invalidate iterators across the loops.
8355   SmallVector<Loop *, 8> Worklist;
8356 
8357   for (Loop *L : *LI)
8358     collectSupportedLoops(*L, LI, ORE, Worklist);
8359 
8360   LoopsAnalyzed += Worklist.size();
8361 
8362   // Now walk the identified inner loops.
8363   while (!Worklist.empty()) {
8364     Loop *L = Worklist.pop_back_val();
8365 
8366     // For the inner loops we actually process, form LCSSA to simplify the
8367     // transform.
8368     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8369 
8370     Changed |= CFGChanged |= processLoop(L);
8371   }
8372 
8373   // Process each loop nest in the function.
8374   return LoopVectorizeResult(Changed, CFGChanged);
8375 }
8376 
8377 PreservedAnalyses LoopVectorizePass::run(Function &F,
8378                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
8421 }
8422