1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
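//
// As a sketch (not the exact IR the pass emits), with a vectorization factor
// of 4 a scalar loop such as
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// conceptually becomes
//
//   for (i = 0; i + 3 < n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];  // one 'wide' iteration
//
// with any remaining iterations handled by a scalar remainder (epilogue) loop.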
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
181 // Indicates that an epilogue is undesired and that predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
191     cl::desc("Maximize bandwidth when selecting vectorization factor which "
192              "will be determined by the smallest type in loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
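/// As a sketch of the gap case: a group that loads only a[3*i] and a[3*i+1]
/// leaves a gap at a[3*i+2], so a single wide load covering the whole group
/// has to mask that third lane away.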
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
206     cl::desc("We don't interleave loops with an estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
261     cl::desc("Enable if predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 static cl::opt<bool>
269     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
270                            cl::Hidden,
271                            cl::desc("Prefer in-loop vector reductions, "
272                                     "overriding the target's preference."));
273 
274 cl::opt<bool> EnableVPlanNativePath(
275     "enable-vplan-native-path", cl::init(false), cl::Hidden,
276     cl::desc("Enable VPlan-native vectorization path with "
277              "support for outer loop vectorization."));
278 
279 // FIXME: Remove this switch once we have divergence analysis. Currently we
280 // assume divergent non-backedge branches when this switch is true.
281 cl::opt<bool> EnableVPlanPredication(
282     "enable-vplan-predication", cl::init(false), cl::Hidden,
283     cl::desc("Enable VPlan-native vectorization path predicator with "
284              "support for outer loop vectorization."));
285 
286 // This flag enables the stress testing of the VPlan H-CFG construction in the
287 // VPlan-native vectorization path. It must be used in conjunction with
288 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
289 // verification of the H-CFGs built.
290 static cl::opt<bool> VPlanBuildStressTest(
291     "vplan-build-stress-test", cl::init(false), cl::Hidden,
292     cl::desc(
293         "Build VPlan for every supported loop nest in the function and bail "
294         "out right after the build (stress test the VPlan H-CFG construction "
295         "in the VPlan-native vectorization path)."));
296 
297 cl::opt<bool> llvm::EnableLoopInterleaving(
298     "interleave-loops", cl::init(true), cl::Hidden,
299     cl::desc("Enable loop interleaving in Loop vectorization passes"));
300 cl::opt<bool> llvm::EnableLoopVectorization(
301     "vectorize-loops", cl::init(true), cl::Hidden,
302     cl::desc("Run the Loop vectorization passes"));
303 
304 /// A helper function that returns the type of a loaded or stored value.
305 static Type *getMemInstValueType(Value *I) {
306   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
307          "Expected Load or Store instruction");
308   if (auto *LI = dyn_cast<LoadInst>(I))
309     return LI->getType();
310   return cast<StoreInst>(I)->getValueOperand()->getType();
311 }
312 
313 /// A helper function that returns true if the given type is irregular. The
314 /// type is irregular if its allocated size doesn't equal the store size of an
315 /// element of the corresponding vector type at the given vectorization factor.
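/// For example (assuming a typical data layout), x86_fp80 is irregular: its
/// store size is 80 bits but its alloc size is larger, so <VF x x86_fp80>
/// elements would not be laid out back-to-back in memory.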
316 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
317   // Determine if an array of VF elements of type Ty is "bitcast compatible"
318   // with a <VF x Ty> vector.
319   if (VF > 1) {
320     auto *VectorTy = FixedVectorType::get(Ty, VF);
321     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
322   }
323 
324   // If the vectorization factor is one, we just check if an array of type Ty
325   // requires padding between elements.
326   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
327 }
328 
329 /// A helper function that returns the reciprocal of the block probability of
330 /// predicated blocks. If we return X, we are assuming the predicated block
331 /// will execute once for every X iterations of the loop header.
332 ///
333 /// TODO: We should use actual block probability here, if available. Currently,
334 ///       we always assume predicated blocks have a 50% chance of executing.
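/// (As a sketch of how this is meant to be used: costs attributed to a
/// predicated block are divided by this value, since the block is assumed to
/// run only once every getReciprocalPredBlockProb() header iterations.)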
335 static unsigned getReciprocalPredBlockProb() { return 2; }
336 
337 /// A helper function that adds a 'fast' flag to floating-point operations.
338 static Value *addFastMathFlag(Value *V) {
339   if (isa<FPMathOperator>(V))
340     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
341   return V;
342 }
343 
344 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
345   if (isa<FPMathOperator>(V))
346     cast<Instruction>(V)->setFastMathFlags(FMF);
347   return V;
348 }
349 
350 /// A helper function that returns an integer or floating-point constant with
351 /// value C.
352 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
353   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
354                            : ConstantFP::get(Ty, C);
355 }
356 
357 /// Returns "best known" trip count for the specified loop \p L as defined by
358 /// the following procedure:
359 ///   1) Returns exact trip count if it is known.
360 ///   2) Returns expected trip count according to profile data if any.
361 ///   3) Returns upper bound estimate if it is known.
362 ///   4) Returns None if all of the above failed.
363 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
364   // Check if exact trip count is known.
365   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
366     return ExpectedTC;
367 
368   // Check if there is an expected trip count available from profile data.
369   if (LoopVectorizeWithBlockFrequency)
370     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
371       return EstimatedTC;
372 
373   // Check if upper bound estimate is known.
374   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
375     return ExpectedTC;
376 
377   return None;
378 }
379 
380 namespace llvm {
381 
382 /// InnerLoopVectorizer vectorizes loops which contain only one basic
383 /// block to a specified vectorization factor (VF).
384 /// This class performs the widening of scalars into vectors, or multiple
385 /// scalars. This class also implements the following features:
386 /// * It inserts an epilogue loop for handling loops that don't have iteration
387 ///   counts that are known to be a multiple of the vectorization factor.
388 /// * It handles the code generation for reduction variables.
389 /// * Scalarization (implementation using scalars) of un-vectorizable
390 ///   instructions.
391 /// InnerLoopVectorizer does not perform any vectorization-legality
392 /// checks, and relies on the caller to check for the different legality
393 /// aspects. The InnerLoopVectorizer relies on the
394 /// LoopVectorizationLegality class to provide information about the induction
395 /// and reduction variables that were found for a given vectorization factor.
396 class InnerLoopVectorizer {
397 public:
398   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
399                       LoopInfo *LI, DominatorTree *DT,
400                       const TargetLibraryInfo *TLI,
401                       const TargetTransformInfo *TTI, AssumptionCache *AC,
402                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
403                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
404                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
405                       ProfileSummaryInfo *PSI)
406       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
407         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
408         Builder(PSE.getSE()->getContext()),
409         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
410         BFI(BFI), PSI(PSI) {
411     // Query this against the original loop and save it here because the profile
412     // of the original loop header may change as the transformation happens.
413     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
414         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
415   }
416 
417   virtual ~InnerLoopVectorizer() = default;
418 
419   /// Create a new empty loop that will contain vectorized instructions later
420   /// on, while the old loop will be used as the scalar remainder. Control flow
421   /// is generated around the vectorized (and scalar epilogue) loops consisting
422   /// of various checks and bypasses. Return the pre-header block of the new
423   /// loop.
424   BasicBlock *createVectorizedLoopSkeleton();
425 
426   /// Widen a single instruction within the innermost loop.
427   void widenInstruction(Instruction &I, VPUser &Operands,
428                         VPTransformState &State);
429 
430   /// Widen a single call instruction within the innermost loop.
431   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
432                             VPTransformState &State);
433 
434   /// Widen a single select instruction within the innermost loop.
435   void widenSelectInstruction(SelectInst &I, VPUser &Operands,
436                               bool InvariantCond, VPTransformState &State);
437 
438   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
439   void fixVectorizedLoop();
440 
441   // Return true if any runtime check is added.
442   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
443 
444   /// A type for vectorized values in the new loop. Each value from the
445   /// original loop, when vectorized, is represented by UF vector values in the
446   /// new unrolled loop, where UF is the unroll factor.
447   using VectorParts = SmallVector<Value *, 2>;
448 
449   /// Vectorize a single GetElementPtrInst based on information gathered and
450   /// decisions taken during planning.
451   void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
452                 unsigned VF, bool IsPtrLoopInvariant,
453                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
454 
455   /// Vectorize a single PHINode in a block. This method handles the induction
456   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
457   /// arbitrary length vectors.
458   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
459 
460   /// A helper function to scalarize a single Instruction in the innermost loop.
461   /// Generates a sequence of scalar instances for each lane between \p MinLane
462   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
463   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
464   /// Instr's operands.
465   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
466                             const VPIteration &Instance, bool IfPredicateInstr,
467                             VPTransformState &State);
468 
469   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
470   /// is provided, the integer induction variable will first be truncated to
471   /// the corresponding type.
472   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
473 
474   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
475   /// vector or scalar value on-demand if one is not yet available. When
476   /// vectorizing a loop, we visit the definition of an instruction before its
477   /// uses. When visiting the definition, we either vectorize or scalarize the
478   /// instruction, creating an entry for it in the corresponding map. (In some
479   /// cases, such as induction variables, we will create both vector and scalar
480   /// entries.) Then, as we encounter uses of the definition, we derive values
481   /// for each scalar or vector use unless such a value is already available.
482   /// For example, if we scalarize a definition and one of its uses is vector,
483   /// we build the required vector on-demand with an insertelement sequence
484   /// when visiting the use. Otherwise, if the use is scalar, we can use the
485   /// existing scalar definition.
486   ///
487   /// Return a value in the new loop corresponding to \p V from the original
488   /// loop at unroll index \p Part. If the value has already been vectorized,
489   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
490   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
491   /// a new vector value on-demand by inserting the scalar values into a vector
492   /// with an insertelement sequence. If the value has been neither vectorized
493   /// nor scalarized, it must be loop invariant, so we simply broadcast the
494   /// value into a vector.
495   Value *getOrCreateVectorValue(Value *V, unsigned Part);
496 
497   /// Return a value in the new loop corresponding to \p V from the original
498   /// loop at unroll and vector indices \p Instance. If the value has been
499   /// vectorized but not scalarized, the necessary extractelement instruction
500   /// will be generated.
501   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
502 
503   /// Construct the vector value of a scalarized value \p V one lane at a time.
504   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
505 
506   /// Try to vectorize interleaved access group \p Group with the base address
507   /// given in \p Addr, optionally masking the vector operations if \p
508   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
509   /// values in the vectorized loop.
510   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
511                                 VPTransformState &State, VPValue *Addr,
512                                 VPValue *BlockInMask = nullptr);
513 
514   /// Vectorize Load and Store instructions with the base address given in \p
515   /// Addr, optionally masking the vector operations if \p BlockInMask is
516   /// non-null. Use \p State to translate given VPValues to IR values in the
517   /// vectorized loop.
518   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
519                                   VPValue *Addr, VPValue *StoredValue,
520                                   VPValue *BlockInMask);
521 
522   /// Set the debug location in the builder using the debug location in
523   /// the instruction.
524   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
525 
526   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
527   void fixNonInductionPHIs();
528 
529 protected:
530   friend class LoopVectorizationPlanner;
531 
532   /// A small list of PHINodes.
533   using PhiVector = SmallVector<PHINode *, 4>;
534 
535   /// A type for scalarized values in the new loop. Each value from the
536   /// original loop, when scalarized, is represented by UF x VF scalar values
537   /// in the new unrolled loop, where UF is the unroll factor and VF is the
538   /// vectorization factor.
539   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
540 
541   /// Set up the values of the IVs correctly when exiting the vector loop.
542   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
543                     Value *CountRoundDown, Value *EndValue,
544                     BasicBlock *MiddleBlock);
545 
546   /// Create a new induction variable inside L.
547   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
548                                    Value *Step, Instruction *DL);
549 
550   /// Handle all cross-iteration phis in the header.
551   void fixCrossIterationPHIs();
552 
553   /// Fix a first-order recurrence. This is the second phase of vectorizing
554   /// this phi node.
555   void fixFirstOrderRecurrence(PHINode *Phi);
556 
557   /// Fix a reduction cross-iteration phi. This is the second phase of
558   /// vectorizing this phi node.
559   void fixReduction(PHINode *Phi);
560 
561   /// Clear NSW/NUW flags from reduction instructions if necessary.
562   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
563 
564   /// The loop exit block may have single-value PHI nodes with some
565   /// incoming value. While vectorizing, we only handled real values
566   /// that were defined inside the loop; we should have one incoming value
567   /// for each predecessor of the PHI's parent basic block. See PR14725.
568   void fixLCSSAPHIs();
569 
570   /// Iteratively sink the scalarized operands of a predicated instruction into
571   /// the block that was created for it.
572   void sinkScalarOperands(Instruction *PredInst);
573 
574   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
575   /// represented as.
576   void truncateToMinimalBitwidths();
577 
578   /// Create a broadcast instruction. This method generates a broadcast
579   /// instruction (shuffle) for loop invariant values and for the induction
580   /// value. If this is the induction variable then we extend it to N, N+1, ...
581   /// This is needed because each iteration in the loop corresponds to a SIMD
582   /// element.
583   virtual Value *getBroadcastInstrs(Value *V);
584 
585   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
586   /// to each vector element of Val. The sequence starts at StartIdx.
587   /// \p Opcode is relevant for FP induction variable.
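  /// For example (a sketch, not the exact IR emitted): if Val is a 4-element
  /// splat of the scalar IV, StartIdx is 0 and Step is 1, the result is the
  /// vector <IV, IV + 1, IV + 2, IV + 3>.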
588   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
589                                Instruction::BinaryOps Opcode =
590                                Instruction::BinaryOpsEnd);
591 
592   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
593   /// variable on which to base the steps, \p Step is the size of the step, and
594   /// \p EntryVal is the value from the original loop that maps to the steps.
595   /// Note that \p EntryVal doesn't have to be an induction variable - it
596   /// can also be a truncate instruction.
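  /// As a rough sketch (with unroll factor UF and vectorization factor VF),
  /// the scalar value produced for unroll part P and vector lane L is
  ///   ScalarIV + (P * VF + L) * Step.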
597   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
598                         const InductionDescriptor &ID);
599 
600   /// Create a vector induction phi node based on an existing scalar one. \p
601   /// EntryVal is the value from the original loop that maps to the vector phi
602   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
603   /// truncate instruction, instead of widening the original IV, we widen a
604   /// version of the IV truncated to \p EntryVal's type.
605   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
606                                        Value *Step, Instruction *EntryVal);
607 
608   /// Returns true if an instruction \p I should be scalarized instead of
609   /// vectorized for the chosen vectorization factor.
610   bool shouldScalarizeInstruction(Instruction *I) const;
611 
612   /// Returns true if we should generate a scalar version of \p IV.
613   bool needsScalarInduction(Instruction *IV) const;
614 
615   /// If there is a cast involved in the induction variable \p ID, which should
616   /// be ignored in the vectorized loop body, this function records the
617   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
618   /// cast. We had already proved that the casted Phi is equal to the uncasted
619   /// Phi in the vectorized loop (under a runtime guard), and therefore
620   /// there is no need to vectorize the cast - the same value can be used in the
621   /// vector loop for both the Phi and the cast.
622   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
623   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
624   ///
625   /// \p EntryVal is the value from the original loop that maps to the vector
626   /// phi node and is used to distinguish what is the IV currently being
627   /// processed - original one (if \p EntryVal is a phi corresponding to the
628   /// original IV) or the "newly-created" one based on the proof mentioned above
629   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
630   /// latter case \p EntryVal is a TruncInst and we must not record anything for
631   /// that IV, but it's error-prone to expect callers of this routine to care
632   /// about that, hence this explicit parameter.
633   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
634                                              const Instruction *EntryVal,
635                                              Value *VectorLoopValue,
636                                              unsigned Part,
637                                              unsigned Lane = UINT_MAX);
638 
639   /// Generate a shuffle sequence that will reverse the vector Vec.
640   virtual Value *reverseVector(Value *Vec);
641 
642   /// Returns (and creates if needed) the original loop trip count.
643   Value *getOrCreateTripCount(Loop *NewLoop);
644 
645   /// Returns (and creates if needed) the trip count of the widened loop.
646   Value *getOrCreateVectorTripCount(Loop *NewLoop);
647 
648   /// Returns a bitcasted value to the requested vector type.
649   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
650   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
651                                 const DataLayout &DL);
652 
653   /// Emit a bypass check to see if the vector trip count is zero, including if
654   /// it overflows.
655   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
656 
657   /// Emit a bypass check to see if all of the SCEV assumptions we've
658   /// had to make are correct.
659   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
660 
661   /// Emit bypass checks to check any memory assumptions we may have made.
662   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
663 
664   /// Compute the transformed value of Index at offset StartValue using step
665   /// StepValue.
666   /// For integer induction, returns StartValue + Index * StepValue.
667   /// For pointer induction, returns StartValue[Index * StepValue].
668   /// FIXME: The newly created binary instructions should contain nsw/nuw
669   /// flags, which can be found from the original scalar operations.
670   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
671                               const DataLayout &DL,
672                               const InductionDescriptor &ID) const;
673 
674   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
675   /// vector loop preheader, middle block and scalar preheader. Also
676   /// allocate a loop object for the new vector loop and return it.
677   Loop *createVectorLoopSkeleton(StringRef Prefix);
678 
679   /// Create new phi nodes for the induction variables to resume iteration count
680   /// in the scalar epilogue, from where the vectorized loop left off (given by
681   /// \p VectorTripCount).
682   void createInductionResumeValues(Loop *L, Value *VectorTripCount);
683 
684   /// Complete the loop skeleton by adding debug MDs, creating appropriate
685   /// conditional branches in the middle block, preparing the builder and
686   /// running the verifier. Take in the vector loop \p L as argument, and return
687   /// the preheader of the completed vector loop.
688   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
689 
690   /// Add additional metadata to \p To that was not present on \p Orig.
691   ///
692   /// Currently this is used to add the noalias annotations based on the
693   /// inserted memchecks.  Use this for instructions that are *cloned* into the
694   /// vector loop.
695   void addNewMetadata(Instruction *To, const Instruction *Orig);
696 
697   /// Add metadata from one instruction to another.
698   ///
699   /// This includes both the original MDs from \p From and additional ones (\see
700   /// addNewMetadata).  Use this for *newly created* instructions in the vector
701   /// loop.
702   void addMetadata(Instruction *To, Instruction *From);
703 
704   /// Similar to the previous function but it adds the metadata to a
705   /// vector of instructions.
706   void addMetadata(ArrayRef<Value *> To, Instruction *From);
707 
708   /// The original loop.
709   Loop *OrigLoop;
710 
711   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
712   /// dynamic knowledge to simplify SCEV expressions and converts them to a
713   /// more usable form.
714   PredicatedScalarEvolution &PSE;
715 
716   /// Loop Info.
717   LoopInfo *LI;
718 
719   /// Dominator Tree.
720   DominatorTree *DT;
721 
722   /// Alias Analysis.
723   AAResults *AA;
724 
725   /// Target Library Info.
726   const TargetLibraryInfo *TLI;
727 
728   /// Target Transform Info.
729   const TargetTransformInfo *TTI;
730 
731   /// Assumption Cache.
732   AssumptionCache *AC;
733 
734   /// Interface to emit optimization remarks.
735   OptimizationRemarkEmitter *ORE;
736 
737   /// LoopVersioning.  It's only set up (non-null) if memchecks were
738   /// used.
739   ///
740   /// This is currently only used to add no-alias metadata based on the
741   /// memchecks.  The actual versioning is performed manually.
742   std::unique_ptr<LoopVersioning> LVer;
743 
744   /// The vectorization SIMD factor to use. Each vector will have this many
745   /// vector elements.
746   unsigned VF;
747 
748   /// The vectorization unroll factor to use. Each scalar is vectorized to this
749   /// many different vector instructions.
750   unsigned UF;
751 
752   /// The builder that we use
753   IRBuilder<> Builder;
754 
755   // --- Vectorization state ---
756 
757   /// The vector-loop preheader.
758   BasicBlock *LoopVectorPreHeader;
759 
760   /// The scalar-loop preheader.
761   BasicBlock *LoopScalarPreHeader;
762 
763   /// Middle Block between the vector and the scalar.
764   BasicBlock *LoopMiddleBlock;
765 
766   /// The ExitBlock of the scalar loop.
767   BasicBlock *LoopExitBlock;
768 
769   /// The vector loop body.
770   BasicBlock *LoopVectorBody;
771 
772   /// The scalar loop body.
773   BasicBlock *LoopScalarBody;
774 
775   /// A list of all bypass blocks. The first block is the entry of the loop.
776   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
777 
778   /// The new Induction variable which was added to the new block.
779   PHINode *Induction = nullptr;
780 
781   /// The induction variable of the old basic block.
782   PHINode *OldInduction = nullptr;
783 
784   /// Maps values from the original loop to their corresponding values in the
785   /// vectorized loop. A key value can map to either vector values, scalar
786   /// values or both kinds of values, depending on whether the key was
787   /// vectorized and scalarized.
788   VectorizerValueMap VectorLoopValueMap;
789 
790   /// Store instructions that were predicated.
791   SmallVector<Instruction *, 4> PredicatedInstructions;
792 
793   /// Trip count of the original loop.
794   Value *TripCount = nullptr;
795 
796   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
797   Value *VectorTripCount = nullptr;
798 
799   /// The legality analysis.
800   LoopVectorizationLegality *Legal;
801 
802   /// The profitability analysis.
803   LoopVectorizationCostModel *Cost;
804 
805   // Record whether runtime checks are added.
806   bool AddedSafetyChecks = false;
807 
808   // Holds the end values for each induction variable. We save the end values
809   // so we can later fix-up the external users of the induction variables.
810   DenseMap<PHINode *, Value *> IVEndValues;
811 
812   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
813   // fixed up at the end of vector code generation.
814   SmallVector<PHINode *, 8> OrigPHIsToFix;
815 
816   /// BFI and PSI are used to check for profile guided size optimizations.
817   BlockFrequencyInfo *BFI;
818   ProfileSummaryInfo *PSI;
819 
820   // Whether this loop should be optimized for size based on profile guided size
821   // optimizations.
822   bool OptForSizeBasedOnProfile;
823 };
824 
825 class InnerLoopUnroller : public InnerLoopVectorizer {
826 public:
827   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
828                     LoopInfo *LI, DominatorTree *DT,
829                     const TargetLibraryInfo *TLI,
830                     const TargetTransformInfo *TTI, AssumptionCache *AC,
831                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
832                     LoopVectorizationLegality *LVL,
833                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
834                     ProfileSummaryInfo *PSI)
835       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
836                             UnrollFactor, LVL, CM, BFI, PSI) {}
837 
838 private:
839   Value *getBroadcastInstrs(Value *V) override;
840   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
841                        Instruction::BinaryOps Opcode =
842                        Instruction::BinaryOpsEnd) override;
843   Value *reverseVector(Value *Vec) override;
844 };
845 
846 } // end namespace llvm
847 
848 /// Look for a meaningful debug location on the instruction or its
849 /// operands.
850 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
851   if (!I)
852     return I;
853 
854   DebugLoc Empty;
855   if (I->getDebugLoc() != Empty)
856     return I;
857 
858   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
859     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
860       if (OpInst->getDebugLoc() != Empty)
861         return OpInst;
862   }
863 
864   return I;
865 }
866 
867 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
868   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
869     const DILocation *DIL = Inst->getDebugLoc();
870     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
871         !isa<DbgInfoIntrinsic>(Inst)) {
872       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
873       if (NewDIL)
874         B.SetCurrentDebugLocation(NewDIL.getValue());
875       else
876         LLVM_DEBUG(dbgs()
877                    << "Failed to create new discriminator: "
878                    << DIL->getFilename() << " Line: " << DIL->getLine() << "\n");
879     }
880     else
881       B.SetCurrentDebugLocation(DIL);
882   } else
883     B.SetCurrentDebugLocation(DebugLoc());
884 }
885 
886 /// Write a record \p DebugMsg about vectorization failure to the debug
887 /// output stream. If \p I is passed, it is an instruction that prevents
888 /// vectorization.
889 #ifndef NDEBUG
890 static void debugVectorizationFailure(const StringRef DebugMsg,
891     Instruction *I) {
892   dbgs() << "LV: Not vectorizing: " << DebugMsg;
893   if (I != nullptr)
894     dbgs() << " " << *I;
895   else
896     dbgs() << '.';
897   dbgs() << '\n';
898 }
899 #endif
900 
901 /// Create an analysis remark that explains why vectorization failed
902 ///
903 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
904 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
905 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
906 /// the location of the remark.  \return the remark object that can be
907 /// streamed to.
908 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
909     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
910   Value *CodeRegion = TheLoop->getHeader();
911   DebugLoc DL = TheLoop->getStartLoc();
912 
913   if (I) {
914     CodeRegion = I->getParent();
915     // If there is no debug location attached to the instruction, fall back to
916     // using the loop's start location.
917     if (I->getDebugLoc())
918       DL = I->getDebugLoc();
919   }
920 
921   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
922   R << "loop not vectorized: ";
923   return R;
924 }
925 
926 namespace llvm {
927 
928 void reportVectorizationFailure(const StringRef DebugMsg,
929     const StringRef OREMsg, const StringRef ORETag,
930     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
931   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
932   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
933   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
934                 ORETag, TheLoop, I) << OREMsg);
935 }
936 
937 } // end namespace llvm
938 
939 #ifndef NDEBUG
940 /// \return string containing a file name and a line # for the given loop.
941 static std::string getDebugLocString(const Loop *L) {
942   std::string Result;
943   if (L) {
944     raw_string_ostream OS(Result);
945     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
946       LoopDbgLoc.print(OS);
947     else
948       // Just print the module name.
949       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
950     OS.flush();
951   }
952   return Result;
953 }
954 #endif
955 
956 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
957                                          const Instruction *Orig) {
958   // If the loop was versioned with memchecks, add the corresponding no-alias
959   // metadata.
960   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
961     LVer->annotateInstWithNoAlias(To, Orig);
962 }
963 
964 void InnerLoopVectorizer::addMetadata(Instruction *To,
965                                       Instruction *From) {
966   propagateMetadata(To, From);
967   addNewMetadata(To, From);
968 }
969 
970 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
971                                       Instruction *From) {
972   for (Value *V : To) {
973     if (Instruction *I = dyn_cast<Instruction>(V))
974       addMetadata(I, From);
975   }
976 }
977 
978 namespace llvm {
979 
980 // Loop vectorization cost-model hints for how the scalar epilogue loop should
981 // be lowered.
982 enum ScalarEpilogueLowering {
983 
984   // The default: allowing scalar epilogues.
985   CM_ScalarEpilogueAllowed,
986 
987   // Vectorization with OptForSize: don't allow epilogues.
988   CM_ScalarEpilogueNotAllowedOptSize,
989 
990   // A special case of vectorization with OptForSize: loops with a very small
991   // trip count are considered for vectorization under OptForSize, thereby
992   // making sure the cost of their loop body is dominant, free of runtime
993   // guards and scalar iteration overheads.
994   CM_ScalarEpilogueNotAllowedLowTripLoop,
995 
996   // Loop hint predicate indicating an epilogue is undesired.
997   CM_ScalarEpilogueNotNeededUsePredicate
998 };
999 
1000 /// LoopVectorizationCostModel - estimates the expected speedups due to
1001 /// vectorization.
1002 /// In many cases vectorization is not profitable. This can happen for a
1003 /// number of reasons. In this class we mainly attempt to predict the
1004 /// expected speedup/slowdowns due to the supported instruction set. We use the
1005 /// TargetTransformInfo to query the different backends for the cost of
1006 /// different operations.
1007 class LoopVectorizationCostModel {
1008 public:
1009   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1010                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1011                              LoopVectorizationLegality *Legal,
1012                              const TargetTransformInfo &TTI,
1013                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1014                              AssumptionCache *AC,
1015                              OptimizationRemarkEmitter *ORE, const Function *F,
1016                              const LoopVectorizeHints *Hints,
1017                              InterleavedAccessInfo &IAI)
1018       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1019         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1020         Hints(Hints), InterleaveInfo(IAI) {}
1021 
1022   /// \return An upper bound for the vectorization factor, or None if
1023   /// vectorization and interleaving should be avoided up front.
1024   Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
1025 
1026   /// \return True if runtime checks are required for vectorization, and false
1027   /// otherwise.
1028   bool runtimeChecksRequired();
1029 
1030   /// \return The most profitable vectorization factor and the cost of that VF.
1031   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1032   /// then this vectorization factor will be selected if vectorization is
1033   /// possible.
1034   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1035 
1036   /// Setup cost-based decisions for user vectorization factor.
1037   void selectUserVectorizationFactor(unsigned UserVF) {
1038     collectUniformsAndScalars(UserVF);
1039     collectInstsToScalarize(UserVF);
1040   }
1041 
1042   /// \return The size (in bits) of the smallest and widest types in the code
1043   /// that needs to be vectorized. We ignore values that remain scalar such as
1044   /// 64 bit loop indices.
1045   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1046 
1047   /// \return The desired interleave count.
1048   /// If interleave count has been specified by metadata it will be returned.
1049   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1050   /// are the selected vectorization factor and the cost of the selected VF.
1051   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1052 
1053   /// A memory access instruction may be vectorized in more than one way.
1054   /// The form of the instruction after vectorization depends on its cost.
1055   /// This function makes cost-based decisions for Load/Store instructions
1056   /// and collects them in a map. This decision map is used for building
1057   /// the lists of loop-uniform and loop-scalar instructions.
1058   /// The calculated cost is saved with the widening decision in order to
1059   /// avoid redundant calculations.
1060   void setCostBasedWideningDecision(unsigned VF);
1061 
1062   /// A struct that represents some properties of the register usage
1063   /// of a loop.
1064   struct RegisterUsage {
1065     /// Holds the number of loop invariant values that are used in the loop.
1066     /// The key is ClassID of target-provided register class.
1067     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1068     /// Holds the maximum number of concurrent live intervals in the loop.
1069     /// The key is ClassID of target-provided register class.
1070     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1071   };
1072 
1073   /// \return Returns information about the register usages of the loop for the
1074   /// given vectorization factors.
1075   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1076 
1077   /// Collect values we want to ignore in the cost model.
1078   void collectValuesToIgnore();
1079 
1080   /// Split reductions into those that happen in the loop, and those that happen
1081   /// outside. In-loop reductions are collected into InLoopReductionChains.
1082   void collectInLoopReductions();
1083 
1084   /// \returns The smallest bitwidth each instruction can be represented with.
1085   /// The vector equivalents of these instructions should be truncated to this
1086   /// type.
1087   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1088     return MinBWs;
1089   }
1090 
1091   /// \returns True if it is more profitable to scalarize instruction \p I for
1092   /// vectorization factor \p VF.
1093   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1094     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1095 
1096     // Cost model is not run in the VPlan-native path - return conservative
1097     // result until this changes.
1098     if (EnableVPlanNativePath)
1099       return false;
1100 
1101     auto Scalars = InstsToScalarize.find(VF);
1102     assert(Scalars != InstsToScalarize.end() &&
1103            "VF not yet analyzed for scalarization profitability");
1104     return Scalars->second.find(I) != Scalars->second.end();
1105   }
1106 
1107   /// Returns true if \p I is known to be uniform after vectorization.
1108   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1109     if (VF == 1)
1110       return true;
1111 
1112     // Cost model is not run in the VPlan-native path - return conservative
1113     // result until this changes.
1114     if (EnableVPlanNativePath)
1115       return false;
1116 
1117     auto UniformsPerVF = Uniforms.find(VF);
1118     assert(UniformsPerVF != Uniforms.end() &&
1119            "VF not yet analyzed for uniformity");
1120     return UniformsPerVF->second.count(I);
1121   }
1122 
1123   /// Returns true if \p I is known to be scalar after vectorization.
1124   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1125     if (VF == 1)
1126       return true;
1127 
1128     // Cost model is not run in the VPlan-native path - return conservative
1129     // result until this changes.
1130     if (EnableVPlanNativePath)
1131       return false;
1132 
1133     auto ScalarsPerVF = Scalars.find(VF);
1134     assert(ScalarsPerVF != Scalars.end() &&
1135            "Scalar values are not calculated for VF");
1136     return ScalarsPerVF->second.count(I);
1137   }
1138 
1139   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1140   /// for vectorization factor \p VF.
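  /// For example, if an i32 add in the loop is known to only need its low 8
  /// bits, its vector form can be computed on <VF x i8> instead of <VF x i32>
  /// (a sketch; MinBWs records the required bitwidth per instruction).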
1141   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1142     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1143            !isProfitableToScalarize(I, VF) &&
1144            !isScalarAfterVectorization(I, VF);
1145   }
1146 
1147   /// Decision that was taken during cost calculation for memory instruction.
1148   enum InstWidening {
1149     CM_Unknown,
1150     CM_Widen,         // For consecutive accesses with stride +1.
1151     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1152     CM_Interleave,
1153     CM_GatherScatter,
1154     CM_Scalarize
1155   };
1156 
1157   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1158   /// instruction \p I and vector width \p VF.
1159   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1160                            unsigned Cost) {
1161     assert(VF >= 2 && "Expected VF >=2");
1162     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1163   }
1164 
1165   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1166   /// interleaving group \p Grp and vector width \p VF.
1167   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1168                            InstWidening W, unsigned Cost) {
1169     assert(VF >= 2 && "Expected VF >=2");
1170     /// Broadcast this decision to all instructions inside the group.
1171     /// But the cost will be assigned to one instruction only.
1172     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1173       if (auto *I = Grp->getMember(i)) {
1174         if (Grp->getInsertPos() == I)
1175           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1176         else
1177           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1178       }
1179     }
1180   }
1181 
1182   /// Return the cost model decision for the given instruction \p I and vector
1183   /// width \p VF. Return CM_Unknown if this instruction did not pass
1184   /// through the cost modeling.
1185   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1186     assert(VF >= 2 && "Expected VF >=2");
1187 
1188     // Cost model is not run in the VPlan-native path - return conservative
1189     // result until this changes.
1190     if (EnableVPlanNativePath)
1191       return CM_GatherScatter;
1192 
1193     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1194     auto Itr = WideningDecisions.find(InstOnVF);
1195     if (Itr == WideningDecisions.end())
1196       return CM_Unknown;
1197     return Itr->second.first;
1198   }
1199 
1200   /// Return the vectorization cost for the given instruction \p I and vector
1201   /// width \p VF.
1202   unsigned getWideningCost(Instruction *I, unsigned VF) {
1203     assert(VF >= 2 && "Expected VF >=2");
1204     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1205     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1206            "The cost is not calculated");
1207     return WideningDecisions[InstOnVF].second;
1208   }
1209 
1210   /// Return True if instruction \p I is an optimizable truncate whose operand
1211   /// is an induction variable. Such a truncate will be removed by adding a new
1212   /// induction variable with the destination type.
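  /// For example (a sketch): a 'trunc i64 %iv to i32' of an induction variable
  /// can be made redundant by introducing a new i32 induction variable that
  /// produces the truncated values directly.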
1213   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1214     // If the instruction is not a truncate, return false.
1215     auto *Trunc = dyn_cast<TruncInst>(I);
1216     if (!Trunc)
1217       return false;
1218 
1219     // Get the source and destination types of the truncate.
1220     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1221     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1222 
1223     // If the truncate is free for the given types, return false. Replacing a
1224     // free truncate with an induction variable would add an induction variable
1225     // update instruction to each iteration of the loop. We exclude from this
1226     // check the primary induction variable since it will need an update
1227     // instruction regardless.
1228     Value *Op = Trunc->getOperand(0);
1229     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1230       return false;
1231 
1232     // If the truncated value is not an induction variable, return false.
1233     return Legal->isInductionPhi(Op);
1234   }
1235 
1236   /// Collects the instructions to scalarize for each predicated instruction in
1237   /// the loop.
1238   void collectInstsToScalarize(unsigned VF);
1239 
1240   /// Collect Uniform and Scalar values for the given \p VF.
1241   /// The sets depend on CM decision for Load/Store instructions
1242   /// that may be vectorized as interleave, gather-scatter or scalarized.
1243   void collectUniformsAndScalars(unsigned VF) {
1244     // Do the analysis once.
1245     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1246       return;
1247     setCostBasedWideningDecision(VF);
1248     collectLoopUniforms(VF);
1249     collectLoopScalars(VF);
1250   }
1251 
1252   /// Returns true if the target machine supports masked store operation
1253   /// for the given \p DataType and kind of access to \p Ptr.
1254   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1255     return Legal->isConsecutivePtr(Ptr) &&
1256            TTI.isLegalMaskedStore(DataType, Alignment);
1257   }
1258 
1259   /// Returns true if the target machine supports masked load operation
1260   /// for the given \p DataType and kind of access to \p Ptr.
1261   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1262     return Legal->isConsecutivePtr(Ptr) &&
1263            TTI.isLegalMaskedLoad(DataType, Alignment);
1264   }
1265 
1266   /// Returns true if the target machine supports masked scatter operation
1267   /// for the given \p DataType.
1268   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1269     return TTI.isLegalMaskedScatter(DataType, Alignment);
1270   }
1271 
1272   /// Returns true if the target machine supports masked gather operation
1273   /// for the given \p DataType.
1274   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1275     return TTI.isLegalMaskedGather(DataType, Alignment);
1276   }
1277 
1278   /// Returns true if the target machine can represent \p V as a masked gather
1279   /// or scatter operation.
1280   bool isLegalGatherOrScatter(Value *V) {
1281     bool LI = isa<LoadInst>(V);
1282     bool SI = isa<StoreInst>(V);
1283     if (!LI && !SI)
1284       return false;
1285     auto *Ty = getMemInstValueType(V);
1286     Align Align = getLoadStoreAlignment(V);
1287     return (LI && isLegalMaskedGather(Ty, Align)) ||
1288            (SI && isLegalMaskedScatter(Ty, Align));
1289   }
1290 
1291   /// Returns true if \p I is an instruction that will be scalarized with
1292   /// predication. Such instructions include conditional stores and
1293   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be scalarized
  /// with predication for that VF.
1296   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1297 
1298   // Returns true if \p I is an instruction that will be predicated either
1299   // through scalar predication or masked load/store or masked gather/scatter.
1300   // Superset of instructions that return true for isScalarWithPredication.
1301   bool isPredicatedInst(Instruction *I) {
1302     if (!blockNeedsPredication(I->getParent()))
1303       return false;
1304     // Loads and stores that need some form of masked operation are predicated
1305     // instructions.
1306     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1307       return Legal->isMaskRequired(I);
1308     return isScalarWithPredication(I);
1309   }
1310 
1311   /// Returns true if \p I is a memory instruction with consecutive memory
1312   /// access that can be widened.
1313   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1314 
1315   /// Returns true if \p I is a memory instruction in an interleaved-group
1316   /// of memory accesses that can be vectorized with wide vector loads/stores
1317   /// and shuffles.
1318   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1319 
1320   /// Check if \p Instr belongs to any interleaved access group.
1321   bool isAccessInterleaved(Instruction *Instr) {
1322     return InterleaveInfo.isInterleaved(Instr);
1323   }
1324 
1325   /// Get the interleaved access group that \p Instr belongs to.
1326   const InterleaveGroup<Instruction> *
1327   getInterleavedAccessGroup(Instruction *Instr) {
1328     return InterleaveInfo.getInterleaveGroup(Instr);
1329   }
1330 
1331   /// Returns true if an interleaved group requires a scalar iteration
1332   /// to handle accesses with gaps, and there is nothing preventing us from
1333   /// creating a scalar epilogue.
1334   bool requiresScalarEpilogue() const {
1335     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1336   }
1337 
1338   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1339   /// loop hint annotation.
1340   bool isScalarEpilogueAllowed() const {
1341     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1342   }
1343 
1344   /// Returns true if all loop blocks should be masked to fold tail loop.
1345   bool foldTailByMasking() const { return FoldTailByMasking; }
1346 
1347   bool blockNeedsPredication(BasicBlock *BB) {
1348     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1349   }
1350 
1351   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1352   /// nodes to the chain of instructions representing the reductions. Uses a
1353   /// MapVector to ensure deterministic iteration order.
1354   using ReductionChainMap =
1355       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1356 
1357   /// Return the chain of instructions representing an inloop reduction.
1358   const ReductionChainMap &getInLoopReductionChains() const {
1359     return InLoopReductionChains;
1360   }
1361 
1362   /// Returns true if the Phi is part of an inloop reduction.
1363   bool isInLoopReduction(PHINode *Phi) const {
1364     return InLoopReductionChains.count(Phi);
1365   }
1366 
1367   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1368   /// with factor VF.  Return the cost of the instruction, including
1369   /// scalarization overhead if it's needed.
1370   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1371 
1372   /// Estimate cost of a call instruction CI if it were vectorized with factor
1373   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either the vector version isn't available or is too
  /// expensive.
1377   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1378 
1379   /// Invalidates decisions already taken by the cost model.
1380   void invalidateCostModelingDecisions() {
1381     WideningDecisions.clear();
1382     Uniforms.clear();
1383     Scalars.clear();
1384   }
1385 
1386 private:
1387   unsigned NumPredStores = 0;
1388 
1389   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1390   /// than zero. One is returned if vectorization should best be avoided due
1391   /// to cost.
1392   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1393 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1401   using VectorizationCostTy = std::pair<unsigned, bool>;
1402 
1403   /// Returns the expected execution cost. The unit of the cost does
1404   /// not matter because we use the 'cost' units to compare different
1405   /// vector widths. The cost that is returned is *not* normalized by
1406   /// the factor width.
1407   VectorizationCostTy expectedCost(unsigned VF);
1408 
1409   /// Returns the execution time cost of an instruction for a given vector
1410   /// width. Vector width of one means scalar.
1411   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1412 
1413   /// The cost-computation logic from getInstructionCost which provides
1414   /// the vector type as an output parameter.
1415   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1416 
1417   /// Calculate vectorization cost of memory instruction \p I.
1418   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1419 
1420   /// The cost computation for scalarized memory instruction.
1421   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1422 
1423   /// The cost computation for interleaving group of memory instructions.
1424   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1425 
1426   /// The cost computation for Gather/Scatter instruction.
1427   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1428 
1429   /// The cost computation for widening instruction \p I with consecutive
1430   /// memory access.
1431   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1432 
  /// The cost calculation for a Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (0 if a loop-invariant value is stored, otherwise
  /// the extract of the last element).
1437   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1438 
1439   /// Estimate the overhead of scalarizing an instruction. This is a
1440   /// convenience wrapper for the type-based getScalarizationOverhead API.
1441   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1442 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1445   bool isConsecutiveLoadOrStore(Instruction *I);
1446 
1447   /// Returns true if an artificially high cost for emulated masked memrefs
1448   /// should be used.
1449   bool useEmulatedMaskMemRefHack(Instruction *I);
1450 
1451   /// Map of scalar integer values to the smallest bitwidth they can be legally
1452   /// represented as. The vector equivalents of these values should be truncated
1453   /// to this type.
1454   MapVector<Instruction *, uint64_t> MinBWs;
1455 
1456   /// A type representing the costs for instructions if they were to be
1457   /// scalarized rather than vectorized. The entries are Instruction-Cost
1458   /// pairs.
1459   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1460 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1463   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1464 
1465   /// Records whether it is allowed to have the original scalar loop execute at
1466   /// least once. This may be needed as a fallback loop in case runtime
1467   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not divisible by the VF,
1469   /// or as a peel-loop to handle gaps in interleave-groups.
1470   /// Under optsize and when the trip count is very small we don't allow any
1471   /// iterations to execute in the scalar loop.
1472   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1473 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1475   bool FoldTailByMasking = false;
1476 
1477   /// A map holding scalar costs for different vectorization factors. The
1478   /// presence of a cost for an instruction in the mapping indicates that the
1479   /// instruction will be scalarized when vectorizing with the associated
1480   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1481   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1482 
1483   /// Holds the instructions known to be uniform after vectorization.
1484   /// The data is collected per VF.
1485   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1486 
1487   /// Holds the instructions known to be scalar after vectorization.
1488   /// The data is collected per VF.
1489   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1490 
1491   /// Holds the instructions (address computations) that are forced to be
1492   /// scalarized.
1493   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1494 
1495   /// PHINodes of the reductions that should be expanded in-loop along with
1496   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1498   ReductionChainMap InLoopReductionChains;
1499 
1500   /// Returns the expected difference in cost from scalarizing the expression
1501   /// feeding a predicated instruction \p PredInst. The instructions to
1502   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1503   /// non-negative return value implies the expression will be scalarized.
1504   /// Currently, only single-use chains are considered for scalarization.
1505   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1506                               unsigned VF);
1507 
1508   /// Collect the instructions that are uniform after vectorization. An
1509   /// instruction is uniform if we represent it with a single scalar value in
1510   /// the vectorized loop corresponding to each vector iteration. Examples of
1511   /// uniform instructions include pointer operands of consecutive or
1512   /// interleaved memory accesses. Note that although uniformity implies an
1513   /// instruction will be scalar, the reverse is not true. In general, a
1514   /// scalarized instruction will be represented by VF scalar values in the
1515   /// vectorized loop, each corresponding to an iteration of the original
1516   /// scalar loop.
1517   void collectLoopUniforms(unsigned VF);
1518 
1519   /// Collect the instructions that are scalar after vectorization. An
1520   /// instruction is scalar if it is known to be uniform or will be scalarized
1521   /// during vectorization. Non-uniform scalarized instructions will be
1522   /// represented by VF values in the vectorized loop, each corresponding to an
1523   /// iteration of the original scalar loop.
1524   void collectLoopScalars(unsigned VF);
1525 
  /// Keeps cost model vectorization decisions and costs for instructions.
1527   /// Right now it is used for memory instructions only.
1528   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1529                                 std::pair<InstWidening, unsigned>>;
1530 
1531   DecisionList WideningDecisions;
1532 
1533   /// Returns true if \p V is expected to be vectorized and it needs to be
1534   /// extracted.
1535   bool needsExtract(Value *V, unsigned VF) const {
1536     Instruction *I = dyn_cast<Instruction>(V);
1537     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1538       return false;
1539 
1540     // Assume we can vectorize V (and hence we need extraction) if the
1541     // scalars are not computed yet. This can happen, because it is called
1542     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1543     // the scalars are collected. That should be a safe assumption in most
1544     // cases, because we check if the operands have vectorizable types
1545     // beforehand in LoopVectorizationLegality.
1546     return Scalars.find(VF) == Scalars.end() ||
1547            !isScalarAfterVectorization(I, VF);
1548   };
1549 
1550   /// Returns a range containing only operands needing to be extracted.
1551   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1552                                                    unsigned VF) {
1553     return SmallVector<Value *, 4>(make_filter_range(
1554         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1555   }
1556 
1557 public:
1558   /// The loop that we evaluate.
1559   Loop *TheLoop;
1560 
1561   /// Predicated scalar evolution analysis.
1562   PredicatedScalarEvolution &PSE;
1563 
1564   /// Loop Info analysis.
1565   LoopInfo *LI;
1566 
1567   /// Vectorization legality.
1568   LoopVectorizationLegality *Legal;
1569 
1570   /// Vector target information.
1571   const TargetTransformInfo &TTI;
1572 
1573   /// Target Library Info.
1574   const TargetLibraryInfo *TLI;
1575 
1576   /// Demanded bits analysis.
1577   DemandedBits *DB;
1578 
1579   /// Assumption cache.
1580   AssumptionCache *AC;
1581 
1582   /// Interface to emit optimization remarks.
1583   OptimizationRemarkEmitter *ORE;
1584 
1585   const Function *TheFunction;
1586 
1587   /// Loop Vectorize Hint.
1588   const LoopVectorizeHints *Hints;
1589 
  /// The interleaved access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1592   InterleavedAccessInfo &InterleaveInfo;
1593 
1594   /// Values to ignore in the cost model.
1595   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1596 
1597   /// Values to ignore in the cost model when VF > 1.
1598   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1599 };
1600 
1601 } // end namespace llvm
1602 
1603 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1604 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1606 // vector length information is not provided, vectorization is not considered
1607 // explicit. Interleave hints are not allowed either. These limitations will be
1608 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1610 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1611 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1612 // provides *explicit vectorization hints* (LV can bypass legal checks and
1613 // assume that vectorization is legal). However, both hints are implemented
1614 // using the same metadata (llvm.loop.vectorize, processed by
1615 // LoopVectorizeHints). This will be fixed in the future when the native IR
1616 // representation for pragma 'omp simd' is introduced.
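// As an illustrative sketch (not taken from a test), an outer loop such as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (i = 0; i < N; ++i)
//     for (j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// qualifies as explicitly annotated, whereas the same nest without the
// vectorize_width clause (or with an interleave hint) is rejected here.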
1617 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1618                                    OptimizationRemarkEmitter *ORE) {
1619   assert(!OuterLp->empty() && "This is not an outer loop");
1620   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1621 
1622   // Only outer loops with an explicit vectorization hint are supported.
1623   // Unannotated outer loops are ignored.
1624   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1625     return false;
1626 
1627   Function *Fn = OuterLp->getHeader()->getParent();
1628   if (!Hints.allowVectorization(Fn, OuterLp,
1629                                 true /*VectorizeOnlyWhenForced*/)) {
1630     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1631     return false;
1632   }
1633 
1634   if (Hints.getInterleave() > 1) {
1635     // TODO: Interleave support is future work.
1636     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1637                          "outer loops.\n");
1638     Hints.emitRemarkWithHints();
1639     return false;
1640   }
1641 
1642   return true;
1643 }
1644 
1645 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1646                                   OptimizationRemarkEmitter *ORE,
1647                                   SmallVectorImpl<Loop *> &V) {
1648   // Collect inner loops and outer loops without irreducible control flow. For
1649   // now, only collect outer loops that have explicit vectorization hints. If we
1650   // are stress testing the VPlan H-CFG construction, we collect the outermost
1651   // loop of every loop nest.
1652   if (L.empty() || VPlanBuildStressTest ||
1653       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1654     LoopBlocksRPO RPOT(&L);
1655     RPOT.perform(LI);
1656     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1657       V.push_back(&L);
1658       // TODO: Collect inner loops inside marked outer loops in case
1659       // vectorization fails for the outer loop. Do not invoke
1660       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1661       // already known to be reducible. We can use an inherited attribute for
1662       // that.
1663       return;
1664     }
1665   }
1666   for (Loop *InnerL : L)
1667     collectSupportedLoops(*InnerL, LI, ORE, V);
1668 }
1669 
1670 namespace {
1671 
1672 /// The LoopVectorize Pass.
1673 struct LoopVectorize : public FunctionPass {
1674   /// Pass identification, replacement for typeid
1675   static char ID;
1676 
1677   LoopVectorizePass Impl;
1678 
1679   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1680                          bool VectorizeOnlyWhenForced = false)
1681       : FunctionPass(ID),
1682         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1683     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1684   }
1685 
1686   bool runOnFunction(Function &F) override {
1687     if (skipFunction(F))
1688       return false;
1689 
1690     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1691     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1692     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1693     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1694     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1695     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1696     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1697     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1698     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1699     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1700     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1701     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1702     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1703 
1704     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1705         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1706 
1707     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1708                         GetLAA, *ORE, PSI).MadeAnyChange;
1709   }
1710 
1711   void getAnalysisUsage(AnalysisUsage &AU) const override {
1712     AU.addRequired<AssumptionCacheTracker>();
1713     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1714     AU.addRequired<DominatorTreeWrapperPass>();
1715     AU.addRequired<LoopInfoWrapperPass>();
1716     AU.addRequired<ScalarEvolutionWrapperPass>();
1717     AU.addRequired<TargetTransformInfoWrapperPass>();
1718     AU.addRequired<AAResultsWrapperPass>();
1719     AU.addRequired<LoopAccessLegacyAnalysis>();
1720     AU.addRequired<DemandedBitsWrapperPass>();
1721     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1722     AU.addRequired<InjectTLIMappingsLegacy>();
1723 
1724     // We currently do not preserve loopinfo/dominator analyses with outer loop
1725     // vectorization. Until this is addressed, mark these analyses as preserved
1726     // only for non-VPlan-native path.
1727     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1728     if (!EnableVPlanNativePath) {
1729       AU.addPreserved<LoopInfoWrapperPass>();
1730       AU.addPreserved<DominatorTreeWrapperPass>();
1731     }
1732 
1733     AU.addPreserved<BasicAAWrapperPass>();
1734     AU.addPreserved<GlobalsAAWrapperPass>();
1735     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1736   }
1737 };
1738 
1739 } // end anonymous namespace
1740 
1741 //===----------------------------------------------------------------------===//
1742 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1743 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1744 //===----------------------------------------------------------------------===//
1745 
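// Broadcasting turns a scalar into a vector with that scalar in every lane.
// Roughly, for VF = 4 the splat created below looks like (an illustrative
// sketch of what IRBuilder::CreateVectorSplat emits; names hypothetical):
//   %splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
//   %broadcast   = shufflevector <4 x i32> %splatinsert, <4 x i32> undef,
//                                <4 x i32> zeroinitializer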
1746 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1747   // We need to place the broadcast of invariant variables outside the loop,
1748   // but only if it's proven safe to do so. Else, broadcast will be inside
1749   // vector loop body.
1750   Instruction *Instr = dyn_cast<Instruction>(V);
1751   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1752                      (!Instr ||
1753                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1754   // Place the code for broadcasting invariant variables in the new preheader.
1755   IRBuilder<>::InsertPointGuard Guard(Builder);
1756   if (SafeToHoist)
1757     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1758 
1759   // Broadcast the scalar into all locations in the vector.
1760   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1761 
1762   return Shuf;
1763 }
1764 
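// Illustrative shape of the code this creates for an integer IV starting at 0
// with step 1, VF = 4 and UF = 2 (a sketch; the value names follow the ones
// used below):
//   vector.body:
//     %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                                   [ %vec.ind.next, %vector.body ]
//     %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//     %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
// Part 0 uses %vec.ind, part 1 uses %step.add, and the last add feeds the phi.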
1765 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1766     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1767   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1768          "Expected either an induction phi-node or a truncate of it!");
1769   Value *Start = II.getStartValue();
1770 
1771   // Construct the initial value of the vector IV in the vector loop preheader
1772   auto CurrIP = Builder.saveIP();
1773   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1774   if (isa<TruncInst>(EntryVal)) {
1775     assert(Start->getType()->isIntegerTy() &&
1776            "Truncation requires an integer type");
1777     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1778     Step = Builder.CreateTrunc(Step, TruncType);
1779     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1780   }
1781   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1782   Value *SteppedStart =
1783       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1784 
1785   // We create vector phi nodes for both integer and floating-point induction
1786   // variables. Here, we determine the kind of arithmetic we will perform.
1787   Instruction::BinaryOps AddOp;
1788   Instruction::BinaryOps MulOp;
1789   if (Step->getType()->isIntegerTy()) {
1790     AddOp = Instruction::Add;
1791     MulOp = Instruction::Mul;
1792   } else {
1793     AddOp = II.getInductionOpcode();
1794     MulOp = Instruction::FMul;
1795   }
1796 
1797   // Multiply the vectorization factor by the step using integer or
1798   // floating-point arithmetic as appropriate.
1799   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1800   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1801 
1802   // Create a vector splat to use in the induction update.
1803   //
1804   // FIXME: If the step is non-constant, we create the vector splat with
1805   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1806   //        handle a constant vector splat.
1807   Value *SplatVF =
1808       isa<Constant>(Mul)
1809           ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
1810           : Builder.CreateVectorSplat(VF, Mul);
1811   Builder.restoreIP(CurrIP);
1812 
1813   // We may need to add the step a number of times, depending on the unroll
1814   // factor. The last of those goes into the PHI.
1815   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1816                                     &*LoopVectorBody->getFirstInsertionPt());
1817   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1818   Instruction *LastInduction = VecInd;
1819   for (unsigned Part = 0; Part < UF; ++Part) {
1820     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1821 
1822     if (isa<TruncInst>(EntryVal))
1823       addMetadata(LastInduction, EntryVal);
1824     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1825 
1826     LastInduction = cast<Instruction>(addFastMathFlag(
1827         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1828     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1829   }
1830 
1831   // Move the last step to the end of the latch block. This ensures consistent
1832   // placement of all induction updates.
1833   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1834   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1835   auto *ICmp = cast<Instruction>(Br->getCondition());
1836   LastInduction->moveBefore(ICmp);
1837   LastInduction->setName("vec.ind.next");
1838 
1839   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1840   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1841 }
1842 
1843 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1844   return Cost->isScalarAfterVectorization(I, VF) ||
1845          Cost->isProfitableToScalarize(I, VF);
1846 }
1847 
1848 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1849   if (shouldScalarizeInstruction(IV))
1850     return true;
1851   auto isScalarInst = [&](User *U) -> bool {
1852     auto *I = cast<Instruction>(U);
1853     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1854   };
1855   return llvm::any_of(IV->users(), isScalarInst);
1856 }
1857 
1858 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1859     const InductionDescriptor &ID, const Instruction *EntryVal,
1860     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1861   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1862          "Expected either an induction phi-node or a truncate of it!");
1863 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1870   if (isa<TruncInst>(EntryVal))
1871     return;
1872 
1873   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1874   if (Casts.empty())
1875     return;
1876   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1878   // induction update chain itself.
1879   Instruction *CastInst = *Casts.begin();
1880   if (Lane < UINT_MAX)
1881     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1882   else
1883     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1884 }
1885 
1886 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1887   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1888          "Primary induction variable must have an integer type");
1889 
1890   auto II = Legal->getInductionVars().find(IV);
1891   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1892 
1893   auto ID = II->second;
1894   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1895 
1896   // The value from the original loop to which we are mapping the new induction
1897   // variable.
1898   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1899 
1900   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1901 
1902   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1904   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1905     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1906            "Induction step should be loop invariant");
1907     if (PSE.getSE()->isSCEVable(IV->getType())) {
1908       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1909       return Exp.expandCodeFor(Step, Step->getType(),
1910                                LoopVectorPreHeader->getTerminator());
1911     }
1912     return cast<SCEVUnknown>(Step)->getValue();
1913   };
1914 
1915   // The scalar value to broadcast. This is derived from the canonical
1916   // induction variable. If a truncation type is given, truncate the canonical
1917   // induction variable and step. Otherwise, derive these values from the
1918   // induction descriptor.
1919   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1920     Value *ScalarIV = Induction;
1921     if (IV != OldInduction) {
1922       ScalarIV = IV->getType()->isIntegerTy()
1923                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1924                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1925                                           IV->getType());
1926       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1927       ScalarIV->setName("offset.idx");
1928     }
1929     if (Trunc) {
1930       auto *TruncType = cast<IntegerType>(Trunc->getType());
1931       assert(Step->getType()->isIntegerTy() &&
1932              "Truncation requires an integer step");
1933       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1934       Step = Builder.CreateTrunc(Step, TruncType);
1935     }
1936     return ScalarIV;
1937   };
1938 
  // Create the vector values from the scalar IV, for the case where no vector
  // IV is created.
1941   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1942     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1943     for (unsigned Part = 0; Part < UF; ++Part) {
1944       Value *EntryPart =
1945           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1946       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1947       if (Trunc)
1948         addMetadata(EntryPart, Trunc);
1949       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1950     }
1951   };
1952 
1953   // Now do the actual transformations, and start with creating the step value.
1954   Value *Step = CreateStepValue(ID.getStep());
1955   if (VF <= 1) {
1956     Value *ScalarIV = CreateScalarIV(Step);
1957     CreateSplatIV(ScalarIV, Step);
1958     return;
1959   }
1960 
1961   // Determine if we want a scalar version of the induction variable. This is
1962   // true if the induction variable itself is not widened, or if it has at
1963   // least one user in the loop that is not widened.
1964   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1965   if (!NeedsScalarIV) {
1966     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1967     return;
1968   }
1969 
1970   // Try to create a new independent vector induction variable. If we can't
1971   // create the phi node, we will splat the scalar induction variable in each
1972   // loop iteration.
1973   if (!shouldScalarizeInstruction(EntryVal)) {
1974     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1975     Value *ScalarIV = CreateScalarIV(Step);
1976     // Create scalar steps that can be used by instructions we will later
1977     // scalarize. Note that the addition of the scalar steps will not increase
1978     // the number of instructions in the loop in the common case prior to
1979     // InstCombine. We will be trading one vector extract for each scalar step.
1980     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1981     return;
1982   }
1983 
1984   // All IV users are scalar instructions, so only emit a scalar IV, not a
1985   // vectorised IV. Except when we tail-fold, then the splat IV feeds the
1986   // predicate used by the masked loads/stores.
1987   Value *ScalarIV = CreateScalarIV(Step);
1988   if (!Cost->isScalarEpilogueAllowed())
1989     CreateSplatIV(ScalarIV, Step);
1990   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1991 }
1992 
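// Illustrative result of getStepVector (a sketch): for VF = 4, StartIdx = 0
// and an integer step of 2, starting from a splat of %x it produces
//   %induction = add <4 x i32> %splat.x, <i32 0, i32 2, i32 4, i32 6>
// i.e. lane i holds x + (StartIdx + i) * Step.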
1993 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1994                                           Instruction::BinaryOps BinOp) {
1995   // Create and check the types.
1996   auto *ValVTy = cast<VectorType>(Val->getType());
1997   int VLen = ValVTy->getNumElements();
1998 
1999   Type *STy = Val->getType()->getScalarType();
2000   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2001          "Induction Step must be an integer or FP");
2002   assert(Step->getType() == STy && "Step has wrong type");
2003 
2004   SmallVector<Constant *, 8> Indices;
2005 
2006   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
2008     for (int i = 0; i < VLen; ++i)
2009       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2010 
2011     // Add the consecutive indices to the vector value.
2012     Constant *Cv = ConstantVector::get(Indices);
2013     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2014     Step = Builder.CreateVectorSplat(VLen, Step);
2015     assert(Step->getType() == Val->getType() && "Invalid step vec");
2016     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2017     // which can be found from the original scalar operations.
2018     Step = Builder.CreateMul(Cv, Step);
2019     return Builder.CreateAdd(Val, Step, "induction");
2020   }
2021 
2022   // Floating point induction.
2023   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2024          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
2026   for (int i = 0; i < VLen; ++i)
2027     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2028 
2029   // Add the consecutive indices to the vector value.
2030   Constant *Cv = ConstantVector::get(Indices);
2031 
2032   Step = Builder.CreateVectorSplat(VLen, Step);
2033 
2034   // Floating point operations had to be 'fast' to enable the induction.
2035   FastMathFlags Flags;
2036   Flags.setFast();
2037 
2038   Value *MulOp = Builder.CreateFMul(Cv, Step);
2039   if (isa<Instruction>(MulOp))
    // Have to check: MulOp may be a constant.
2041     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2042 
2043   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2044   if (isa<Instruction>(BOp))
2045     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2046   return BOp;
2047 }
2048 
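// Illustrative output of buildScalarSteps (a sketch): for VF = 4, UF = 1 and
// an integer step of 1, the scalar copies recorded for EntryVal are
//   lane 0: %scalar.iv + 0
//   lane 1: %scalar.iv + 1
//   lane 2: %scalar.iv + 2
//   lane 3: %scalar.iv + 3
// (only lane 0 is generated when EntryVal is uniform after vectorization).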
2049 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2050                                            Instruction *EntryVal,
2051                                            const InductionDescriptor &ID) {
2052   // We shouldn't have to build scalar steps if we aren't vectorizing.
2053   assert(VF > 1 && "VF should be greater than one");
2054 
  // Get the value type and ensure it and the step have the same type.
2056   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2057   assert(ScalarIVTy == Step->getType() &&
2058          "Val and Step should have the same type");
2059 
2060   // We build scalar steps for both integer and floating-point induction
2061   // variables. Here, we determine the kind of arithmetic we will perform.
2062   Instruction::BinaryOps AddOp;
2063   Instruction::BinaryOps MulOp;
2064   if (ScalarIVTy->isIntegerTy()) {
2065     AddOp = Instruction::Add;
2066     MulOp = Instruction::Mul;
2067   } else {
2068     AddOp = ID.getInductionOpcode();
2069     MulOp = Instruction::FMul;
2070   }
2071 
2072   // Determine the number of scalars we need to generate for each unroll
2073   // iteration. If EntryVal is uniform, we only need to generate the first
2074   // lane. Otherwise, we generate all VF values.
2075   unsigned Lanes =
2076       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
2077                                                                          : VF;
2078   // Compute the scalar steps and save the results in VectorLoopValueMap.
2079   for (unsigned Part = 0; Part < UF; ++Part) {
2080     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2081       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
2082       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2083       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2084       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2085       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2086     }
2087   }
2088 }
2089 
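// If a value only exists in scalarized form, its vector form is rebuilt on
// demand: uniform values are broadcast from lane 0, everything else is packed
// with insertelement, e.g. for VF = 4 (an illustrative sketch):
//   %p0 = insertelement <4 x i32> undef, i32 %v.lane0, i32 0
//   %p1 = insertelement <4 x i32> %p0,   i32 %v.lane1, i32 1
//   %p2 = insertelement <4 x i32> %p1,   i32 %v.lane2, i32 2
//   %p3 = insertelement <4 x i32> %p2,   i32 %v.lane3, i32 3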
2090 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2091   assert(V != Induction && "The new induction variable should not be used.");
2092   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2093   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2094 
2095   // If we have a stride that is replaced by one, do it here. Defer this for
2096   // the VPlan-native path until we start running Legal checks in that path.
2097   if (!EnableVPlanNativePath && Legal->hasStride(V))
2098     V = ConstantInt::get(V->getType(), 1);
2099 
2100   // If we have a vector mapped to this value, return it.
2101   if (VectorLoopValueMap.hasVectorValue(V, Part))
2102     return VectorLoopValueMap.getVectorValue(V, Part);
2103 
2104   // If the value has not been vectorized, check if it has been scalarized
2105   // instead. If it has been scalarized, and we actually need the value in
2106   // vector form, we will construct the vector values on demand.
2107   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2108     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2109 
2110     // If we've scalarized a value, that value should be an instruction.
2111     auto *I = cast<Instruction>(V);
2112 
2113     // If we aren't vectorizing, we can just copy the scalar map values over to
2114     // the vector map.
2115     if (VF == 1) {
2116       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2117       return ScalarValue;
2118     }
2119 
2120     // Get the last scalar instruction we generated for V and Part. If the value
2121     // is known to be uniform after vectorization, this corresponds to lane zero
2122     // of the Part unroll iteration. Otherwise, the last instruction is the one
2123     // we created for the last vector lane of the Part unroll iteration.
2124     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2125     auto *LastInst = cast<Instruction>(
2126         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2127 
2128     // Set the insert point after the last scalarized instruction. This ensures
2129     // the insertelement sequence will directly follow the scalar definitions.
2130     auto OldIP = Builder.saveIP();
2131     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2132     Builder.SetInsertPoint(&*NewIP);
2133 
2134     // However, if we are vectorizing, we need to construct the vector values.
2135     // If the value is known to be uniform after vectorization, we can just
2136     // broadcast the scalar value corresponding to lane zero for each unroll
2137     // iteration. Otherwise, we construct the vector values using insertelement
2138     // instructions. Since the resulting vectors are stored in
2139     // VectorLoopValueMap, we will only generate the insertelements once.
2140     Value *VectorValue = nullptr;
2141     if (Cost->isUniformAfterVectorization(I, VF)) {
2142       VectorValue = getBroadcastInstrs(ScalarValue);
2143       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2144     } else {
2145       // Initialize packing with insertelements to start from undef.
2146       Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF));
2147       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2148       for (unsigned Lane = 0; Lane < VF; ++Lane)
2149         packScalarIntoVectorValue(V, {Part, Lane});
2150       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2151     }
2152     Builder.restoreIP(OldIP);
2153     return VectorValue;
2154   }
2155 
2156   // If this scalar is unknown, assume that it is a constant or that it is
2157   // loop invariant. Broadcast V and save the value for future uses.
2158   Value *B = getBroadcastInstrs(V);
2159   VectorLoopValueMap.setVectorValue(V, Part, B);
2160   return B;
2161 }
2162 
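// Conversely, if only a vector form exists, the requested scalar is extracted
// from the corresponding lane, e.g. for Instance = {Part 0, Lane 2} with
// VF = 4 (an illustrative sketch):
//   %scalar = extractelement <4 x i32> %vec.part0, i32 2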
2163 Value *
2164 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2165                                             const VPIteration &Instance) {
2166   // If the value is not an instruction contained in the loop, it should
2167   // already be scalar.
2168   if (OrigLoop->isLoopInvariant(V))
2169     return V;
2170 
2171   assert(Instance.Lane > 0
2172              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2173              : true && "Uniform values only have lane zero");
2174 
2175   // If the value from the original loop has not been vectorized, it is
2176   // represented by UF x VF scalar values in the new loop. Return the requested
2177   // scalar value.
2178   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2179     return VectorLoopValueMap.getScalarValue(V, Instance);
2180 
2181   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2182   // for the given unroll part. If this entry is not a vector type (i.e., the
2183   // vectorization factor is one), there is no need to generate an
2184   // extractelement instruction.
2185   auto *U = getOrCreateVectorValue(V, Instance.Part);
2186   if (!U->getType()->isVectorTy()) {
2187     assert(VF == 1 && "Value not scalarized has non-vector type");
2188     return U;
2189   }
2190 
2191   // Otherwise, the value from the original loop has been vectorized and is
2192   // represented by UF vector values. Extract and return the requested scalar
2193   // value from the appropriate vector lane.
2194   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2195 }
2196 
2197 void InnerLoopVectorizer::packScalarIntoVectorValue(
2198     Value *V, const VPIteration &Instance) {
2199   assert(V != Induction && "The new induction variable should not be used.");
2200   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2201   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2202 
2203   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2204   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2205   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2206                                             Builder.getInt32(Instance.Lane));
2207   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2208 }
2209 
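// Reversing is a single shuffle with a descending mask, e.g. for VF = 4
// (an illustrative sketch):
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>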
2210 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2211   assert(Vec->getType()->isVectorTy() && "Invalid type");
2212   SmallVector<int, 8> ShuffleMask;
2213   for (unsigned i = 0; i < VF; ++i)
2214     ShuffleMask.push_back(VF - i - 1);
2215 
2216   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2217                                      ShuffleMask, "reverse");
2218 }
2219 
2220 // Return whether we allow using masked interleave-groups (for dealing with
2221 // strided loads/stores that reside in predicated blocks, or for dealing
2222 // with gaps).
2223 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2224   // If an override option has been passed in for interleaved accesses, use it.
2225   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2226     return EnableMaskedInterleavedMemAccesses;
2227 
2228   return TTI.enableMaskedInterleavedAccessVectorization();
2229 }
2230 
2231 // Try to vectorize the interleave group that \p Instr belongs to.
2232 //
2233 // E.g. Translate following interleaved load group (factor = 3):
2234 //   for (i = 0; i < N; i+=3) {
2235 //     R = Pic[i];             // Member of index 0
2236 //     G = Pic[i+1];           // Member of index 1
2237 //     B = Pic[i+2];           // Member of index 2
2238 //     ... // do something to R, G, B
2239 //   }
2240 // To:
2241 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2242 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2243 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2244 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2245 //
2246 // Or translate following interleaved store group (factor = 3):
2247 //   for (i = 0; i < N; i+=3) {
2248 //     ... do something to R, G, B
2249 //     Pic[i]   = R;           // Member of index 0
2250 //     Pic[i+1] = G;           // Member of index 1
2251 //     Pic[i+2] = B;           // Member of index 2
2252 //   }
2253 // To:
2254 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2255 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2256 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2257 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2258 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2259 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2260     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2261     VPValue *Addr, VPValue *BlockInMask) {
2262   Instruction *Instr = Group->getInsertPos();
2263   const DataLayout &DL = Instr->getModule()->getDataLayout();
2264 
2265   // Prepare for the vector type of the interleaved load/store.
2266   Type *ScalarTy = getMemInstValueType(Instr);
2267   unsigned InterleaveFactor = Group->getFactor();
2268   auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF);
2269 
2270   // Prepare for the new pointers.
2271   SmallVector<Value *, 2> AddrParts;
2272   unsigned Index = Group->getIndex(Instr);
2273 
2274   // TODO: extend the masked interleaved-group support to reversed access.
2275   assert((!BlockInMask || !Group->isReverse()) &&
2276          "Reversed masked interleave-group not supported.");
2277 
2278   // If the group is reverse, adjust the index to refer to the last vector lane
2279   // instead of the first. We adjust the index from the first vector lane,
2280   // rather than directly getting the pointer for lane VF - 1, because the
2281   // pointer operand of the interleaved access is supposed to be uniform. For
2282   // uniform instructions, we're only required to generate a value for the
2283   // first vector lane in each unroll iteration.
2284   if (Group->isReverse())
2285     Index += (VF - 1) * Group->getFactor();
2286 
2287   for (unsigned Part = 0; Part < UF; Part++) {
2288     Value *AddrPart = State.get(Addr, {Part, 0});
2289     setDebugLocFromInst(Builder, AddrPart);
2290 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2302 
2303     bool InBounds = false;
2304     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2305       InBounds = gep->isInBounds();
2306     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2307     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2308 
2309     // Cast to the vector pointer type.
2310     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2311     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2312     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2313   }
2314 
2315   setDebugLocFromInst(Builder, Instr);
2316   Value *UndefVec = UndefValue::get(VecTy);
2317 
2318   Value *MaskForGaps = nullptr;
2319   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2320     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2321     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2322   }
2323 
2324   // Vectorize the interleaved load group.
2325   if (isa<LoadInst>(Instr)) {
2326     // For each unroll part, create a wide load for the group.
2327     SmallVector<Value *, 2> NewLoads;
2328     for (unsigned Part = 0; Part < UF; Part++) {
2329       Instruction *NewLoad;
2330       if (BlockInMask || MaskForGaps) {
2331         assert(useMaskedInterleavedAccesses(*TTI) &&
2332                "masked interleaved groups are not allowed.");
2333         Value *GroupMask = MaskForGaps;
2334         if (BlockInMask) {
2335           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2336           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2337           Value *ShuffledMask = Builder.CreateShuffleVector(
2338               BlockInMaskPart, Undefs,
2339               createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");
2340           GroupMask = MaskForGaps
2341                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2342                                                 MaskForGaps)
2343                           : ShuffledMask;
2344         }
2345         NewLoad =
2346             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2347                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
2352       Group->addMetadata(NewLoad);
2353       NewLoads.push_back(NewLoad);
2354     }
2355 
2356     // For each member in the group, shuffle out the appropriate data from the
2357     // wide loads.
2358     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2359       Instruction *Member = Group->getMember(I);
2360 
2361       // Skip the gaps in the group.
2362       if (!Member)
2363         continue;
2364 
2365       auto StrideMask = createStrideMask(I, InterleaveFactor, VF);
2366       for (unsigned Part = 0; Part < UF; Part++) {
2367         Value *StridedVec = Builder.CreateShuffleVector(
2368             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2369 
        // If this member has a different type, cast the result to that type.
2371         if (Member->getType() != ScalarTy) {
2372           VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF);
2373           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2374         }
2375 
2376         if (Group->isReverse())
2377           StridedVec = reverseVector(StridedVec);
2378 
2379         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2380       }
2381     }
2382     return;
2383   }
2384 
  // The subvector type for the current instruction.
2386   auto *SubVT = FixedVectorType::get(ScalarTy, VF);
2387 
2388   // Vectorize the interleaved store group.
2389   for (unsigned Part = 0; Part < UF; Part++) {
2390     // Collect the stored vector from each member.
2391     SmallVector<Value *, 4> StoredVecs;
2392     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2394       Instruction *Member = Group->getMember(i);
2395       assert(Member && "Fail to get a member from an interleaved store group");
2396 
2397       Value *StoredVec = getOrCreateVectorValue(
2398           cast<StoreInst>(Member)->getValueOperand(), Part);
2399       if (Group->isReverse())
2400         StoredVec = reverseVector(StoredVec);
2401 
      // If this member has a different type, cast it to a unified type.
2404       if (StoredVec->getType() != SubVT)
2405         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2406 
2407       StoredVecs.push_back(StoredVec);
2408     }
2409 
2410     // Concatenate all vectors into a wide vector.
2411     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2412 
2413     // Interleave the elements in the wide vector.
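    // As a sketch for VF = 4 and an interleave factor of 2, the concatenated
    // vector <A0, A1, A2, A3, B0, B1, B2, B3> is shuffled with the interleave
    // mask <0, 4, 1, 5, 2, 6, 3, 7> to produce
    //   interleaved.vec = <A0, B0, A1, B1, A2, B2, A3, B3>.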
2414     Value *IVec = Builder.CreateShuffleVector(
2415         WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor),
2416         "interleaved.vec");
2417 
2418     Instruction *NewStoreInstr;
2419     if (BlockInMask) {
2420       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2421       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2422       Value *ShuffledMask = Builder.CreateShuffleVector(
2423           BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF),
2424           "interleaved.mask");
2425       NewStoreInstr = Builder.CreateMaskedStore(
2426           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2427     }
2428     else
2429       NewStoreInstr =
2430           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2431 
2432     Group->addMetadata(NewStoreInstr);
2433   }
2434 }
2435 
2436 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2437                                                      VPTransformState &State,
2438                                                      VPValue *Addr,
2439                                                      VPValue *StoredValue,
2440                                                      VPValue *BlockInMask) {
  // Attempt to issue a wide load or store.
2442   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2443   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2444 
2445   assert((LI || SI) && "Invalid Load/Store instruction");
2446   assert((!SI || StoredValue) && "No stored value provided for widened store");
2447   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2448 
2449   LoopVectorizationCostModel::InstWidening Decision =
2450       Cost->getWideningDecision(Instr, VF);
2451   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2452           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2453           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2454          "CM decision is not to widen the memory instruction");
2455 
2456   Type *ScalarDataTy = getMemInstValueType(Instr);
2457   auto *DataTy = FixedVectorType::get(ScalarDataTy, VF);
2458   const Align Alignment = getLoadStoreAlignment(Instr);
2459 
2460   // Determine if the pointer operand of the access is either consecutive or
2461   // reverse consecutive.
2462   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2463   bool ConsecutiveStride =
2464       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2465   bool CreateGatherScatter =
2466       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2467 
2468   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2469   // gather/scatter. Otherwise Decision should have been to Scalarize.
2470   assert((ConsecutiveStride || CreateGatherScatter) &&
2471          "The instruction should be scalarized");
2472   (void)ConsecutiveStride;
2473 
2474   VectorParts BlockInMaskParts(UF);
2475   bool isMaskRequired = BlockInMask;
2476   if (isMaskRequired)
2477     for (unsigned Part = 0; Part < UF; ++Part)
2478       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2479 
2480   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2481     // Calculate the pointer for the specific unroll-part.
2482     GetElementPtrInst *PartPtr = nullptr;
2483 
2484     bool InBounds = false;
2485     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2486       InBounds = gep->isInBounds();
2487 
2488     if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide load or store needs to start at the last vector element.
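      // As a sketch with VF = 4: for Part 1 the pointer is first advanced by
      // -Part * VF = -4 elements and then by 1 - VF = -3 more, so the wide
      // access covers elements [-7, -4] relative to Ptr; the loaded or stored
      // vector is reversed separately.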
2491       PartPtr = cast<GetElementPtrInst>(
2492           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2493       PartPtr->setIsInBounds(InBounds);
2494       PartPtr = cast<GetElementPtrInst>(
2495           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2496       PartPtr->setIsInBounds(InBounds);
2497       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2498         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2499     } else {
2500       PartPtr = cast<GetElementPtrInst>(
2501           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2502       PartPtr->setIsInBounds(InBounds);
2503     }
2504 
2505     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2506     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2507   };
2508 
2509   // Handle Stores:
2510   if (SI) {
2511     setDebugLocFromInst(Builder, SI);
2512 
2513     for (unsigned Part = 0; Part < UF; ++Part) {
2514       Instruction *NewSI = nullptr;
2515       Value *StoredVal = State.get(StoredValue, Part);
2516       if (CreateGatherScatter) {
2517         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2518         Value *VectorGep = State.get(Addr, Part);
2519         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2520                                             MaskPart);
2521       } else {
2522         if (Reverse) {
2523           // If we store to reverse consecutive memory locations, then we need
2524           // to reverse the order of elements in the stored value.
2525           StoredVal = reverseVector(StoredVal);
2526           // We don't want to update the value in the map as it might be used in
2527           // another expression. So don't call resetVectorValue(StoredVal).
2528         }
2529         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2530         if (isMaskRequired)
2531           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2532                                             BlockInMaskParts[Part]);
2533         else
2534           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2535       }
2536       addMetadata(NewSI, SI);
2537     }
2538     return;
2539   }
2540 
2541   // Handle loads.
2542   assert(LI && "Must have a load instruction");
2543   setDebugLocFromInst(Builder, LI);
2544   for (unsigned Part = 0; Part < UF; ++Part) {
2545     Value *NewLI;
2546     if (CreateGatherScatter) {
2547       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2548       Value *VectorGep = State.get(Addr, Part);
2549       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2550                                          nullptr, "wide.masked.gather");
2551       addMetadata(NewLI, LI);
2552     } else {
2553       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2554       if (isMaskRequired)
2555         NewLI = Builder.CreateMaskedLoad(
2556             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2557             "wide.masked.load");
2558       else
2559         NewLI =
2560             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2561 
2562       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2563       addMetadata(NewLI, LI);
2564       if (Reverse)
2565         NewLI = reverseVector(NewLI);
2566     }
2567     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2568   }
2569 }
2570 
2571 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2572                                                const VPIteration &Instance,
2573                                                bool IfPredicateInstr,
2574                                                VPTransformState &State) {
2575   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2576 
2577   setDebugLocFromInst(Builder, Instr);
2578 
  // Does this instruction return a value?
2580   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2581 
2582   Instruction *Cloned = Instr->clone();
2583   if (!IsVoidRetTy)
2584     Cloned->setName(Instr->getName() + ".cloned");
2585 
2586   // Replace the operands of the cloned instructions with their scalar
2587   // equivalents in the new loop.
2588   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2589     auto *NewOp = State.get(User.getOperand(op), Instance);
2590     Cloned->setOperand(op, NewOp);
2591   }
2592   addNewMetadata(Cloned, Instr);
2593 
2594   // Place the cloned scalar in the new loop.
2595   Builder.Insert(Cloned);
2596 
2597   // Add the cloned scalar to the scalar map entry.
2598   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2599 
  // If we just cloned a new assumption, add it to the assumption cache.
2601   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2602     if (II->getIntrinsicID() == Intrinsic::assume)
2603       AC->registerAssumption(II);
2604 
2605   // End if-block.
2606   if (IfPredicateInstr)
2607     PredicatedInstructions.push_back(Cloned);
2608 }
2609 
2610 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2611                                                       Value *End, Value *Step,
2612                                                       Instruction *DL) {
2613   BasicBlock *Header = L->getHeader();
2614   BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible that no latch exists
  // yet. If so, use the header, as this will be a single-block loop.
2617   if (!Latch)
2618     Latch = Header;
2619 
2620   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2621   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2622   setDebugLocFromInst(Builder, OldInst);
2623   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2624 
2625   Builder.SetInsertPoint(Latch->getTerminator());
2626   setDebugLocFromInst(Builder, OldInst);
2627 
2628   // Create i+1 and fill the PHINode.
2629   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2630   Induction->addIncoming(Start, L->getLoopPreheader());
2631   Induction->addIncoming(Next, Latch);
2632   // Create the compare.
2633   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2634   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2635 
2636   // Now we have two terminators. Remove the old one from the block.
2637   Latch->getTerminator()->eraseFromParent();
2638 
2639   return Induction;
2640 }
2641 
2642 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2643   if (TripCount)
2644     return TripCount;
2645 
2646   assert(L && "Create Trip Count for null loop.");
2647   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2648   // Find the loop boundaries.
2649   ScalarEvolution *SE = PSE.getSE();
2650   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2651   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2652          "Invalid loop count");
2653 
2654   Type *IdxTy = Legal->getWidestInductionType();
2655   assert(IdxTy && "No type for induction");
2656 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we can get a backedge-taken count in that case is
  // if the induction variable was signed and hence will not overflow, so the
  // truncation is legal.
2662   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2663       IdxTy->getPrimitiveSizeInBits())
2664     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2665   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2666 
2667   // Get the total trip count from the count by adding 1.
2668   const SCEV *ExitCount = SE->getAddExpr(
2669       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2670 
2671   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2672 
2673   // Expand the trip count and place the new instructions in the preheader.
2674   // Notice that the pre-header does not change, only the loop body.
2675   SCEVExpander Exp(*SE, DL, "induction");
2676 
2677   // Count holds the overall loop count (N).
2678   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2679                                 L->getLoopPreheader()->getTerminator());
2680 
2681   if (TripCount->getType()->isPointerTy())
2682     TripCount =
2683         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2684                                     L->getLoopPreheader()->getTerminator());
2685 
2686   return TripCount;
2687 }
2688 
2689 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2690   if (VectorTripCount)
2691     return VectorTripCount;
2692 
2693   Value *TC = getOrCreateTripCount(L);
2694   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2695 
2696   Type *Ty = TC->getType();
2697   Constant *Step = ConstantInt::get(Ty, VF * UF);
2698 
2699   // If the tail is to be folded by masking, round the number of iterations N
2700   // up to a multiple of Step instead of rounding down. This is done by first
2701   // adding Step-1 and then rounding down. Note that it's ok if this addition
2702   // overflows: the vector induction variable will eventually wrap to zero given
2703   // that it starts at zero and its Step is a power of two; the loop will then
2704   // exit, with the last early-exit vector comparison also producing all-true.
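  // As a small worked example: with a trip count of N = 10 and VF * UF = 4,
  // N is rounded up to 13 here and rounded down to 12 below, so the masked
  // vector loop runs three iterations, with the final two lanes disabled by
  // the mask in the last iteration.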
2705   if (Cost->foldTailByMasking()) {
2706     assert(isPowerOf2_32(VF * UF) &&
2707            "VF*UF must be a power of 2 when folding tail by masking");
2708     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2709   }
2710 
2711   // Now we need to generate the expression for the part of the loop that the
2712   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2713   // iterations are not required for correctness, or N - Step, otherwise. Step
2714   // is equal to the vectorization factor (number of SIMD elements) times the
2715   // unroll factor (number of SIMD instructions).
2716   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2717 
2718   // If there is a non-reversed interleaved group that may speculatively access
2719   // memory out-of-bounds, we need to ensure that there will be at least one
2720   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2721   // the trip count, we set the remainder to be equal to the step. If the step
2722   // does not evenly divide the trip count, no adjustment is necessary since
2723   // there will already be scalar iterations. Note that the minimum iterations
2724   // check ensures that N >= Step.
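  // For example, with N = 12 and Step = 4 the remainder R would be 0; it is
  // bumped to Step so that the vector trip count becomes 8 and four
  // iterations are left for the scalar epilogue.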
2725   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2726     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2727     R = Builder.CreateSelect(IsZero, Step, R);
2728   }
2729 
2730   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2731 
2732   return VectorTripCount;
2733 }
2734 
2735 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2736                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
2738   unsigned VF = DstVTy->getNumElements();
2739   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2740   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2741   Type *SrcElemTy = SrcVecTy->getElementType();
2742   Type *DstElemTy = DstVTy->getElementType();
2743   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2744          "Vector elements must have same size");
2745 
2746   // Do a direct cast if element types are castable.
2747   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2748     return Builder.CreateBitOrPointerCast(V, DstVTy);
2749   }
  // V cannot be directly cast to the desired vector type.
  // This may happen when V is a floating-point vector but DstVTy is a vector
  // of pointers, or vice versa. Handle this with a two-step cast through an
  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
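  // For instance, on a target with 64-bit pointers, a <4 x double> source and
  // a <4 x i8*> destination would be bridged as
  //   <4 x double> -> <4 x i64> -> <4 x i8*>.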
2754   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2755          "Only one type should be a pointer type");
2756   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2757          "Only one type should be a floating point type");
2758   Type *IntTy =
2759       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2760   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2761   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2762   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2763 }
2764 
2765 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2766                                                          BasicBlock *Bypass) {
2767   Value *Count = getOrCreateTripCount(L);
2768   // Reuse existing vector loop preheader for TC checks.
2769   // Note that new preheader block is generated for vector loop.
2770   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2771   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2772 
2773   // Generate code to check if the loop's trip count is less than VF * UF, or
2774   // equal to it in case a scalar epilogue is required; this implies that the
2775   // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed, leading to an incorrect trip count
2777   // of zero. In this case we will also jump to the scalar loop.
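  // For instance, with VF * UF = 8, a trip count of 5 takes the bypass to the
  // scalar loop; when a scalar epilogue is required, a trip count of exactly
  // 8 is also bypassed (via ICMP_ULE), since the vector trip count would be 0.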
2778   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2779                                           : ICmpInst::ICMP_ULT;
2780 
2781   // If tail is to be folded, vector loop takes care of all iterations.
2782   Value *CheckMinIters = Builder.getFalse();
2783   if (!Cost->foldTailByMasking())
2784     CheckMinIters = Builder.CreateICmp(
2785         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2786         "min.iters.check");
2787 
2788   // Create new preheader for vector loop.
2789   LoopVectorPreHeader =
2790       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2791                  "vector.ph");
2792 
2793   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2794                                DT->getNode(Bypass)->getIDom()) &&
2795          "TC check is expected to dominate Bypass");
2796 
2797   // Update dominator for Bypass & LoopExit.
2798   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2799   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2800 
2801   ReplaceInstWithInst(
2802       TCCheckBlock->getTerminator(),
2803       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2804   LoopBypassBlocks.push_back(TCCheckBlock);
2805 }
2806 
2807 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2808   // Reuse existing vector loop preheader for SCEV checks.
2809   // Note that new preheader block is generated for vector loop.
2810   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2811 
  // Generate the code to check the SCEV assumptions that we made.
2813   // We want the new basic block to start at the first instruction in a
2814   // sequence of instructions that form a check.
2815   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2816                    "scev.check");
2817   Value *SCEVCheck = Exp.expandCodeForPredicate(
2818       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2819 
2820   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2821     if (C->isZero())
2822       return;
2823 
2824   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2825            OptForSizeBasedOnProfile) &&
2826          "Cannot SCEV check stride or overflow when optimizing for size");
2827 
2828   SCEVCheckBlock->setName("vector.scevcheck");
2829   // Create new preheader for vector loop.
2830   LoopVectorPreHeader =
2831       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2832                  nullptr, "vector.ph");
2833 
  // Update dominator only if this is the first RT check.
2835   if (LoopBypassBlocks.empty()) {
2836     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2837     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2838   }
2839 
2840   ReplaceInstWithInst(
2841       SCEVCheckBlock->getTerminator(),
2842       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2843   LoopBypassBlocks.push_back(SCEVCheckBlock);
2844   AddedSafetyChecks = true;
2845 }
2846 
2847 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2848   // VPlan-native path does not do any analysis for runtime checks currently.
2849   if (EnableVPlanNativePath)
2850     return;
2851 
2852   // Reuse existing vector loop preheader for runtime memory checks.
2853   // Note that new preheader block is generated for vector loop.
2854   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2855 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2859   auto *LAI = Legal->getLAI();
2860   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2861   if (!RtPtrChecking.Need)
2862     return;
2863   Instruction *FirstCheckInst;
2864   Instruction *MemRuntimeCheck;
2865   std::tie(FirstCheckInst, MemRuntimeCheck) =
2866       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2867                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2868   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2869                             "claimed checks are required");
2870 
2871   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2872     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2873            "Cannot emit memory checks when optimizing for size, unless forced "
2874            "to vectorize.");
2875     ORE->emit([&]() {
2876       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2877                                         L->getStartLoc(), L->getHeader())
2878              << "Code-size may be reduced by not forcing "
2879                 "vectorization, or by source-code modifications "
2880                 "eliminating the need for runtime checks "
2881                 "(e.g., adding 'restrict').";
2882     });
2883   }
2884 
2885   MemCheckBlock->setName("vector.memcheck");
2886   // Create new preheader for vector loop.
2887   LoopVectorPreHeader =
2888       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2889                  "vector.ph");
2890 
  // Update dominator only if this is the first RT check.
2892   if (LoopBypassBlocks.empty()) {
2893     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2894     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2895   }
2896 
2897   ReplaceInstWithInst(
2898       MemCheckBlock->getTerminator(),
2899       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2900   LoopBypassBlocks.push_back(MemCheckBlock);
2901   AddedSafetyChecks = true;
2902 
2903   // We currently don't use LoopVersioning for the actual loop cloning but we
2904   // still use it to add the noalias metadata.
2905   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2906                                           PSE.getSE());
2907   LVer->prepareNoAliasMetadata();
2908 }
2909 
2910 Value *InnerLoopVectorizer::emitTransformedIndex(
2911     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2912     const InductionDescriptor &ID) const {
2913 
2914   SCEVExpander Exp(*SE, DL, "induction");
2915   auto Step = ID.getStep();
2916   auto StartValue = ID.getStartValue();
2917   assert(Index->getType() == Step->getType() &&
2918          "Index type does not match StepValue type");
2919 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
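  // For an integer induction with start %start and step %step, the result is
  // conceptually %start + Index * %step; the helpers below merely avoid
  // emitting the degenerate "add X, 0" and "mul X, 1" forms, since we cannot
  // rely on SCEV to clean them up here.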
2926   auto CreateAdd = [&B](Value *X, Value *Y) {
2927     assert(X->getType() == Y->getType() && "Types don't match!");
2928     if (auto *CX = dyn_cast<ConstantInt>(X))
2929       if (CX->isZero())
2930         return Y;
2931     if (auto *CY = dyn_cast<ConstantInt>(Y))
2932       if (CY->isZero())
2933         return X;
2934     return B.CreateAdd(X, Y);
2935   };
2936 
2937   auto CreateMul = [&B](Value *X, Value *Y) {
2938     assert(X->getType() == Y->getType() && "Types don't match!");
2939     if (auto *CX = dyn_cast<ConstantInt>(X))
2940       if (CX->isOne())
2941         return Y;
2942     if (auto *CY = dyn_cast<ConstantInt>(Y))
2943       if (CY->isOne())
2944         return X;
2945     return B.CreateMul(X, Y);
2946   };
2947 
2948   // Get a suitable insert point for SCEV expansion. For blocks in the vector
2949   // loop, choose the end of the vector loop header (=LoopVectorBody), because
2950   // the DomTree is not kept up-to-date for additional blocks generated in the
2951   // vector loop. By using the header as insertion point, we guarantee that the
2952   // expanded instructions dominate all their uses.
2953   auto GetInsertPoint = [this, &B]() {
2954     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
2955     if (InsertBB != LoopVectorBody &&
2956         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
2957       return LoopVectorBody->getTerminator();
2958     return &*B.GetInsertPoint();
2959   };
2960   switch (ID.getKind()) {
2961   case InductionDescriptor::IK_IntInduction: {
2962     assert(Index->getType() == StartValue->getType() &&
2963            "Index type does not match StartValue type");
2964     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2965       return B.CreateSub(StartValue, Index);
2966     auto *Offset = CreateMul(
2967         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
2968     return CreateAdd(StartValue, Offset);
2969   }
2970   case InductionDescriptor::IK_PtrInduction: {
2971     assert(isa<SCEVConstant>(Step) &&
2972            "Expected constant step for pointer induction");
2973     return B.CreateGEP(
2974         StartValue->getType()->getPointerElementType(), StartValue,
2975         CreateMul(Index,
2976                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
2977   }
2978   case InductionDescriptor::IK_FpInduction: {
2979     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2980     auto InductionBinOp = ID.getInductionBinOp();
2981     assert(InductionBinOp &&
2982            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2983             InductionBinOp->getOpcode() == Instruction::FSub) &&
2984            "Original bin op should be defined for FP induction");
2985 
2986     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2987 
2988     // Floating point operations had to be 'fast' to enable the induction.
2989     FastMathFlags Flags;
2990     Flags.setFast();
2991 
2992     Value *MulExp = B.CreateFMul(StepValue, Index);
2993     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
2995       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2996 
2997     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2998                                "induction");
2999     if (isa<Instruction>(BOp))
3000       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3001 
3002     return BOp;
3003   }
3004   case InductionDescriptor::IK_NoInduction:
3005     return nullptr;
3006   }
3007   llvm_unreachable("invalid enum");
3008 }
3009 
3010 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3011   LoopScalarBody = OrigLoop->getHeader();
3012   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3013   LoopExitBlock = OrigLoop->getExitBlock();
3014   assert(LoopExitBlock && "Must have an exit block");
3015   assert(LoopVectorPreHeader && "Invalid loop structure");
3016 
3017   LoopMiddleBlock =
3018       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3019                  LI, nullptr, Twine(Prefix) + "middle.block");
3020   LoopScalarPreHeader =
3021       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3022                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3026   LoopVectorBody =
3027       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3028                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3029 
3030   // Update dominator for loop exit.
3031   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3032 
3033   // Create and register the new vector loop.
3034   Loop *Lp = LI->AllocateLoop();
3035   Loop *ParentLoop = OrigLoop->getParentLoop();
3036 
3037   // Insert the new loop into the loop nest and register the new basic blocks
3038   // before calling any utilities such as SCEV that require valid LoopInfo.
3039   if (ParentLoop) {
3040     ParentLoop->addChildLoop(Lp);
3041   } else {
3042     LI->addTopLevelLoop(Lp);
3043   }
3044   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3045   return Lp;
3046 }
3047 
3048 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3049                                                       Value *VectorTripCount) {
3050   assert(VectorTripCount && L && "Expected valid arguments");
3051   // We are going to resume the execution of the scalar loop.
3052   // Go over all of the induction variables that we found and fix the
3053   // PHIs that are left in the scalar version of the loop.
3054   // The starting values of PHI nodes depend on the counter of the last
3055   // iteration in the vectorized loop.
3056   // If we come from a bypass edge then we need to start from the original
3057   // start value.
3058   for (auto &InductionEntry : Legal->getInductionVars()) {
3059     PHINode *OrigPhi = InductionEntry.first;
3060     InductionDescriptor II = InductionEntry.second;
3061 
    // Create phi nodes to merge from the backedge-taken check block.
3063     PHINode *BCResumeVal =
3064         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3065                         LoopScalarPreHeader->getTerminator());
3066     // Copy original phi DL over to the new one.
3067     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3068     Value *&EndValue = IVEndValues[OrigPhi];
3069     if (OrigPhi == OldInduction) {
3070       // We know what the end value is.
3071       EndValue = VectorTripCount;
3072     } else {
3073       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3074       Type *StepType = II.getStep()->getType();
3075       Instruction::CastOps CastOp =
3076           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3077       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3078       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3079       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3080       EndValue->setName("ind.end");
3081     }
3082 
3083     // The new PHI merges the original incoming value, in case of a bypass,
3084     // or the value at the end of the vectorized loop.
3085     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3086 
3087     // Fix the scalar body counter (PHI node).
3088     // The old induction's phi node in the scalar body needs the truncated
3089     // value.
3090     for (BasicBlock *BB : LoopBypassBlocks)
3091       BCResumeVal->addIncoming(II.getStartValue(), BB);
3092     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3093   }
3094 }
3095 
3096 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3097                                                       MDNode *OrigLoopID) {
3098   assert(L && "Expected valid loop.");
3099 
3100   // The trip counts should be cached by now.
3101   Value *Count = getOrCreateTripCount(L);
3102   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3103 
3104   // We need the OrigLoop (scalar loop part) latch terminator to help
3105   // produce correct debug info for the middle block BB instructions.
3106   // The legality check stage guarantees that the loop will have a single
3107   // latch.
3108   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3109          "Scalar loop latch terminator isn't a branch");
3110   BranchInst *ScalarLatchBr =
3111       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3112 
3113   // Add a check in the middle block to see if we have completed
3114   // all of the iterations in the first vector loop.
3115   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3116   // If tail is to be folded, we know we don't need to run the remainder.
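  // For example, with a trip count of 10 and VF * UF = 4 the vector trip
  // count is 8, so CmpN is false and the scalar loop runs the remaining two
  // iterations.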
3117   Value *CmpN = Builder.getTrue();
3118   if (!Cost->foldTailByMasking()) {
3119     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3120                            VectorTripCount, "cmp.n",
3121                            LoopMiddleBlock->getTerminator());
3122 
3123     // Here we use the same DebugLoc as the scalar loop latch branch instead
3124     // of the corresponding compare because they may have ended up with
3125     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3127     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3128   }
3129 
3130   BranchInst *BrInst =
3131       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3132   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3133   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3134 
3135   // Get ready to start creating new instructions into the vectorized body.
3136   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3137          "Inconsistent vector loop preheader");
3138   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3139 
3140   Optional<MDNode *> VectorizedLoopID =
3141       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3142                                       LLVMLoopVectorizeFollowupVectorized});
3143   if (VectorizedLoopID.hasValue()) {
3144     L->setLoopID(VectorizedLoopID.getValue());
3145 
3146     // Do not setAlreadyVectorized if loop attributes have been defined
3147     // explicitly.
3148     return LoopVectorPreHeader;
3149   }
3150 
3151   // Keep all loop hints from the original loop on the vector loop (we'll
3152   // replace the vectorizer-specific hints below).
3153   if (MDNode *LID = OrigLoop->getLoopID())
3154     L->setLoopID(LID);
3155 
3156   LoopVectorizeHints Hints(L, true, *ORE);
3157   Hints.setAlreadyVectorized();
3158 
3159 #ifdef EXPENSIVE_CHECKS
3160   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3161   LI->verify(*DT);
3162 #endif
3163 
3164   return LoopVectorPreHeader;
3165 }
3166 
3167 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3168   /*
3169    In this function we generate a new loop. The new loop will contain
3170    the vectorized instructions while the old loop will continue to run the
3171    scalar remainder.
3172 
3173        [ ] <-- loop iteration number check.
3174     /   |
3175    /    v
3176   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3177   |  /  |
3178   | /   v
3179   ||   [ ]     <-- vector pre header.
3180   |/    |
3181   |     v
3182   |    [  ] \
3183   |    [  ]_|   <-- vector loop.
3184   |     |
3185   |     v
3186   |   -[ ]   <--- middle-block.
3187   |  /  |
3188   | /   v
3189   -|- >[ ]     <--- new preheader.
3190    |    |
3191    |    v
3192    |   [ ] \
3193    |   [ ]_|   <-- old scalar loop to handle remainder.
3194     \   |
3195      \  v
3196       >[ ]     <-- exit block.
3197    ...
3198    */
3199 
3200   // Get the metadata of the original loop before it gets modified.
3201   MDNode *OrigLoopID = OrigLoop->getLoopID();
3202 
3203   // Create an empty vector loop, and prepare basic blocks for the runtime
3204   // checks.
3205   Loop *Lp = createVectorLoopSkeleton("");
3206 
3207   // Now, compare the new count to zero. If it is zero skip the vector loop and
3208   // jump to the scalar loop. This check also covers the case where the
3209   // backedge-taken count is uint##_max: adding one to it will overflow leading
3210   // to an incorrect trip count of zero. In this (rare) case we will also jump
3211   // to the scalar loop.
3212   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3213 
3214   // Generate the code to check any assumptions that we've made for SCEV
3215   // expressions.
3216   emitSCEVChecks(Lp, LoopScalarPreHeader);
3217 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3221   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3222 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
  // induction variables. The code below also supports the case where we
  // don't have a single induction variable.
3227   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3230   //   - is an integer
3231   //   - counts from zero, stepping by one
3232   //   - is the size of the widest induction variable type
3233   // then we create a new one.
3234   OldInduction = Legal->getPrimaryInduction();
3235   Type *IdxTy = Legal->getWidestInductionType();
3236   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3237   // The loop step is equal to the vectorization factor (num of SIMD elements)
3238   // times the unroll factor (num of SIMD instructions).
3239   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3240   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3241   Induction =
3242       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3243                               getDebugLocFromInstOrOperands(OldInduction));
3244 
3245   // Emit phis for the new starting index of the scalar loop.
3246   createInductionResumeValues(Lp, CountRoundDown);
3247 
3248   return completeLoopSkeleton(Lp, OrigLoopID);
3249 }
3250 
3251 // Fix up external users of the induction variable. At this point, we are
3252 // in LCSSA form, with all external PHIs that use the IV having one input value,
3253 // coming from the remainder loop. We need those PHIs to also have a correct
3254 // value for the IV when arriving directly from the middle block.
3255 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3256                                        const InductionDescriptor &II,
3257                                        Value *CountRoundDown, Value *EndValue,
3258                                        BasicBlock *MiddleBlock) {
3259   // There are two kinds of external IV usages - those that use the value
3260   // computed in the last iteration (the PHI) and those that use the penultimate
3261   // value (the value that feeds into the phi from the loop latch).
3262   // We allow both, but they, obviously, have different values.
3263 
3264   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3265 
3266   DenseMap<Value *, Value *> MissingVals;
3267 
3268   // An external user of the last iteration's value should see the value that
3269   // the remainder loop uses to initialize its own IV.
3270   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3271   for (User *U : PostInc->users()) {
3272     Instruction *UI = cast<Instruction>(U);
3273     if (!OrigLoop->contains(UI)) {
3274       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3275       MissingVals[UI] = EndValue;
3276     }
3277   }
3278 
  // An external user of the penultimate value needs to see EndValue - Step.
3280   // The simplest way to get this is to recompute it from the constituent SCEVs,
3281   // that is Start + (Step * (CRD - 1)).
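  // E.g., for a canonical induction starting at 0 with step 1 and a vector
  // trip count (CRD) of 8, the penultimate value is 0 + 1 * (8 - 1) = 7.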
3282   for (User *U : OrigPhi->users()) {
3283     auto *UI = cast<Instruction>(U);
3284     if (!OrigLoop->contains(UI)) {
3285       const DataLayout &DL =
3286           OrigLoop->getHeader()->getModule()->getDataLayout();
3287       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3288 
3289       IRBuilder<> B(MiddleBlock->getTerminator());
3290       Value *CountMinusOne = B.CreateSub(
3291           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3292       Value *CMO =
3293           !II.getStep()->getType()->isIntegerTy()
3294               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3295                              II.getStep()->getType())
3296               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3297       CMO->setName("cast.cmo");
3298       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3299       Escape->setName("ind.escape");
3300       MissingVals[UI] = Escape;
3301     }
3302   }
3303 
3304   for (auto &I : MissingVals) {
3305     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3307     // that is %IV2 = phi [...], [ %IV1, %latch ]
3308     // In this case, if IV1 has an external use, we need to avoid adding both
3309     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3310     // don't already have an incoming value for the middle block.
3311     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3312       PHI->addIncoming(I.second, MiddleBlock);
3313   }
3314 }
3315 
3316 namespace {
3317 
3318 struct CSEDenseMapInfo {
3319   static bool canHandle(const Instruction *I) {
3320     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3321            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3322   }
3323 
3324   static inline Instruction *getEmptyKey() {
3325     return DenseMapInfo<Instruction *>::getEmptyKey();
3326   }
3327 
3328   static inline Instruction *getTombstoneKey() {
3329     return DenseMapInfo<Instruction *>::getTombstoneKey();
3330   }
3331 
3332   static unsigned getHashValue(const Instruction *I) {
3333     assert(canHandle(I) && "Unknown instruction!");
3334     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3335                                                            I->value_op_end()));
3336   }
3337 
3338   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3339     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3340         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3341       return LHS == RHS;
3342     return LHS->isIdenticalTo(RHS);
3343   }
3344 };
3345 
3346 } // end anonymous namespace
3347 
/// Perform CSE of induction variable instructions.
3349 static void cse(BasicBlock *BB) {
3350   // Perform simple cse.
3351   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3352   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3353     Instruction *In = &*I++;
3354 
3355     if (!CSEDenseMapInfo::canHandle(In))
3356       continue;
3357 
3358     // Check if we can replace this instruction with any of the
3359     // visited instructions.
3360     if (Instruction *V = CSEMap.lookup(In)) {
3361       In->replaceAllUsesWith(V);
3362       In->eraseFromParent();
3363       continue;
3364     }
3365 
3366     CSEMap[In] = In;
3367   }
3368 }
3369 
3370 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3371                                                        unsigned VF,
3372                                                        bool &NeedToScalarize) {
3373   Function *F = CI->getCalledFunction();
3374   Type *ScalarRetTy = CI->getType();
3375   SmallVector<Type *, 4> Tys, ScalarTys;
3376   for (auto &ArgOp : CI->arg_operands())
3377     ScalarTys.push_back(ArgOp->getType());
3378 
3379   // Estimate cost of scalarized vector call. The source operands are assumed
3380   // to be vectors, so we need to extract individual elements from there,
3381   // execute VF scalar calls, and then gather the result into the vector return
3382   // value.
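  // As a rough illustration (the numbers are arbitrary, not taken from any
  // target): with VF = 4, a scalar call cost of 10 and a scalarization
  // overhead of 6, the scalarized estimate is 10 * 4 + 6 = 46, which is later
  // compared against the cost of a vector call variant, if one is available.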
3383   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3384                                                  TTI::TCK_RecipThroughput);
3385   if (VF == 1)
3386     return ScalarCallCost;
3387 
3388   // Compute corresponding vector type for return value and arguments.
3389   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3390   for (Type *ScalarTy : ScalarTys)
3391     Tys.push_back(ToVectorTy(ScalarTy, VF));
3392 
3393   // Compute costs of unpacking argument values for the scalar calls and
3394   // packing the return values to a vector.
3395   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3396 
3397   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3398 
3399   // If we can't emit a vector call for this function, then the currently found
3400   // cost is the cost we need to return.
3401   NeedToScalarize = true;
3402   VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
3403   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3404 
3405   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3406     return Cost;
3407 
3408   // If the corresponding vector cost is cheaper, return its cost.
3409   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3410                                                  TTI::TCK_RecipThroughput);
3411   if (VectorCallCost < Cost) {
3412     NeedToScalarize = false;
3413     return VectorCallCost;
3414   }
3415   return Cost;
3416 }
3417 
3418 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3419                                                             unsigned VF) {
3420   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3421   assert(ID && "Expected intrinsic call!");
3422 
3423   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3424   return TTI.getIntrinsicInstrCost(CostAttrs,
3425                                    TargetTransformInfo::TCK_RecipThroughput);
3426 }
3427 
3428 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3429   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3430   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3431   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3432 }
3433 
3434 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3435   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3436   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3437   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3438 }
3439 
3440 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3441   // For every instruction `I` in MinBWs, truncate the operands, create a
3442   // truncated version of `I` and reextend its result. InstCombine runs
3443   // later and will remove any ext/trunc pairs.
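  // For instance, if the cost model determined that a widened i32 add only
  // needs 8 bits, its operands are truncated to <VF x i8>, the add is redone
  // on the narrow type, and the result is zero-extended back to <VF x i32>;
  // InstCombine is then expected to fold away any redundant ext/trunc pairs.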
3444   SmallPtrSet<Value *, 4> Erased;
3445   for (const auto &KV : Cost->getMinimalBitwidths()) {
3446     // If the value wasn't vectorized, we must maintain the original scalar
3447     // type. The absence of the value from VectorLoopValueMap indicates that it
3448     // wasn't vectorized.
3449     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3450       continue;
3451     for (unsigned Part = 0; Part < UF; ++Part) {
3452       Value *I = getOrCreateVectorValue(KV.first, Part);
3453       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3454         continue;
3455       Type *OriginalTy = I->getType();
3456       Type *ScalarTruncatedTy =
3457           IntegerType::get(OriginalTy->getContext(), KV.second);
3458       auto *TruncatedTy = FixedVectorType::get(
3459           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
3460       if (TruncatedTy == OriginalTy)
3461         continue;
3462 
3463       IRBuilder<> B(cast<Instruction>(I));
3464       auto ShrinkOperand = [&](Value *V) -> Value * {
3465         if (auto *ZI = dyn_cast<ZExtInst>(V))
3466           if (ZI->getSrcTy() == TruncatedTy)
3467             return ZI->getOperand(0);
3468         return B.CreateZExtOrTrunc(V, TruncatedTy);
3469       };
3470 
3471       // The actual instruction modification depends on the instruction type,
3472       // unfortunately.
3473       Value *NewI = nullptr;
3474       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3475         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3476                              ShrinkOperand(BO->getOperand(1)));
3477 
3478         // Any wrapping introduced by shrinking this operation shouldn't be
3479         // considered undefined behavior. So, we can't unconditionally copy
3480         // arithmetic wrapping flags to NewI.
3481         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3482       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3483         NewI =
3484             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3485                          ShrinkOperand(CI->getOperand(1)));
3486       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3487         NewI = B.CreateSelect(SI->getCondition(),
3488                               ShrinkOperand(SI->getTrueValue()),
3489                               ShrinkOperand(SI->getFalseValue()));
3490       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3491         switch (CI->getOpcode()) {
3492         default:
3493           llvm_unreachable("Unhandled cast!");
3494         case Instruction::Trunc:
3495           NewI = ShrinkOperand(CI->getOperand(0));
3496           break;
3497         case Instruction::SExt:
3498           NewI = B.CreateSExtOrTrunc(
3499               CI->getOperand(0),
3500               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3501           break;
3502         case Instruction::ZExt:
3503           NewI = B.CreateZExtOrTrunc(
3504               CI->getOperand(0),
3505               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3506           break;
3507         }
3508       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3509         auto Elements0 =
3510             cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
3511         auto *O0 = B.CreateZExtOrTrunc(
3512             SI->getOperand(0),
3513             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3514         auto Elements1 =
3515             cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
3516         auto *O1 = B.CreateZExtOrTrunc(
3517             SI->getOperand(1),
3518             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3519 
3520         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3521       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3522         // Don't do anything with the operands, just extend the result.
3523         continue;
3524       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3525         auto Elements =
3526             cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
3527         auto *O0 = B.CreateZExtOrTrunc(
3528             IE->getOperand(0),
3529             FixedVectorType::get(ScalarTruncatedTy, Elements));
3530         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3531         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3532       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3533         auto Elements =
3534             cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
3535         auto *O0 = B.CreateZExtOrTrunc(
3536             EE->getOperand(0),
3537             FixedVectorType::get(ScalarTruncatedTy, Elements));
3538         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3539       } else {
3540         // If we don't know what to do, be conservative and don't do anything.
3541         continue;
3542       }
3543 
3544       // Lastly, extend the result.
3545       NewI->takeName(cast<Instruction>(I));
3546       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3547       I->replaceAllUsesWith(Res);
3548       cast<Instruction>(I)->eraseFromParent();
3549       Erased.insert(I);
3550       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3551     }
3552   }
3553 
3554   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3555   for (const auto &KV : Cost->getMinimalBitwidths()) {
3556     // If the value wasn't vectorized, we must maintain the original scalar
3557     // type. The absence of the value from VectorLoopValueMap indicates that it
3558     // wasn't vectorized.
3559     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3560       continue;
3561     for (unsigned Part = 0; Part < UF; ++Part) {
3562       Value *I = getOrCreateVectorValue(KV.first, Part);
3563       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3564       if (Inst && Inst->use_empty()) {
3565         Value *NewI = Inst->getOperand(0);
3566         Inst->eraseFromParent();
3567         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3568       }
3569     }
3570   }
3571 }
3572 
3573 void InnerLoopVectorizer::fixVectorizedLoop() {
3574   // Insert truncates and extends for any truncated instructions as hints to
3575   // InstCombine.
3576   if (VF > 1)
3577     truncateToMinimalBitwidths();
3578 
3579   // Fix widened non-induction PHIs by setting up the PHI operands.
3580   if (OrigPHIsToFix.size()) {
3581     assert(EnableVPlanNativePath &&
3582            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3583     fixNonInductionPHIs();
3584   }
3585 
3586   // At this point every instruction in the original loop is widened to a
3587   // vector form. Now we need to fix the recurrences in the loop. These PHI
3588   // nodes are currently empty because we did not want to introduce cycles.
3589   // This is the second stage of vectorizing recurrences.
3590   fixCrossIterationPHIs();
3591 
3592   // Forget the original basic block.
3593   PSE.getSE()->forgetLoop(OrigLoop);
3594 
3595   // Fix-up external users of the induction variables.
3596   for (auto &Entry : Legal->getInductionVars())
3597     fixupIVUsers(Entry.first, Entry.second,
3598                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3599                  IVEndValues[Entry.first], LoopMiddleBlock);
3600 
3601   fixLCSSAPHIs();
3602   for (Instruction *PI : PredicatedInstructions)
3603     sinkScalarOperands(&*PI);
3604 
3605   // Remove redundant induction instructions.
3606   cse(LoopVectorBody);
3607 
3608   // Set/update profile weights for the vector and remainder loops as original
3609   // loop iterations are now distributed among them. Note that original loop
3610   // represented by LoopScalarBody becomes remainder loop after vectorization.
3611   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3617   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3618                                LI->getLoopFor(LoopVectorBody),
3619                                LI->getLoopFor(LoopScalarBody), VF * UF);
3620 }
3621 
3622 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3623   // In order to support recurrences we need to be able to vectorize Phi nodes.
3624   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3625   // stage #2: We now need to fix the recurrences by adding incoming edges to
3626   // the currently empty PHI nodes. At this point every instruction in the
3627   // original loop is widened to a vector form so we can use them to construct
3628   // the incoming edges.
3629   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3630     // Handle first-order recurrences and reductions that need to be fixed.
3631     if (Legal->isFirstOrderRecurrence(&Phi))
3632       fixFirstOrderRecurrence(&Phi);
3633     else if (Legal->isReductionVariable(&Phi))
3634       fixReduction(&Phi);
3635   }
3636 }
3637 
3638 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3639   // This is the second phase of vectorizing first-order recurrences. An
3640   // overview of the transformation is described below. Suppose we have the
3641   // following loop.
3642   //
3643   //   for (int i = 0; i < n; ++i)
3644   //     b[i] = a[i] - a[i - 1];
3645   //
3646   // There is a first-order recurrence on "a". For this loop, the shorthand
3647   // scalar IR looks like:
3648   //
3649   //   scalar.ph:
3650   //     s_init = a[-1]
3651   //     br scalar.body
3652   //
3653   //   scalar.body:
3654   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3655   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3656   //     s2 = a[i]
3657   //     b[i] = s2 - s1
3658   //     br cond, scalar.body, ...
3659   //
  // In this example, s1 is a recurrence because its value depends on the
3661   // previous iteration. In the first phase of vectorization, we created a
3662   // temporary value for s1. We now complete the vectorization and produce the
3663   // shorthand vector IR shown below (for VF = 4, UF = 1).
3664   //
3665   //   vector.ph:
3666   //     v_init = vector(..., ..., ..., a[-1])
3667   //     br vector.body
3668   //
3669   //   vector.body
3670   //     i = phi [0, vector.ph], [i+4, vector.body]
3671   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3672   //     v2 = a[i, i+1, i+2, i+3];
3673   //     v3 = vector(v1(3), v2(0, 1, 2))
3674   //     b[i, i+1, i+2, i+3] = v2 - v3
3675   //     br cond, vector.body, middle.block
3676   //
3677   //   middle.block:
3678   //     x = v2(3)
3679   //     br scalar.ph
3680   //
3681   //   scalar.ph:
3682   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3683   //     br scalar.body
3684   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3687 
3688   // Get the original loop preheader and single loop latch.
3689   auto *Preheader = OrigLoop->getLoopPreheader();
3690   auto *Latch = OrigLoop->getLoopLatch();
3691 
3692   // Get the initial and previous values of the scalar recurrence.
3693   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3694   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3695 
3696   // Create a vector from the initial value.
3697   auto *VectorInit = ScalarInit;
3698   if (VF > 1) {
3699     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3700     VectorInit = Builder.CreateInsertElement(
3701         UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)),
3702         VectorInit, Builder.getInt32(VF - 1), "vector.recur.init");
3703   }
3704 
3705   // We constructed a temporary phi node in the first phase of vectorization.
3706   // This phi node will eventually be deleted.
3707   Builder.SetInsertPoint(
3708       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3709 
3710   // Create a phi node for the new recurrence. The current value will either be
3711   // the initial value inserted into a vector or loop-varying vector value.
3712   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3713   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3714 
3715   // Get the vectorized previous value of the last part UF - 1. It appears last
3716   // among all unrolled iterations, due to the order of their construction.
3717   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3718 
3719   // Find and set the insertion point after the previous value if it is an
3720   // instruction.
3721   BasicBlock::iterator InsertPt;
3722   // Note that the previous value may have been constant-folded so it is not
3723   // guaranteed to be an instruction in the vector loop.
3724   // FIXME: Loop invariant values do not form recurrences. We should deal with
3725   //        them earlier.
3726   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3727     InsertPt = LoopVectorBody->getFirstInsertionPt();
3728   else {
3729     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3730     if (isa<PHINode>(PreviousLastPart))
3731       // If the previous value is a phi node, we should insert after all the phi
3732       // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different from
      // LoopVectorBody, in case we predicate the loop.
3735       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3736     else
3737       InsertPt = ++PreviousInst->getIterator();
3738   }
3739   Builder.SetInsertPoint(&*InsertPt);
3740 
3741   // We will construct a vector for the recurrence by combining the values for
3742   // the current and previous iterations. This is the required shuffle mask.
3743   SmallVector<int, 8> ShuffleMask(VF);
3744   ShuffleMask[0] = VF - 1;
3745   for (unsigned I = 1; I < VF; ++I)
3746     ShuffleMask[I] = I + VF - 1;
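  // For example, for VF = 4 the mask is <3, 4, 5, 6>, i.e. the last element of
  // the incoming vector followed by the first three elements of the previous
  // part (v3 = vector(v1(3), v2(0, 1, 2)) in the shorthand IR above).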
3747 
3748   // The vector from which to take the initial value for the current iteration
3749   // (actual or unrolled). Initially, this is the vector phi node.
3750   Value *Incoming = VecPhi;
3751 
3752   // Shuffle the current and previous vector and update the vector parts.
3753   for (unsigned Part = 0; Part < UF; ++Part) {
3754     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3755     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3756     auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3757                                                          ShuffleMask)
3758                            : Incoming;
3759     PhiPart->replaceAllUsesWith(Shuffle);
3760     cast<Instruction>(PhiPart)->eraseFromParent();
3761     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3762     Incoming = PreviousPart;
3763   }
3764 
3765   // Fix the latch value of the new recurrence in the vector loop.
3766   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3767 
3768   // Extract the last vector element in the middle block. This will be the
3769   // initial value for the recurrence when jumping to the scalar loop.
3770   auto *ExtractForScalar = Incoming;
3771   if (VF > 1) {
3772     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3773     ExtractForScalar = Builder.CreateExtractElement(
3774         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3775   }
  // Extract the second-to-last element in the middle block if the
3777   // Phi is used outside the loop. We need to extract the phi itself
3778   // and not the last element (the phi update in the current iteration). This
3779   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3780   // when the scalar loop is not run at all.
3781   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3782   if (VF > 1)
3783     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3784         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
3789   else if (UF > 1)
3790     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3791 
3792   // Fix the initial value of the original recurrence in the scalar loop.
3793   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3794   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3795   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3796     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3797     Start->addIncoming(Incoming, BB);
3798   }
3799 
3800   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3801   Phi->setName("scalar.recur");
3802 
3803   // Finally, fix users of the recurrence outside the loop. The users will need
3804   // either the last value of the scalar recurrence or the last value of the
3805   // vector recurrence we extracted in the middle block. Since the loop is in
3806   // LCSSA form, we just need to find all the phi nodes for the original scalar
3807   // recurrence in the exit block, and then add an edge for the middle block.
3808   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3809     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3810       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3811     }
3812   }
3813 }
3814 
3815 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3816   Constant *Zero = Builder.getInt32(0);
3817 
  // Get its reduction variable descriptor.
3819   assert(Legal->isReductionVariable(Phi) &&
3820          "Unable to find the reduction variable");
3821   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3822 
3823   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3824   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3825   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3826   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3827     RdxDesc.getMinMaxRecurrenceKind();
3828   setDebugLocFromInst(Builder, ReductionStartValue);
3829   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3830 
3831   // We need to generate a reduction vector from the incoming scalar.
3832   // To do so, we need to generate the 'identity' vector and override
3833   // one of the elements with the incoming scalar reduction. We need
3834   // to do it in the vector-loop preheader.
3835   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3836 
3837   // This is the vector-clone of the value that leaves the loop.
3838   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3839 
  // Find the reduction identity value: zero for addition, or and xor; one for
  // multiplication; all-ones (-1) for and.
3842   Value *Identity;
3843   Value *VectorStart;
3844   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3845       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3847     if (VF == 1 || IsInLoopReductionPhi) {
3848       VectorStart = Identity = ReductionStartValue;
3849     } else {
3850       VectorStart = Identity =
3851         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3852     }
3853   } else {
3854     // Handle other reduction kinds:
3855     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3856         RK, VecTy->getScalarType());
3857     if (VF == 1 || IsInLoopReductionPhi) {
3858       Identity = Iden;
      // When VF == 1 or the reduction is performed in-loop, the start value
      // is used directly as the incoming scalar reduction.
3861       VectorStart = ReductionStartValue;
3862     } else {
3863       Identity = ConstantVector::getSplat({VF, false}, Iden);
3864 
3865       // This vector is the Identity vector where the first element is the
3866       // incoming scalar reduction.
3867       VectorStart =
3868         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3869     }
3870   }
3871 
3872   // Wrap flags are in general invalid after vectorization, clear them.
3873   clearReductionWrapFlags(RdxDesc);
3874 
3875   // Fix the vector-loop phi.
3876 
3877   // Reductions do not have to start at zero. They can start with
3878   // any loop invariant values.
3879   BasicBlock *Latch = OrigLoop->getLoopLatch();
3880   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3881 
3882   for (unsigned Part = 0; Part < UF; ++Part) {
3883     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3884     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3885     // Make sure to add the reduction start value only to the
3886     // first unroll part.
3887     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3888     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3889     cast<PHINode>(VecRdxPhi)
3890       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3891   }
3892 
3893   // Before each round, move the insertion point right between
3894   // the PHIs and the values we are going to write.
3895   // This allows us to write both PHINodes and the extractelement
3896   // instructions.
3897   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3898 
3899   setDebugLocFromInst(Builder, LoopExitInst);
3900 
  // If the tail is folded by masking, the vector value that leaves the loop
  // should be a Select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, rather than the vectorized LoopExitInst itself.
3904   if (Cost->foldTailByMasking()) {
3905     for (unsigned Part = 0; Part < UF; ++Part) {
3906       Value *VecLoopExitInst =
3907           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3908       Value *Sel = nullptr;
3909       for (User *U : VecLoopExitInst->users()) {
3910         if (isa<SelectInst>(U)) {
3911           assert(!Sel && "Reduction exit feeding two selects");
3912           Sel = U;
3913         } else
3914           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3915       }
3916       assert(Sel && "Reduction exit feeds no select");
3917       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3918     }
3919   }
3920 
3921   // If the vector reduction can be performed in a smaller type, we truncate
3922   // then extend the loop exit value to enable InstCombine to evaluate the
3923   // entire expression in the smaller type.
3924   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3925     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
3926     Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF);
3927     Builder.SetInsertPoint(
3928         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3929     VectorParts RdxParts(UF);
3930     for (unsigned Part = 0; Part < UF; ++Part) {
3931       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3932       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3933       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3934                                         : Builder.CreateZExt(Trunc, VecTy);
3935       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3936            UI != RdxParts[Part]->user_end();)
3937         if (*UI != Trunc) {
3938           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3939           RdxParts[Part] = Extnd;
3940         } else {
3941           ++UI;
3942         }
3943     }
3944     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3945     for (unsigned Part = 0; Part < UF; ++Part) {
3946       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3947       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3948     }
3949   }
3950 
3951   // Reduce all of the unrolled parts into a single vector.
3952   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3953   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3954 
3955   // The middle block terminator has already been assigned a DebugLoc here (the
3956   // OrigLoop's single latch terminator). We want the whole middle block to
3957   // appear to execute on this line because: (a) it is all compiler generated,
3958   // (b) these instructions are always executed after evaluating the latch
3959   // conditional branch, and (c) other passes may add new predecessors which
3960   // terminate on this line. This is the easiest way to ensure we don't
3961   // accidentally cause an extra step back into the loop while debugging.
3962   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3963   for (unsigned Part = 1; Part < UF; ++Part) {
3964     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3965     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3966       // Floating point operations had to be 'fast' to enable the reduction.
3967       ReducedPartRdx = addFastMathFlag(
3968           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3969                               ReducedPartRdx, "bin.rdx"),
3970           RdxDesc.getFastMathFlags());
3971     else
3972       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3973                                       RdxPart);
3974   }
3975 
  // Create the reduction after the loop. Note that in-loop reductions create
  // the target reduction inside the loop, using a Reduction recipe.
3978   if (VF > 1 && !IsInLoopReductionPhi) {
3979     bool NoNaN = Legal->hasFunNoNaNAttr();
3980     ReducedPartRdx =
3981         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3982     // If the reduction can be performed in a smaller type, we need to extend
3983     // the reduction to the wider type before we branch to the original loop.
3984     if (Phi->getType() != RdxDesc.getRecurrenceType())
3985       ReducedPartRdx =
3986         RdxDesc.isSigned()
3987         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3988         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3989   }
3990 
3991   // Create a phi node that merges control-flow from the backedge-taken check
3992   // block and the middle block.
3993   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3994                                         LoopScalarPreHeader->getTerminator());
3995   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3996     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3997   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3998 
3999   // Now, we need to fix the users of the reduction variable
4000   // inside and outside of the scalar remainder loop.
4001   // We know that the loop is in LCSSA form. We need to update the
4002   // PHI nodes in the exit blocks.
4003   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4004     // All PHINodes need to have a single entry edge, or two if
4005     // we already fixed them.
4006     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4007 
4008     // We found a reduction value exit-PHI. Update it with the
4009     // incoming bypass edge.
4010     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4011       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4012   } // end of the LCSSA phi scan.
4013 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4016   int IncomingEdgeBlockIdx =
4017     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4018   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4019   // Pick the other block.
4020   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4021   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4022   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4023 }
4024 
4025 void InnerLoopVectorizer::clearReductionWrapFlags(
4026     RecurrenceDescriptor &RdxDesc) {
4027   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4028   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4029       RK != RecurrenceDescriptor::RK_IntegerMult)
4030     return;
4031 
4032   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4033   assert(LoopExitInstr && "null loop exit instruction");
4034   SmallVector<Instruction *, 8> Worklist;
4035   SmallPtrSet<Instruction *, 8> Visited;
4036   Worklist.push_back(LoopExitInstr);
4037   Visited.insert(LoopExitInstr);
4038 
4039   while (!Worklist.empty()) {
4040     Instruction *Cur = Worklist.pop_back_val();
4041     if (isa<OverflowingBinaryOperator>(Cur))
4042       for (unsigned Part = 0; Part < UF; ++Part) {
4043         Value *V = getOrCreateVectorValue(Cur, Part);
4044         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4045       }
4046 
4047     for (User *U : Cur->users()) {
4048       Instruction *UI = cast<Instruction>(U);
4049       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4050           Visited.insert(UI).second)
4051         Worklist.push_back(UI);
4052     }
4053   }
4054 }
4055 
4056 void InnerLoopVectorizer::fixLCSSAPHIs() {
4057   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4058     if (LCSSAPhi.getNumIncomingValues() == 1) {
4059       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values have a single value; use lane zero.
4061       unsigned LastLane = 0;
4062       if (isa<Instruction>(IncomingValue))
4063           LastLane = Cost->isUniformAfterVectorization(
4064                          cast<Instruction>(IncomingValue), VF)
4065                          ? 0
4066                          : VF - 1;
4067       // Can be a loop invariant incoming value or the last scalar value to be
4068       // extracted from the vectorized loop.
4069       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4070       Value *lastIncomingValue =
4071           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4072       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4073     }
4074   }
4075 }
4076 
4077 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4078   // The basic block and loop containing the predicated instruction.
4079   auto *PredBB = PredInst->getParent();
4080   auto *VectorLoop = LI->getLoopFor(PredBB);
4081 
4082   // Initialize a worklist with the operands of the predicated instruction.
4083   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4084 
4085   // Holds instructions that we need to analyze again. An instruction may be
4086   // reanalyzed if we don't yet know if we can sink it or not.
4087   SmallVector<Instruction *, 8> InstsToReanalyze;
4088 
4089   // Returns true if a given use occurs in the predicated block. Phi nodes use
4090   // their operands in their corresponding predecessor blocks.
4091   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4092     auto *I = cast<Instruction>(U.getUser());
4093     BasicBlock *BB = I->getParent();
4094     if (auto *Phi = dyn_cast<PHINode>(I))
4095       BB = Phi->getIncomingBlock(
4096           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4097     return BB == PredBB;
4098   };
4099 
4100   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when one full
  // pass through the worklist sinks no instructions.
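  // For example, a scalarized address computation (e.g. a GEP) used only by
  // the predicated instruction is moved into its predicated block, and the
  // GEP's own operands then become candidates for sinking as well.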
4104   bool Changed;
4105   do {
4106     // Add the instructions that need to be reanalyzed to the worklist, and
4107     // reset the changed indicator.
4108     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4109     InstsToReanalyze.clear();
4110     Changed = false;
4111 
4112     while (!Worklist.empty()) {
4113       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4114 
4115       // We can't sink an instruction if it is a phi node, is already in the
4116       // predicated block, is not in the loop, or may have side effects.
4117       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4118           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4119         continue;
4120 
4121       // It's legal to sink the instruction if all its uses occur in the
4122       // predicated block. Otherwise, there's nothing to do yet, and we may
4123       // need to reanalyze the instruction.
4124       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4125         InstsToReanalyze.push_back(I);
4126         continue;
4127       }
4128 
4129       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4131       I->moveBefore(&*PredBB->getFirstInsertionPt());
4132       Worklist.insert(I->op_begin(), I->op_end());
4133 
4134       // The sinking may have enabled other instructions to be sunk, so we will
4135       // need to iterate.
4136       Changed = true;
4137     }
4138   } while (Changed);
4139 }
4140 
4141 void InnerLoopVectorizer::fixNonInductionPHIs() {
4142   for (PHINode *OrigPhi : OrigPHIsToFix) {
4143     PHINode *NewPhi =
4144         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4145     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4146 
4147     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4148         predecessors(OrigPhi->getParent()));
4149     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4150         predecessors(NewPhi->getParent()));
4151     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4152            "Scalar and Vector BB should have the same number of predecessors");
4153 
4154     // The insertion point in Builder may be invalidated by the time we get
4155     // here. Force the Builder insertion point to something valid so that we do
4156     // not run into issues during insertion point restore in
4157     // getOrCreateVectorValue calls below.
4158     Builder.SetInsertPoint(NewPhi);
4159 
4160     // The predecessor order is preserved and we can rely on mapping between
4161     // scalar and vector block predecessors.
4162     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4163       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4164 
4165       // When looking up the new scalar/vector values to fix up, use incoming
4166       // values from original phi.
4167       Value *ScIncV =
4168           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4169 
      // The scalar incoming value may need a broadcast.
4171       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4172       NewPhi->addIncoming(NewIncV, NewPredBB);
4173     }
4174   }
4175 }
4176 
4177 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4178                                    unsigned UF, unsigned VF,
4179                                    bool IsPtrLoopInvariant,
4180                                    SmallBitVector &IsIndexLoopInvariant,
4181                                    VPTransformState &State) {
4182   // Construct a vector GEP by widening the operands of the scalar GEP as
4183   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4184   // results in a vector of pointers when at least one operand of the GEP
4185   // is vector-typed. Thus, to keep the representation compact, we only use
4186   // vector-typed operands for loop-varying values.
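  // For example (in shorthand IR), a scalar 'gep %base, %i' with a
  // loop-invariant %base and a loop-varying index %i is widened to
  // 'gep %base, %vec.i', which produces a vector of pointers.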
4187 
4188   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4189     // If we are vectorizing, but the GEP has only loop-invariant operands,
4190     // the GEP we build (by only using vector-typed operands for
4191     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4192     // produce a vector of pointers, we need to either arbitrarily pick an
4193     // operand to broadcast, or broadcast a clone of the original GEP.
4194     // Here, we broadcast a clone of the original.
4195     //
4196     // TODO: If at some point we decide to scalarize instructions having
4197     //       loop-invariant operands, this special case will no longer be
4198     //       required. We would add the scalarization decision to
4199     //       collectLoopScalars() and teach getVectorValue() to broadcast
4200     //       the lane-zero scalar value.
4201     auto *Clone = Builder.Insert(GEP->clone());
4202     for (unsigned Part = 0; Part < UF; ++Part) {
4203       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4204       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4205       addMetadata(EntryPart, GEP);
4206     }
4207   } else {
4208     // If the GEP has at least one loop-varying operand, we are sure to
4209     // produce a vector of pointers. But if we are only unrolling, we want
4210     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4211     // produce with the code below will be scalar (if VF == 1) or vector
4212     // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with setVectorValue, as we do for other
4214     // instructions.
4215     for (unsigned Part = 0; Part < UF; ++Part) {
4216       // The pointer operand of the new GEP. If it's loop-invariant, we
4217       // won't broadcast it.
4218       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4219                                      : State.get(Operands.getOperand(0), Part);
4220 
4221       // Collect all the indices for the new GEP. If any index is
4222       // loop-invariant, we won't broadcast it.
4223       SmallVector<Value *, 4> Indices;
4224       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4225         VPValue *Operand = Operands.getOperand(I);
4226         if (IsIndexLoopInvariant[I - 1])
4227           Indices.push_back(State.get(Operand, {0, 0}));
4228         else
4229           Indices.push_back(State.get(Operand, Part));
4230       }
4231 
4232       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4233       // but it should be a vector, otherwise.
4234       auto *NewGEP =
4235           GEP->isInBounds()
4236               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4237                                           Indices)
4238               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4239       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4240              "NewGEP is not a pointer vector");
4241       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4242       addMetadata(NewGEP, GEP);
4243     }
4244   }
4245 }
4246 
4247 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4248                                               unsigned VF) {
4249   PHINode *P = cast<PHINode>(PN);
4250   if (EnableVPlanNativePath) {
4251     // Currently we enter here in the VPlan-native path for non-induction
4252     // PHIs where all control flow is uniform. We simply widen these PHIs.
4253     // Create a vector phi with no operands - the vector phi operands will be
4254     // set at the end of vector code generation.
4255     Type *VecTy =
4256         (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
4257     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4258     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4259     OrigPHIsToFix.push_back(P);
4260 
4261     return;
4262   }
4263 
4264   assert(PN->getParent() == OrigLoop->getHeader() &&
4265          "Non-header phis should have been handled elsewhere");
4266 
4267   // In order to support recurrences we need to be able to vectorize Phi nodes.
4268   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4269   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4270   // this value when we vectorize all of the instructions that use the PHI.
4271   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4272     for (unsigned Part = 0; Part < UF; ++Part) {
4273       // This is phase one of vectorizing PHIs.
4274       bool ScalarPHI = (VF == 1) || Cost->isInLoopReduction(cast<PHINode>(PN));
4275       Type *VecTy =
4276           ScalarPHI ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
4277       Value *EntryPart = PHINode::Create(
4278           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4279       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4280     }
4281     return;
4282   }
4283 
4284   setDebugLocFromInst(Builder, P);
4285 
4286   // This PHINode must be an induction variable.
4287   // Make sure that we know about it.
4288   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4289 
4290   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4291   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4292 
4293   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4294   // which can be found from the original scalar operations.
4295   switch (II.getKind()) {
4296   case InductionDescriptor::IK_NoInduction:
4297     llvm_unreachable("Unknown induction");
4298   case InductionDescriptor::IK_IntInduction:
4299   case InductionDescriptor::IK_FpInduction:
4300     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4301   case InductionDescriptor::IK_PtrInduction: {
4302     // Handle the pointer induction variable case.
4303     assert(P->getType()->isPointerTy() && "Unexpected type.");
4304 
4305     if (Cost->isScalarAfterVectorization(P, VF)) {
      // This is the normalized induction variable that starts counting at zero.
4307       Value *PtrInd =
4308           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4309       // Determine the number of scalars we need to generate for each unroll
4310       // iteration. If the instruction is uniform, we only need to generate the
4311       // first lane. Otherwise, we generate all VF values.
4312       unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4313       for (unsigned Part = 0; Part < UF; ++Part) {
4314         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4315           Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4316           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4317           Value *SclrGep =
4318               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4319           SclrGep->setName("next.gep");
4320           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4321         }
4322       }
4323       return;
4324     }
4325     assert(isa<SCEVConstant>(II.getStep()) &&
4326            "Induction step not a SCEV constant!");
4327     Type *PhiType = II.getStep()->getType();
4328 
4329     // Build a pointer phi
4330     Value *ScalarStartValue = II.getStartValue();
4331     Type *ScStValueType = ScalarStartValue->getType();
4332     PHINode *NewPointerPhi =
4333         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4334     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4335 
    // A pointer induction, implemented with a GEP.
4337     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4338     Instruction *InductionLoc = LoopLatch->getTerminator();
4339     const SCEV *ScalarStep = II.getStep();
4340     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4341     Value *ScalarStepValue =
4342         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4343     Value *InductionGEP = GetElementPtrInst::Create(
4344         ScStValueType->getPointerElementType(), NewPointerPhi,
4345         Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)),
4346         "ptr.ind", InductionLoc);
4347     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4348 
4349     // Create UF many actual address geps that use the pointer
4350     // phi as base and a vectorized version of the step value
4351     // (<step*0, ..., step*N>) as offset.
4352     for (unsigned Part = 0; Part < UF; ++Part) {
4353       SmallVector<Constant *, 8> Indices;
      // Create a vector of consecutive offsets Part * VF, ..., Part * VF + VF - 1.
4355       for (unsigned i = 0; i < VF; ++i)
4356         Indices.push_back(ConstantInt::get(PhiType, i + Part * VF));
4357       Constant *StartOffset = ConstantVector::get(Indices);
4358 
4359       Value *GEP = Builder.CreateGEP(
4360           ScStValueType->getPointerElementType(), NewPointerPhi,
4361           Builder.CreateMul(StartOffset,
4362                             Builder.CreateVectorSplat(VF, ScalarStepValue),
4363                             "vector.gep"));
4364       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4365     }
4366   }
4367   }
4368 }
4369 
4370 /// A helper function for checking whether an integer division-related
4371 /// instruction may divide by zero (in which case it must be predicated if
4372 /// executed conditionally in the scalar code).
4373 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so w/o predication.
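/// For example, 'udiv %a, %b' with a non-constant %b may divide by zero and
/// must be predicated, whereas 'udiv %a, 7' cannot and need not be.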
4377 static bool mayDivideByZero(Instruction &I) {
4378   assert((I.getOpcode() == Instruction::UDiv ||
4379           I.getOpcode() == Instruction::SDiv ||
4380           I.getOpcode() == Instruction::URem ||
4381           I.getOpcode() == Instruction::SRem) &&
4382          "Unexpected instruction");
4383   Value *Divisor = I.getOperand(1);
4384   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4385   return !CInt || CInt->isZero();
4386 }
4387 
4388 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4389                                            VPTransformState &State) {
4390   switch (I.getOpcode()) {
4391   case Instruction::Call:
4392   case Instruction::Br:
4393   case Instruction::PHI:
4394   case Instruction::GetElementPtr:
4395   case Instruction::Select:
4396     llvm_unreachable("This instruction is handled by a different recipe.");
4397   case Instruction::UDiv:
4398   case Instruction::SDiv:
4399   case Instruction::SRem:
4400   case Instruction::URem:
4401   case Instruction::Add:
4402   case Instruction::FAdd:
4403   case Instruction::Sub:
4404   case Instruction::FSub:
4405   case Instruction::FNeg:
4406   case Instruction::Mul:
4407   case Instruction::FMul:
4408   case Instruction::FDiv:
4409   case Instruction::FRem:
4410   case Instruction::Shl:
4411   case Instruction::LShr:
4412   case Instruction::AShr:
4413   case Instruction::And:
4414   case Instruction::Or:
4415   case Instruction::Xor: {
4416     // Just widen unops and binops.
4417     setDebugLocFromInst(Builder, &I);
4418 
4419     for (unsigned Part = 0; Part < UF; ++Part) {
4420       SmallVector<Value *, 2> Ops;
4421       for (VPValue *VPOp : User.operands())
4422         Ops.push_back(State.get(VPOp, Part));
4423 
4424       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4425 
4426       if (auto *VecOp = dyn_cast<Instruction>(V))
4427         VecOp->copyIRFlags(&I);
4428 
4429       // Use this vector value for all users of the original instruction.
4430       VectorLoopValueMap.setVectorValue(&I, Part, V);
4431       addMetadata(V, &I);
4432     }
4433 
4434     break;
4435   }
4436   case Instruction::ICmp:
4437   case Instruction::FCmp: {
4438     // Widen compares. Generate vector compares.
4439     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4440     auto *Cmp = cast<CmpInst>(&I);
4441     setDebugLocFromInst(Builder, Cmp);
4442     for (unsigned Part = 0; Part < UF; ++Part) {
4443       Value *A = State.get(User.getOperand(0), Part);
4444       Value *B = State.get(User.getOperand(1), Part);
4445       Value *C = nullptr;
4446       if (FCmp) {
4447         // Propagate fast math flags.
4448         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4449         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4450         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4451       } else {
4452         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4453       }
4454       VectorLoopValueMap.setVectorValue(&I, Part, C);
4455       addMetadata(C, &I);
4456     }
4457 
4458     break;
4459   }
4460 
4461   case Instruction::ZExt:
4462   case Instruction::SExt:
4463   case Instruction::FPToUI:
4464   case Instruction::FPToSI:
4465   case Instruction::FPExt:
4466   case Instruction::PtrToInt:
4467   case Instruction::IntToPtr:
4468   case Instruction::SIToFP:
4469   case Instruction::UIToFP:
4470   case Instruction::Trunc:
4471   case Instruction::FPTrunc:
4472   case Instruction::BitCast: {
4473     auto *CI = cast<CastInst>(&I);
4474     setDebugLocFromInst(Builder, CI);
4475 
    // Vectorize casts.
4477     Type *DestTy =
4478         (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF);
4479 
4480     for (unsigned Part = 0; Part < UF; ++Part) {
4481       Value *A = State.get(User.getOperand(0), Part);
4482       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4483       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4484       addMetadata(Cast, &I);
4485     }
4486     break;
4487   }
4488   default:
4489     // This instruction is not vectorized by simple widening.
4490     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4491     llvm_unreachable("Unhandled instruction!");
4492   } // end of switch.
4493 }
4494 
4495 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4496                                                VPTransformState &State) {
4497   assert(!isa<DbgInfoIntrinsic>(I) &&
4498          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4499   setDebugLocFromInst(Builder, &I);
4500 
4501   Module *M = I.getParent()->getParent()->getParent();
4502   auto *CI = cast<CallInst>(&I);
4503 
4504   SmallVector<Type *, 4> Tys;
4505   for (Value *ArgOperand : CI->arg_operands())
4506     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4507 
4508   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4509 
  // The flag indicates whether we use an intrinsic or a regular call for the
  // vectorized version of the instruction, i.e. whether the intrinsic call is
  // at least as cheap as the equivalent library call.
4513   bool NeedToScalarize = false;
4514   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4515   bool UseVectorIntrinsic =
4516       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4517   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4518          "Instruction should be scalarized elsewhere.");
4519 
4520   for (unsigned Part = 0; Part < UF; ++Part) {
4521     SmallVector<Value *, 4> Args;
4522     for (auto &I : enumerate(ArgOperands.operands())) {
4523       // Some intrinsics have a scalar argument - don't replace it with a
4524       // vector.
4525       Value *Arg;
4526       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4527         Arg = State.get(I.value(), Part);
4528       else
4529         Arg = State.get(I.value(), {0, 0});
4530       Args.push_back(Arg);
4531     }
4532 
4533     Function *VectorF;
4534     if (UseVectorIntrinsic) {
4535       // Use vector version of the intrinsic.
4536       Type *TysForDecl[] = {CI->getType()};
4537       if (VF > 1)
4538         TysForDecl[0] =
4539             FixedVectorType::get(CI->getType()->getScalarType(), VF);
4540       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4541       assert(VectorF && "Can't retrieve vector intrinsic.");
4542     } else {
4543       // Use vector version of the function call.
4544       const VFShape Shape =
4545           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
4546 #ifndef NDEBUG
4547       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4548              "Can't create vector function.");
4549 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
  }
4562 }
4563 
4564 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4565                                                  VPUser &Operands,
4566                                                  bool InvariantCond,
4567                                                  VPTransformState &State) {
4568   setDebugLocFromInst(Builder, &I);
4569 
  // The condition can be loop invariant but still defined inside the
4571   // loop. This means that we can't just use the original 'cond' value.
4572   // We have to take the 'vectorized' value and pick the first lane.
4573   // Instcombine will make this a no-op.
4574   auto *InvarCond =
4575       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4576 
4577   for (unsigned Part = 0; Part < UF; ++Part) {
4578     Value *Cond =
4579         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4580     Value *Op0 = State.get(Operands.getOperand(1), Part);
4581     Value *Op1 = State.get(Operands.getOperand(2), Part);
4582     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4583     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4584     addMetadata(Sel, &I);
4585   }
4586 }
4587 
4588 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4589   // We should not collect Scalars more than once per VF. Right now, this
4590   // function is called from collectUniformsAndScalars(), which already does
4591   // this check. Collecting Scalars for VF=1 does not make any sense.
4592   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4593          "This function should not be visited twice for the same VF");
4594 
4595   SmallSetVector<Instruction *, 8> Worklist;
4596 
4597   // These sets are used to seed the analysis with pointers used by memory
4598   // accesses that will remain scalar.
4599   SmallSetVector<Instruction *, 8> ScalarPtrs;
4600   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4601   auto *Latch = TheLoop->getLoopLatch();
4602 
4603   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4604   // The pointer operands of loads and stores will be scalar as long as the
4605   // memory access is not a gather or scatter operation. The value operand of a
4606   // store will remain scalar if the store is scalarized.
4607   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4608     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4609     assert(WideningDecision != CM_Unknown &&
4610            "Widening decision should be ready at this moment");
4611     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4612       if (Ptr == Store->getValueOperand())
4613         return WideningDecision == CM_Scalarize;
4614     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4615            "Ptr is neither a value or pointer operand");
4616     return WideningDecision != CM_GatherScatter;
4617   };
4618 
4619   // A helper that returns true if the given value is a bitcast or
4620   // getelementptr instruction contained in the loop.
4621   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4622     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4623             isa<GetElementPtrInst>(V)) &&
4624            !TheLoop->isLoopInvariant(V);
4625   };
4626 
4627   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4628     if (!isa<PHINode>(Ptr) ||
4629         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4630       return false;
4631     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4632     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4633       return false;
4634     return isScalarUse(MemAccess, Ptr);
4635   };
4636 
4637   // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is inserted
  // into the Worklist. If the use will be a scalar use, and the
4640   // pointer is only used by memory accesses, we place the pointer in
4641   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4642   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4643     if (isScalarPtrInduction(MemAccess, Ptr)) {
4644       Worklist.insert(cast<Instruction>(Ptr));
4645       Instruction *Update = cast<Instruction>(
4646           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4647       Worklist.insert(Update);
4648       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4649                         << "\n");
4650       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4651                         << "\n");
4652       return;
4653     }
4654     // We only care about bitcast and getelementptr instructions contained in
4655     // the loop.
4656     if (!isLoopVaryingBitCastOrGEP(Ptr))
4657       return;
4658 
4659     // If the pointer has already been identified as scalar (e.g., if it was
4660     // also identified as uniform), there's nothing to do.
4661     auto *I = cast<Instruction>(Ptr);
4662     if (Worklist.count(I))
4663       return;
4664 
4665     // If the use of the pointer will be a scalar use, and all users of the
4666     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4667     // place the pointer in PossibleNonScalarPtrs.
4668     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4669           return isa<LoadInst>(U) || isa<StoreInst>(U);
4670         }))
4671       ScalarPtrs.insert(I);
4672     else
4673       PossibleNonScalarPtrs.insert(I);
4674   };
4675 
  // We seed the scalars analysis with two classes of instructions: (1)
4677   // instructions marked uniform-after-vectorization and (2) bitcast,
4678   // getelementptr and (pointer) phi instructions used by memory accesses
4679   // requiring a scalar use.
4680   //
4681   // (1) Add to the worklist all instructions that have been identified as
4682   // uniform-after-vectorization.
4683   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4684 
4685   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4686   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4688   // scatter operation. The value operand of a store will remain scalar if the
4689   // store is scalarized.
4690   for (auto *BB : TheLoop->blocks())
4691     for (auto &I : *BB) {
4692       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4693         evaluatePtrUse(Load, Load->getPointerOperand());
4694       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4695         evaluatePtrUse(Store, Store->getPointerOperand());
4696         evaluatePtrUse(Store, Store->getValueOperand());
4697       }
4698     }
4699   for (auto *I : ScalarPtrs)
4700     if (!PossibleNonScalarPtrs.count(I)) {
4701       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4702       Worklist.insert(I);
4703     }
4704 
4705   // Insert the forced scalars.
4706   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4707   // induction variable when the PHI user is scalarized.
4708   auto ForcedScalar = ForcedScalars.find(VF);
4709   if (ForcedScalar != ForcedScalars.end())
4710     for (auto *I : ForcedScalar->second)
4711       Worklist.insert(I);
4712 
4713   // Expand the worklist by looking through any bitcasts and getelementptr
4714   // instructions we've already identified as scalar. This is similar to the
4715   // expansion step in collectLoopUniforms(); however, here we're only
4716   // expanding to include additional bitcasts and getelementptr instructions.
4717   unsigned Idx = 0;
4718   while (Idx != Worklist.size()) {
4719     Instruction *Dst = Worklist[Idx++];
4720     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4721       continue;
4722     auto *Src = cast<Instruction>(Dst->getOperand(0));
4723     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4724           auto *J = cast<Instruction>(U);
4725           return !TheLoop->contains(J) || Worklist.count(J) ||
4726                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4727                   isScalarUse(J, Src));
4728         })) {
4729       Worklist.insert(Src);
4730       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4731     }
4732   }
4733 
4734   // An induction variable will remain scalar if all users of the induction
4735   // variable and induction variable update remain scalar.
4736   for (auto &Induction : Legal->getInductionVars()) {
4737     auto *Ind = Induction.first;
4738     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4739 
4740     // If tail-folding is applied, the primary induction variable will be used
4741     // to feed a vector compare.
4742     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4743       continue;
4744 
4745     // Determine if all users of the induction variable are scalar after
4746     // vectorization.
4747     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4748       auto *I = cast<Instruction>(U);
4749       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4750     });
4751     if (!ScalarInd)
4752       continue;
4753 
4754     // Determine if all users of the induction variable update instruction are
4755     // scalar after vectorization.
4756     auto ScalarIndUpdate =
4757         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4758           auto *I = cast<Instruction>(U);
4759           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4760         });
4761     if (!ScalarIndUpdate)
4762       continue;
4763 
4764     // The induction variable and its update instruction will remain scalar.
4765     Worklist.insert(Ind);
4766     Worklist.insert(IndUpdate);
4767     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4768     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4769                       << "\n");
4770   }
4771 
4772   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4773 }
4774 
4775 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4776   if (!blockNeedsPredication(I->getParent()))
4777     return false;
4778   switch(I->getOpcode()) {
4779   default:
4780     break;
4781   case Instruction::Load:
4782   case Instruction::Store: {
4783     if (!Legal->isMaskRequired(I))
4784       return false;
4785     auto *Ptr = getLoadStorePointerOperand(I);
4786     auto *Ty = getMemInstValueType(I);
4787     // We have already decided how to vectorize this instruction, get that
4788     // result.
4789     if (VF > 1) {
4790       InstWidening WideningDecision = getWideningDecision(I, VF);
4791       assert(WideningDecision != CM_Unknown &&
4792              "Widening decision should be ready at this moment");
4793       return WideningDecision == CM_Scalarize;
4794     }
4795     const Align Alignment = getLoadStoreAlignment(I);
4796     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4797                                 isLegalMaskedGather(Ty, Alignment))
4798                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4799                                 isLegalMaskedScatter(Ty, Alignment));
4800   }
4801   case Instruction::UDiv:
4802   case Instruction::SDiv:
4803   case Instruction::SRem:
4804   case Instruction::URem:
4805     return mayDivideByZero(*I);
4806   }
4807   return false;
4808 }
4809 
4810 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4811                                                                unsigned VF) {
4812   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4813   assert(getWideningDecision(I, VF) == CM_Unknown &&
4814          "Decision should not be set yet.");
4815   auto *Group = getInterleavedAccessGroup(I);
4816   assert(Group && "Must have a group.");
4817 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
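  // For example (illustrative, assuming typical data-layout rules): an i24
  // value has a type size of 24 bits but an allocation size of 32 bits, so
  // elements stored in memory are padded and a simple wide access would not
  // match their layout.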
4820   auto &DL = I->getModule()->getDataLayout();
4821   auto *ScalarTy = getMemInstValueType(I);
4822   if (hasIrregularType(ScalarTy, DL, VF))
4823     return false;
4824 
4825   // Check if masking is required.
4826   // A Group may need masking for one of two reasons: it resides in a block that
4827   // needs predication, or it was decided to use masking to deal with gaps.
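  // Illustrative example (not from the original comments): a factor-2 group
  // that accesses only A[2*i] has a gap at A[2*i+1]; without a scalar
  // epilogue, the wide access of the last vector iteration could touch
  // memory past the end of the underlying object, so the gaps need a mask.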
4828   bool PredicatedAccessRequiresMasking =
4829       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4830   bool AccessWithGapsRequiresMasking =
4831       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4832   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4833     return true;
4834 
4835   // If masked interleaving is required, we expect that the user/target had
4836   // enabled it, because otherwise it either wouldn't have been created or
4837   // it should have been invalidated by the CostModel.
4838   assert(useMaskedInterleavedAccesses(TTI) &&
4839          "Masked interleave-groups for predicated accesses are not enabled.");
4840 
4841   auto *Ty = getMemInstValueType(I);
4842   const Align Alignment = getLoadStoreAlignment(I);
4843   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4844                           : TTI.isLegalMaskedStore(Ty, Alignment);
4845 }
4846 
4847 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4848                                                                unsigned VF) {
4849   // Get and ensure we have a valid memory instruction.
4850   LoadInst *LI = dyn_cast<LoadInst>(I);
4851   StoreInst *SI = dyn_cast<StoreInst>(I);
4852   assert((LI || SI) && "Invalid memory instruction");
4853 
4854   auto *Ptr = getLoadStorePointerOperand(I);
4855 
  // First of all, in order to be widened, the pointer must be consecutive.
4857   if (!Legal->isConsecutivePtr(Ptr))
4858     return false;
4859 
4860   // If the instruction is a store located in a predicated block, it will be
4861   // scalarized.
4862   if (isScalarWithPredication(I))
4863     return false;
4864 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
4867   auto &DL = I->getModule()->getDataLayout();
4868   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4869   if (hasIrregularType(ScalarTy, DL, VF))
4870     return false;
4871 
4872   return true;
4873 }
4874 
4875 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4876   // We should not collect Uniforms more than once per VF. Right now,
4877   // this function is called from collectUniformsAndScalars(), which
4878   // already does this check. Collecting Uniforms for VF=1 does not make any
4879   // sense.
4880 
4881   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4882          "This function should not be visited twice for the same VF");
4883 
  // Initialize the entry for this VF. Even if we find no uniform values, the
  // entry makes Uniforms.count(VF) return 1 so we will not analyze it again.
4886   Uniforms[VF].clear();
4887 
4888   // We now know that the loop is vectorizable!
4889   // Collect instructions inside the loop that will remain uniform after
4890   // vectorization.
4891 
  // Global values, parameters, and instructions outside of the current loop
  // are out of scope.
4894   auto isOutOfScope = [&](Value *V) -> bool {
4895     Instruction *I = dyn_cast<Instruction>(V);
4896     return (!I || !TheLoop->contains(I));
4897   };
4898 
4899   SetVector<Instruction *> Worklist;
4900   BasicBlock *Latch = TheLoop->getLoopLatch();
4901 
4902   // Instructions that are scalar with predication must not be considered
4903   // uniform after vectorization, because that would create an erroneous
4904   // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if found important, see PR40816.
4906   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4907     if (isScalarWithPredication(I, VF)) {
4908       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4909                         << *I << "\n");
4910       return;
4911     }
4912     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4913     Worklist.insert(I);
4914   };
4915 
4916   // Start with the conditional branch. If the branch condition is an
4917   // instruction contained in the loop that is only used by the branch, it is
4918   // uniform.
4919   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4920   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4921     addToWorklistIfAllowed(Cmp);
4922 
4923   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4924   // are pointers that are treated like consecutive pointers during
4925   // vectorization. The pointer operands of interleaved accesses are an
4926   // example.
4927   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4928 
4929   // Holds pointer operands of instructions that are possibly non-uniform.
4930   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4931 
4932   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4933     InstWidening WideningDecision = getWideningDecision(I, VF);
4934     assert(WideningDecision != CM_Unknown &&
4935            "Widening decision should be ready at this moment");
4936 
4937     return (WideningDecision == CM_Widen ||
4938             WideningDecision == CM_Widen_Reverse ||
4939             WideningDecision == CM_Interleave);
4940   };
4941   // Iterate over the instructions in the loop, and collect all
4942   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4943   // that a consecutive-like pointer operand will be scalarized, we collect it
4944   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4945   // getelementptr instruction can be used by both vectorized and scalarized
4946   // memory instructions. For example, if a loop loads and stores from the same
4947   // location, but the store is conditional, the store will be scalarized, and
4948   // the getelementptr won't remain uniform.
4949   for (auto *BB : TheLoop->blocks())
4950     for (auto &I : *BB) {
4951       // If there's no pointer operand, there's nothing to do.
4952       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4953       if (!Ptr)
4954         continue;
4955 
4956       // True if all users of Ptr are memory accesses that have Ptr as their
4957       // pointer operand.
4958       auto UsersAreMemAccesses =
4959           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4960             return getLoadStorePointerOperand(U) == Ptr;
4961           });
4962 
4963       // Ensure the memory instruction will not be scalarized or used by
4964       // gather/scatter, making its pointer operand non-uniform. If the pointer
4965       // operand is used by any instruction other than a memory access, we
4966       // conservatively assume the pointer operand may be non-uniform.
4967       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4968         PossibleNonUniformPtrs.insert(Ptr);
4969 
      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like or used by an interleaved group, the pointer
      // operand should remain uniform.
4973       else
4974         ConsecutiveLikePtrs.insert(Ptr);
4975     }
4976 
4977   // Add to the Worklist all consecutive and consecutive-like pointers that
4978   // aren't also identified as possibly non-uniform.
4979   for (auto *V : ConsecutiveLikePtrs)
4980     if (!PossibleNonUniformPtrs.count(V))
4981       addToWorklistIfAllowed(V);
4982 
  // Expand Worklist in topological order: whenever a new instruction is
  // added, its users should already be inside Worklist. This ensures that a
  // uniform instruction will only be used by uniform instructions.
4986   unsigned idx = 0;
4987   while (idx != Worklist.size()) {
4988     Instruction *I = Worklist[idx++];
4989 
4990     for (auto OV : I->operand_values()) {
4991       // isOutOfScope operands cannot be uniform instructions.
4992       if (isOutOfScope(OV))
4993         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4996       auto *OP = dyn_cast<PHINode>(OV);
4997       if (OP && Legal->isFirstOrderRecurrence(OP))
4998         continue;
4999       // If all the users of the operand are uniform, then add the
5000       // operand into the uniform worklist.
5001       auto *OI = cast<Instruction>(OV);
5002       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5003             auto *J = cast<Instruction>(U);
5004             return Worklist.count(J) ||
5005                    (OI == getLoadStorePointerOperand(J) &&
5006                     isUniformDecision(J, VF));
5007           }))
5008         addToWorklistIfAllowed(OI);
5009     }
5010   }
5011 
5012   // Returns true if Ptr is the pointer operand of a memory access instruction
5013   // I, and I is known to not require scalarization.
5014   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5015     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5016   };
5017 
5018   // For an instruction to be added into Worklist above, all its users inside
5019   // the loop should also be in Worklist. However, this condition cannot be
5020   // true for phi nodes that form a cyclic dependence. We must process phi
5021   // nodes separately. An induction variable will remain uniform if all users
5022   // of the induction variable and induction variable update remain uniform.
5023   // The code below handles both pointer and non-pointer induction variables.
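  // Illustrative sketch (not from the original comments): for a pointer
  // induction such as
  //   %p      = phi i32* [ %start, %preheader ], [ %p.next, %latch ]
  //   %p.next = getelementptr inbounds i32, i32* %p, i64 1
  // whose only in-loop uses are as the address of consecutive, widened
  // loads/stores, both %p and %p.next are added to the worklist below.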
5024   for (auto &Induction : Legal->getInductionVars()) {
5025     auto *Ind = Induction.first;
5026     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5027 
5028     // Determine if all users of the induction variable are uniform after
5029     // vectorization.
5030     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5031       auto *I = cast<Instruction>(U);
5032       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5033              isVectorizedMemAccessUse(I, Ind);
5034     });
5035     if (!UniformInd)
5036       continue;
5037 
5038     // Determine if all users of the induction variable update instruction are
5039     // uniform after vectorization.
5040     auto UniformIndUpdate =
5041         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5042           auto *I = cast<Instruction>(U);
5043           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5044                  isVectorizedMemAccessUse(I, IndUpdate);
5045         });
5046     if (!UniformIndUpdate)
5047       continue;
5048 
5049     // The induction variable and its update instruction will remain uniform.
5050     addToWorklistIfAllowed(Ind);
5051     addToWorklistIfAllowed(IndUpdate);
5052   }
5053 
5054   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5055 }
5056 
5057 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5058   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5059 
5060   if (Legal->getRuntimePointerChecking()->Need) {
5061     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5062         "runtime pointer checks needed. Enable vectorization of this "
5063         "loop with '#pragma clang loop vectorize(enable)' when "
5064         "compiling with -Os/-Oz",
5065         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5066     return true;
5067   }
5068 
5069   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5070     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5071         "runtime SCEV checks needed. Enable vectorization of this "
5072         "loop with '#pragma clang loop vectorize(enable)' when "
5073         "compiling with -Os/-Oz",
5074         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5075     return true;
5076   }
5077 
5078   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5079   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5080     reportVectorizationFailure("Runtime stride check for small trip count",
5081         "runtime stride == 1 checks needed. Enable vectorization of "
5082         "this loop without such check by compiling with -Os/-Oz",
5083         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5084     return true;
5085   }
5086 
5087   return false;
5088 }
5089 
5090 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
5091                                                             unsigned UserIC) {
5092   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
5095     reportVectorizationFailure(
5096         "Not inserting runtime ptr check for divergent target",
5097         "runtime pointer checks needed. Not enabled for divergent target",
5098         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5099     return None;
5100   }
5101 
5102   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5103   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5104   if (TC == 1) {
5105     reportVectorizationFailure("Single iteration (non) loop",
5106         "loop trip count is one, irrelevant for vectorization",
5107         "SingleIterationLoop", ORE, TheLoop);
5108     return None;
5109   }
5110 
5111   switch (ScalarEpilogueStatus) {
5112   case CM_ScalarEpilogueAllowed:
5113     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5114   case CM_ScalarEpilogueNotNeededUsePredicate:
5115     LLVM_DEBUG(
5116         dbgs() << "LV: vector predicate hint/switch found.\n"
5117                << "LV: Not allowing scalar epilogue, creating predicated "
5118                << "vector loop.\n");
5119     break;
5120   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5121     // fallthrough as a special case of OptForSize
5122   case CM_ScalarEpilogueNotAllowedOptSize:
5123     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5124       LLVM_DEBUG(
5125           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5126     else
5127       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5128                         << "count.\n");
5129 
5130     // Bail if runtime checks are required, which are not good when optimising
5131     // for size.
5132     if (runtimeChecksRequired())
5133       return None;
5134     break;
5135   }
5136 
  // Now try tail folding.
5138 
5139   // Invalidate interleave groups that require an epilogue if we can't mask
5140   // the interleave-group.
5141   if (!useMaskedInterleavedAccesses(TTI)) {
5142     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5143            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5146     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5147   }
5148 
5149   unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5150   assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5151   unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
5152   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5153     // Accept MaxVF if we do not have a tail.
5154     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5155     return MaxVF;
5156   }
5157 
5158   // If we don't know the precise trip count, or if the trip count that we
5159   // found modulo the vectorization factor is not zero, try to fold the tail
5160   // by masking.
5161   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5162   if (Legal->prepareToFoldTailByMasking()) {
5163     FoldTailByMasking = true;
5164     return MaxVF;
5165   }
5166 
5167   if (TC == 0) {
5168     reportVectorizationFailure(
5169         "Unable to calculate the loop count due to complex control flow",
5170         "unable to calculate the loop count due to complex control flow",
5171         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5172     return None;
5173   }
5174 
5175   reportVectorizationFailure(
5176       "Cannot optimize for size and vectorize at the same time.",
5177       "cannot optimize for size and vectorize at the same time. "
5178       "Enable vectorization of this loop with '#pragma clang loop "
5179       "vectorize(enable)' when compiling with -Os/-Oz",
5180       "NoTailLoopWithOptForSize", ORE, TheLoop);
5181   return None;
5182 }
5183 
5184 unsigned
5185 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5186   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5187   unsigned SmallestType, WidestType;
5188   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5189   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5190 
5191   // Get the maximum safe dependence distance in bits computed by LAA.
5192   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5194   // dependence distance).
5195   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5196 
5197   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5198 
5199   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5201   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
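  // For example (illustrative numbers): with a 256-bit widest register and a
  // widest scalar type of 32 bits, MaxVectorSize becomes
  // PowerOf2Floor(256 / 32) = 8.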
5202 
5203   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5204                     << " / " << WidestType << " bits.\n");
5205   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5206                     << WidestRegister << " bits.\n");
5207 
5208   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5209                                  " into one vector!");
5210   if (MaxVectorSize == 0) {
5211     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5212     MaxVectorSize = 1;
5213     return MaxVectorSize;
5214   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5215              isPowerOf2_32(ConstTripCount)) {
5216     // We need to clamp the VF to be the ConstTripCount. There is no point in
5217     // choosing a higher viable VF as done in the loop below.
5218     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5219                       << ConstTripCount << "\n");
5220     MaxVectorSize = ConstTripCount;
5221     return MaxVectorSize;
5222   }
5223 
5224   unsigned MaxVF = MaxVectorSize;
5225   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5226       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5227     // Collect all viable vectorization factors larger than the default MaxVF
5228     // (i.e. MaxVectorSize).
5229     SmallVector<unsigned, 8> VFs;
5230     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5231     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5232       VFs.push_back(VS);
5233 
5234     // For each VF calculate its register usage.
5235     auto RUs = calculateRegisterUsage(VFs);
5236 
5237     // Select the largest VF which doesn't require more registers than existing
5238     // ones.
5239     for (int i = RUs.size() - 1; i >= 0; --i) {
5240       bool Selected = true;
5241       for (auto& pair : RUs[i].MaxLocalUsers) {
5242         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5243         if (pair.second > TargetNumRegisters)
5244           Selected = false;
5245       }
5246       if (Selected) {
5247         MaxVF = VFs[i];
5248         break;
5249       }
5250     }
5251     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5252       if (MaxVF < MinVF) {
5253         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5254                           << ") with target's minimum: " << MinVF << '\n');
5255         MaxVF = MinVF;
5256       }
5257     }
5258   }
5259   return MaxVF;
5260 }
5261 
5262 VectorizationFactor
5263 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5264   float Cost = expectedCost(1).first;
5265   const float ScalarCost = Cost;
5266   unsigned Width = 1;
5267   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5268 
5269   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5270   if (ForceVectorization && MaxVF > 1) {
5271     // Ignore scalar width, because the user explicitly wants vectorization.
5272     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5273     // evaluation.
5274     Cost = std::numeric_limits<float>::max();
5275   }
5276 
5277   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
5281     VectorizationCostTy C = expectedCost(i);
5282     float VectorCost = C.first / (float)i;
5283     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5284                       << " costs: " << (int)VectorCost << ".\n");
5285     if (!C.second && !ForceVectorization) {
5286       LLVM_DEBUG(
5287           dbgs() << "LV: Not considering vector loop of width " << i
5288                  << " because it will not generate any vector instructions.\n");
5289       continue;
5290     }
5291     if (VectorCost < Cost) {
5292       Cost = VectorCost;
5293       Width = i;
5294     }
5295   }
5296 
5297   if (!EnableCondStoresVectorization && NumPredStores) {
5298     reportVectorizationFailure("There are conditional stores.",
5299         "store that is conditionally executed prevents vectorization",
5300         "ConditionalStore", ORE, TheLoop);
5301     Width = 1;
5302     Cost = ScalarCost;
5303   }
5304 
5305   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5306              << "LV: Vectorization seems to be not beneficial, "
5307              << "but was forced by a user.\n");
5308   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5309   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5310   return Factor;
5311 }
5312 
5313 std::pair<unsigned, unsigned>
5314 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5315   unsigned MinWidth = -1U;
5316   unsigned MaxWidth = 8;
5317   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5318 
5319   // For each block.
5320   for (BasicBlock *BB : TheLoop->blocks()) {
5321     // For each instruction in the loop.
5322     for (Instruction &I : BB->instructionsWithoutDebug()) {
5323       Type *T = I.getType();
5324 
5325       // Skip ignored values.
5326       if (ValuesToIgnore.count(&I))
5327         continue;
5328 
5329       // Only examine Loads, Stores and PHINodes.
5330       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5331         continue;
5332 
5333       // Examine PHI nodes that are reduction variables. Update the type to
5334       // account for the recurrence type.
5335       if (auto *PN = dyn_cast<PHINode>(&I)) {
5336         if (!Legal->isReductionVariable(PN))
5337           continue;
5338         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5339         T = RdxDesc.getRecurrenceType();
5340       }
5341 
5342       // Examine the stored values.
5343       if (auto *ST = dyn_cast<StoreInst>(&I))
5344         T = ST->getValueOperand()->getType();
5345 
5346       // Ignore loaded pointer types and stored pointer types that are not
5347       // vectorizable.
5348       //
5349       // FIXME: The check here attempts to predict whether a load or store will
5350       //        be vectorized. We only know this for certain after a VF has
5351       //        been selected. Here, we assume that if an access can be
5352       //        vectorized, it will be. We should also look at extending this
5353       //        optimization to non-pointer types.
5354       //
5355       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5356           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5357         continue;
5358 
5359       MinWidth = std::min(MinWidth,
5360                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5361       MaxWidth = std::max(MaxWidth,
5362                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5363     }
5364   }
5365 
5366   return {MinWidth, MaxWidth};
5367 }
5368 
5369 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5370                                                            unsigned LoopCost) {
5371   // -- The interleave heuristics --
5372   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5373   // There are many micro-architectural considerations that we can't predict
5374   // at this level. For example, frontend pressure (on decode or fetch) due to
5375   // code size, or the number and capabilities of the execution ports.
5376   //
5377   // We use the following heuristics to select the interleave count:
5378   // 1. If the code has reductions, then we interleave to break the cross
5379   // iteration dependency.
5380   // 2. If the loop is really small, then we interleave to reduce the loop
5381   // overhead.
5382   // 3. We don't interleave if we think that we will spill registers to memory
5383   // due to the increased register pressure.
5384 
5385   if (!isScalarEpilogueAllowed())
5386     return 1;
5387 
  // The maximum safe dependence distance was already used to limit the VF;
  // do not interleave in that case.
5389   if (Legal->getMaxSafeDepDistBytes() != -1U)
5390     return 1;
5391 
5392   // Do not interleave loops with a relatively small known or estimated trip
5393   // count.
5394   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5395   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5396     return 1;
5397 
5398   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so clamp each to at least one; i.e.,
  // assume at least one instruction uses at least one register.
5401   for (auto& pair : R.MaxLocalUsers) {
5402     pair.second = std::max(pair.second, 1U);
5403   }
5404 
5405   // We calculate the interleave count using the following formula.
5406   // Subtract the number of loop invariants from the number of available
5407   // registers. These registers are used by all of the interleaved instances.
5408   // Next, divide the remaining registers by the number of registers that is
5409   // required by the loop, in order to estimate how many parallel instances
5410   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens only when optimizing for size, in which
  // case IC is set to 1 above.
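  // Worked example (illustrative numbers only): with 32 target registers, 2
  // loop-invariant values and at most 10 values live at once, the estimate
  // below is PowerOf2Floor((32 - 2) / 10) = PowerOf2Floor(3) = 2.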
5416   unsigned IC = UINT_MAX;
5417 
5418   for (auto& pair : R.MaxLocalUsers) {
5419     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5423     if (VF == 1) {
5424       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5425         TargetNumRegisters = ForceTargetNumScalarRegs;
5426     } else {
5427       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5428         TargetNumRegisters = ForceTargetNumVectorRegs;
5429     }
5430     unsigned MaxLocalUsers = pair.second;
5431     unsigned LoopInvariantRegs = 0;
5432     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5433       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5434 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5436     // Don't count the induction variable as interleaved.
5437     if (EnableIndVarRegisterHeur) {
5438       TmpIC =
5439           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5440                         std::max(1U, (MaxLocalUsers - 1)));
5441     }
5442 
5443     IC = std::min(IC, TmpIC);
5444   }
5445 
5446   // Clamp the interleave ranges to reasonable counts.
5447   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5448 
5449   // Check if the user has overridden the max.
5450   if (VF == 1) {
5451     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5452       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5453   } else {
5454     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5455       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5456   }
5457 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to be less than the trip count divided by VF.
5460   if (BestKnownTC) {
5461     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5462   }
5463 
5464   // If we did not calculate the cost for VF (because the user selected the VF)
5465   // then we calculate the cost of VF here.
5466   if (LoopCost == 0)
5467     LoopCost = expectedCost(VF).first;
5468 
5469   assert(LoopCost && "Non-zero loop cost expected");
5470 
  // Clamp the calculated IC to be between 1 and the maximum interleave count
  // that the target and trip count allow.
5473   if (IC > MaxInterleaveCount)
5474     IC = MaxInterleaveCount;
5475   else if (IC < 1)
5476     IC = 1;
5477 
5478   // Interleave if we vectorized this loop and there is a reduction that could
5479   // benefit from interleaving.
5480   if (VF > 1 && !Legal->getReductionVars().empty()) {
5481     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5482     return IC;
5483   }
5484 
5485   // Note that if we've already vectorized the loop we will have done the
5486   // runtime check and so interleaving won't require further checks.
5487   bool InterleavingRequiresRuntimePointerCheck =
5488       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5489 
5490   // We want to interleave small loops in order to reduce the loop overhead and
5491   // potentially expose ILP opportunities.
5492   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5493   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5494     // We assume that the cost overhead is 1 and we use the cost model
5495     // to estimate the cost of the loop and interleave until the cost of the
5496     // loop overhead is about 5% of the cost of the loop.
5497     unsigned SmallIC =
5498         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
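    // Worked example (illustrative numbers only): with SmallLoopCost = 20 and
    // LoopCost = 6, SmallIC = min(IC, PowerOf2Floor(20 / 6)) = min(IC, 2).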
5499 
5500     // Interleave until store/load ports (estimated by max interleave count) are
5501     // saturated.
5502     unsigned NumStores = Legal->getNumStores();
5503     unsigned NumLoads = Legal->getNumLoads();
5504     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5505     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5506 
5507     // If we have a scalar reduction (vector reductions are already dealt with
5508     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit it, by default, to 2,
    // so the critical path only gets increased by one reduction operation.
5511     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5512       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5513       SmallIC = std::min(SmallIC, F);
5514       StoresIC = std::min(StoresIC, F);
5515       LoadsIC = std::min(LoadsIC, F);
5516     }
5517 
5518     if (EnableLoadStoreRuntimeInterleave &&
5519         std::max(StoresIC, LoadsIC) > SmallIC) {
5520       LLVM_DEBUG(
5521           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5522       return std::max(StoresIC, LoadsIC);
5523     }
5524 
5525     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5526     return SmallIC;
5527   }
5528 
5529   // Interleave if this is a large loop (small loops are already dealt with by
5530   // this point) that could benefit from interleaving.
5531   bool HasReductions = !Legal->getReductionVars().empty();
5532   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5533     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5534     return IC;
5535   }
5536 
5537   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5538   return 1;
5539 }
5540 
5541 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5542 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
5548   // users starts an interval. We record every time that an in-loop value is
5549   // used, so we have a list of the first and last occurrences of each
5550   // instruction. Next, we transpose this data structure into a multi map that
5551   // holds the list of intervals that *end* at a specific location. This multi
5552   // map allows us to perform a linear search. We scan the instructions linearly
5553   // and record each time that a new interval starts, by placing it in a set.
5554   // If we find this value in the multi-map then we remove it from the set.
5555   // The max register usage is the maximum size of the set.
5556   // We also search for instructions that are defined outside the loop, but are
5557   // used inside the loop. We need this number separately from the max-interval
5558   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
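  // Illustrative sketch (not from the original comments): in a block like
  //   %a = load i32, i32* %p
  //   %b = add i32 %a, 1
  //   %c = mul i32 %a, %b
  // both %a and %b are still live when %c is reached, so the maximum number
  // of simultaneously open intervals (the usage estimate at VF = 1) is 2.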
5560   LoopBlocksDFS DFS(TheLoop);
5561   DFS.perform(LI);
5562 
5563   RegisterUsage RU;
5564 
5565   // Each 'key' in the map opens a new interval. The values
5566   // of the map are the index of the 'last seen' usage of the
5567   // instruction that is the key.
5568   using IntervalMap = DenseMap<Instruction *, unsigned>;
5569 
5570   // Maps instruction to its index.
5571   SmallVector<Instruction *, 64> IdxToInstr;
5572   // Marks the end of each interval.
5573   IntervalMap EndPoint;
  // Saves the instructions that have at least one use inside the loop.
5575   SmallPtrSet<Instruction *, 8> Ends;
5576   // Saves the list of values that are used in the loop but are
5577   // defined outside the loop, such as arguments and constants.
5578   SmallPtrSet<Value *, 8> LoopInvariants;
5579 
5580   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5581     for (Instruction &I : BB->instructionsWithoutDebug()) {
5582       IdxToInstr.push_back(&I);
5583 
5584       // Save the end location of each USE.
5585       for (Value *U : I.operands()) {
5586         auto *Instr = dyn_cast<Instruction>(U);
5587 
5588         // Ignore non-instruction values such as arguments, constants, etc.
5589         if (!Instr)
5590           continue;
5591 
5592         // If this instruction is outside the loop then record it and continue.
5593         if (!TheLoop->contains(Instr)) {
5594           LoopInvariants.insert(Instr);
5595           continue;
5596         }
5597 
5598         // Overwrite previous end points.
5599         EndPoint[Instr] = IdxToInstr.size();
5600         Ends.insert(Instr);
5601       }
5602     }
5603   }
5604 
5605   // Saves the list of intervals that end with the index in 'key'.
5606   using InstrList = SmallVector<Instruction *, 2>;
5607   DenseMap<unsigned, InstrList> TransposeEnds;
5608 
5609   // Transpose the EndPoints to a list of values that end at each index.
5610   for (auto &Interval : EndPoint)
5611     TransposeEnds[Interval.second].push_back(Interval.first);
5612 
5613   SmallPtrSet<Instruction *, 8> OpenIntervals;
5614 
5615   // Get the size of the widest register.
5616   unsigned MaxSafeDepDist = -1U;
5617   if (Legal->getMaxSafeDepDistBytes() != -1U)
5618     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5619   unsigned WidestRegister =
5620       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5621   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5622 
5623   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5624   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5625 
5626   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5627 
5628   // A lambda that gets the register usage for the given type and VF.
5629   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5630     if (Ty->isTokenTy())
5631       return 0U;
5632     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5633     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5634   };
5635 
5636   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5637     Instruction *I = IdxToInstr[i];
5638 
5639     // Remove all of the instructions that end at this location.
5640     InstrList &List = TransposeEnds[i];
5641     for (Instruction *ToRemove : List)
5642       OpenIntervals.erase(ToRemove);
5643 
5644     // Ignore instructions that are never used within the loop.
5645     if (!Ends.count(I))
5646       continue;
5647 
5648     // Skip ignored values.
5649     if (ValuesToIgnore.count(I))
5650       continue;
5651 
5652     // For each VF find the maximum usage of registers.
5653     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5654       // Count the number of live intervals.
5655       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5656 
5657       if (VFs[j] == 1) {
5658         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
5660           if (RegUsage.find(ClassID) == RegUsage.end())
5661             RegUsage[ClassID] = 1;
5662           else
5663             RegUsage[ClassID] += 1;
5664         }
5665       } else {
5666         collectUniformsAndScalars(VFs[j]);
5667         for (auto Inst : OpenIntervals) {
5668           // Skip ignored values for VF > 1.
5669           if (VecValuesToIgnore.count(Inst))
5670             continue;
5671           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
5673             if (RegUsage.find(ClassID) == RegUsage.end())
5674               RegUsage[ClassID] = 1;
5675             else
5676               RegUsage[ClassID] += 1;
5677           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
5679             if (RegUsage.find(ClassID) == RegUsage.end())
5680               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5681             else
5682               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5683           }
5684         }
5685       }
5686 
5687       for (auto& pair : RegUsage) {
5688         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
5690         else
5691           MaxUsages[j][pair.first] = pair.second;
5692       }
5693     }
5694 
5695     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5696                       << OpenIntervals.size() << '\n');
5697 
5698     // Add the current instruction to the list of open intervals.
5699     OpenIntervals.insert(I);
5700   }
5701 
5702   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5703     SmallMapVector<unsigned, unsigned, 4> Invariant;
5704 
5705     for (auto Inst : LoopInvariants) {
5706       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5708       if (Invariant.find(ClassID) == Invariant.end())
5709         Invariant[ClassID] = Usage;
5710       else
5711         Invariant[ClassID] += Usage;
5712     }
5713 
5714     LLVM_DEBUG({
5715       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5716       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5717              << " item\n";
5718       for (const auto &pair : MaxUsages[i]) {
5719         dbgs() << "LV(REG): RegisterClass: "
5720                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5721                << " registers\n";
5722       }
5723       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5724              << " item\n";
5725       for (const auto &pair : Invariant) {
5726         dbgs() << "LV(REG): RegisterClass: "
5727                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5728                << " registers\n";
5729       }
5730     });
5731 
5732     RU.LoopInvariantRegs = Invariant;
5733     RU.MaxLocalUsers = MaxUsages[i];
5734     RUs[i] = RU;
5735   }
5736 
5737   return RUs;
5738 }
5739 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5741   // TODO: Cost model for emulated masked load/store is completely
5742   // broken. This hack guides the cost model to use an artificially
5743   // high enough value to practically disable vectorization with such
5744   // operations, except where previously deployed legality hack allowed
5745   // using very low cost values. This is to avoid regressions coming simply
5746   // from moving "masked load/store" check from legality to cost model.
5747   // Masked Load/Gather emulation was previously never allowed.
  // A limited amount of Masked Store/Scatter emulation was allowed.
5749   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5750   return isa<LoadInst>(I) ||
5751          (isa<StoreInst>(I) &&
5752           NumPredStores > NumberOfStoresToPredicate);
5753 }
5754 
5755 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5756   // If we aren't vectorizing the loop, or if we've already collected the
5757   // instructions to scalarize, there's nothing to do. Collection may already
5758   // have occurred if we have a user-selected VF and are now computing the
5759   // expected cost for interleaving.
5760   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5761     return;
5762 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5764   // not profitable to scalarize any instructions, the presence of VF in the
5765   // map will indicate that we've analyzed it already.
5766   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5767 
5768   // Find all the instructions that are scalar with predication in the loop and
5769   // determine if it would be better to not if-convert the blocks they are in.
5770   // If so, we also record the instructions to scalarize.
5771   for (BasicBlock *BB : TheLoop->blocks()) {
5772     if (!blockNeedsPredication(BB))
5773       continue;
5774     for (Instruction &I : *BB)
5775       if (isScalarWithPredication(&I)) {
5776         ScalarCostsTy ScalarCosts;
5777         // Do not apply discount logic if hacked cost is needed
5778         // for emulated masked memrefs.
5779         if (!useEmulatedMaskMemRefHack(&I) &&
5780             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5781           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5782         // Remember that BB will remain after vectorization.
5783         PredicatedBBsAfterVectorization.insert(BB);
5784       }
5785   }
5786 }
5787 
5788 int LoopVectorizationCostModel::computePredInstDiscount(
5789     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5790     unsigned VF) {
5791   assert(!isUniformAfterVectorization(PredInst, VF) &&
5792          "Instruction marked uniform-after-vectorization will be predicated");
5793 
5794   // Initialize the discount to zero, meaning that the scalar version and the
5795   // vector version cost the same.
5796   int Discount = 0;
5797 
5798   // Holds instructions to analyze. The instructions we visit are mapped in
5799   // ScalarCosts. Those instructions are the ones that would be scalarized if
5800   // we find that the scalar version costs less.
5801   SmallVector<Instruction *, 8> Worklist;
5802 
5803   // Returns true if the given instruction can be scalarized.
5804   auto canBeScalarized = [&](Instruction *I) -> bool {
5805     // We only attempt to scalarize instructions forming a single-use chain
5806     // from the original predicated block that would otherwise be vectorized.
5807     // Although not strictly necessary, we give up on instructions we know will
5808     // already be scalar to avoid traversing chains that are unlikely to be
5809     // beneficial.
5810     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5811         isScalarAfterVectorization(I, VF))
5812       return false;
5813 
5814     // If the instruction is scalar with predication, it will be analyzed
5815     // separately. We ignore it within the context of PredInst.
5816     if (isScalarWithPredication(I))
5817       return false;
5818 
5819     // If any of the instruction's operands are uniform after vectorization,
5820     // the instruction cannot be scalarized. This prevents, for example, a
5821     // masked load from being scalarized.
5822     //
5823     // We assume we will only emit a value for lane zero of an instruction
5824     // marked uniform after vectorization, rather than VF identical values.
5825     // Thus, if we scalarize an instruction that uses a uniform, we would
5826     // create uses of values corresponding to the lanes we aren't emitting code
5827     // for. This behavior can be changed by allowing getScalarValue to clone
5828     // the lane zero values for uniforms rather than asserting.
5829     for (Use &U : I->operands())
5830       if (auto *J = dyn_cast<Instruction>(U.get()))
5831         if (isUniformAfterVectorization(J, VF))
5832           return false;
5833 
5834     // Otherwise, we can scalarize the instruction.
5835     return true;
5836   };
5837 
5838   // Compute the expected cost discount from scalarizing the entire expression
5839   // feeding the predicated instruction. We currently only consider expressions
5840   // that are single-use instruction chains.
5841   Worklist.push_back(PredInst);
5842   while (!Worklist.empty()) {
5843     Instruction *I = Worklist.pop_back_val();
5844 
5845     // If we've already analyzed the instruction, there's nothing to do.
5846     if (ScalarCosts.find(I) != ScalarCosts.end())
5847       continue;
5848 
5849     // Compute the cost of the vector instruction. Note that this cost already
5850     // includes the scalarization overhead of the predicated instruction.
5851     unsigned VectorCost = getInstructionCost(I, VF).first;
5852 
5853     // Compute the cost of the scalarized instruction. This cost is the cost of
5854     // the instruction as if it wasn't if-converted and instead remained in the
5855     // predicated block. We will scale this cost by block probability after
5856     // computing the scalarization overhead.
5857     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5858 
5859     // Compute the scalarization overhead of needed insertelement instructions
5860     // and phi nodes.
5861     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5862       ScalarCost += TTI.getScalarizationOverhead(
5863           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5864           APInt::getAllOnesValue(VF), true, false);
5865       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI,
5866                                             TTI::TCK_RecipThroughput);
5867     }
5868 
5869     // Compute the scalarization overhead of needed extractelement
5870     // instructions. For each of the instruction's operands, if the operand can
5871     // be scalarized, add it to the worklist; otherwise, account for the
5872     // overhead.
5873     for (Use &U : I->operands())
5874       if (auto *J = dyn_cast<Instruction>(U.get())) {
5875         assert(VectorType::isValidElementType(J->getType()) &&
5876                "Instruction has non-scalar type");
5877         if (canBeScalarized(J))
5878           Worklist.push_back(J);
5879         else if (needsExtract(J, VF))
5880           ScalarCost += TTI.getScalarizationOverhead(
5881               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5882               APInt::getAllOnesValue(VF), false, true);
5883       }
5884 
5885     // Scale the total scalar cost by block probability.
5886     ScalarCost /= getReciprocalPredBlockProb();
5887 
5888     // Compute the discount. A non-negative discount means the vector version
5889     // of the instruction costs more, and scalarizing would be beneficial.
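    // Worked example (illustrative numbers, ignoring scalarization overhead):
    // with VF = 4, a vector cost of 12 and a per-lane scalar cost of 2,
    // ScalarCost is 4 * 2 = 8, scaled down to 4 above (assuming a reciprocal
    // block probability of 2), so the discount grows by 12 - 4 = 8 and
    // scalarizing this chain looks profitable.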
5890     Discount += VectorCost - ScalarCost;
5891     ScalarCosts[I] = ScalarCost;
5892   }
5893 
5894   return Discount;
5895 }
5896 
5897 LoopVectorizationCostModel::VectorizationCostTy
5898 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5899   VectorizationCostTy Cost;
5900 
5901   // For each block.
5902   for (BasicBlock *BB : TheLoop->blocks()) {
5903     VectorizationCostTy BlockCost;
5904 
5905     // For each instruction in the old loop.
5906     for (Instruction &I : BB->instructionsWithoutDebug()) {
5907       // Skip ignored values.
5908       if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I)))
5909         continue;
5910 
5911       VectorizationCostTy C = getInstructionCost(&I, VF);
5912 
5913       // Check if we should override the cost.
5914       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5915         C.first = ForceTargetInstructionCost;
5916 
5917       BlockCost.first += C.first;
5918       BlockCost.second |= C.second;
5919       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5920                         << " for VF " << VF << " For instruction: " << I
5921                         << '\n');
5922     }
5923 
5924     // If we are vectorizing a predicated block, it will have been
5925     // if-converted. This means that the block's instructions (aside from
5926     // stores and instructions that may divide by zero) will now be
5927     // unconditionally executed. For the scalar case, we may not always execute
5928     // the predicated block. Thus, scale the block's cost by the probability of
5929     // executing it.
5930     if (VF == 1 && blockNeedsPredication(BB))
5931       BlockCost.first /= getReciprocalPredBlockProb();
5932 
5933     Cost.first += BlockCost.first;
5934     Cost.second |= BlockCost.second;
5935   }
5936 
5937   return Cost;
5938 }
5939 
5940 /// Gets Address Access SCEV after verifying that the access pattern
5941 /// is loop invariant except the induction variable dependence.
5942 ///
5943 /// This SCEV can be sent to the Target in order to estimate the address
5944 /// calculation cost.
5945 static const SCEV *getAddressAccessSCEV(
5946               Value *Ptr,
5947               LoopVectorizationLegality *Legal,
5948               PredicatedScalarEvolution &PSE,
5949               const Loop *TheLoop) {
5950 
5951   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5952   if (!Gep)
5953     return nullptr;
5954 
5955   // We are looking for a gep with all loop invariant indices except for one
5956   // which should be an induction variable.
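  // For example (illustrative):
  //   %addr = getelementptr [16 x i32], [16 x i32]* %A, i64 %inv, i64 %i
  // qualifies when %inv is loop invariant and %i is an induction variable.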
5957   auto SE = PSE.getSE();
5958   unsigned NumOperands = Gep->getNumOperands();
5959   for (unsigned i = 1; i < NumOperands; ++i) {
5960     Value *Opd = Gep->getOperand(i);
5961     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5962         !Legal->isInductionVariable(Opd))
5963       return nullptr;
5964   }
5965 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5967   return PSE.getSCEV(Ptr);
5968 }
5969 
5970 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5971   return Legal->hasStride(I->getOperand(0)) ||
5972          Legal->hasStride(I->getOperand(1));
5973 }
5974 
5975 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5976                                                                  unsigned VF) {
5977   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5978   Type *ValTy = getMemInstValueType(I);
5979   auto SE = PSE.getSE();
5980 
5981   unsigned AS = getLoadStoreAddressSpace(I);
5982   Value *Ptr = getLoadStorePointerOperand(I);
5983   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5984 
5985   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5987   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5988 
5989   // Get the cost of the scalar memory instruction and address computation.
5990   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5991 
5992   // Don't pass *I here, since it is scalar but will actually be part of a
5993   // vectorized loop where the user of it is a vectorized instruction.
5994   const Align Alignment = getLoadStoreAlignment(I);
5995   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5996                                    Alignment, AS,
5997                                    TTI::TCK_RecipThroughput);
5998 
5999   // Get the overhead of the extractelement and insertelement instructions
6000   // we might create due to scalarization.
6001   Cost += getScalarizationOverhead(I, VF);
6002 
6003   // If we have a predicated store, it may not be executed for each vector
6004   // lane. Scale the cost by the probability of executing the predicated
6005   // block.
6006   if (isPredicatedInst(I)) {
6007     Cost /= getReciprocalPredBlockProb();
6008 
6009     if (useEmulatedMaskMemRefHack(I))
6010       // Artificially setting to a high enough value to practically disable
6011       // vectorization with such operations.
6012       Cost = 3000000;
6013   }
6014 
6015   return Cost;
6016 }
6017 
6018 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6019                                                              unsigned VF) {
6020   Type *ValTy = getMemInstValueType(I);
6021   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6022   Value *Ptr = getLoadStorePointerOperand(I);
6023   unsigned AS = getLoadStoreAddressSpace(I);
6024   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6025   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6026 
6027   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6028          "Stride should be 1 or -1 for consecutive memory access");
6029   const Align Alignment = getLoadStoreAlignment(I);
6030   unsigned Cost = 0;
6031   if (Legal->isMaskRequired(I))
6032     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6033                                       CostKind);
6034   else
6035     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6036                                 CostKind, I);
6037 
6038   bool Reverse = ConsecutiveStride < 0;
6039   if (Reverse)
6040     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6041   return Cost;
6042 }
6043 
6044 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6045                                                          unsigned VF) {
6046   Type *ValTy = getMemInstValueType(I);
6047   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6048   const Align Alignment = getLoadStoreAlignment(I);
6049   unsigned AS = getLoadStoreAddressSpace(I);
6050   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
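  // A uniform load is modelled as a scalar load of the invariant address plus
  // a broadcast of the loaded value into a vector.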
6051   if (isa<LoadInst>(I)) {
6052     return TTI.getAddressComputationCost(ValTy) +
6053            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6054                                CostKind) +
6055            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6056   }
6057   StoreInst *SI = cast<StoreInst>(I);
6058 
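  // A uniform store is modelled as a scalar store; if the stored value is not
  // loop invariant, the last vector lane must additionally be extracted.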
6059   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6060   return TTI.getAddressComputationCost(ValTy) +
6061          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6062                              CostKind) +
6063          (isLoopInvariantStoreValue
6064               ? 0
6065               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6066                                        VF - 1));
6067 }
6068 
6069 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6070                                                           unsigned VF) {
6071   Type *ValTy = getMemInstValueType(I);
6072   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6073   const Align Alignment = getLoadStoreAlignment(I);
6074   const Value *Ptr = getLoadStorePointerOperand(I);
6075 
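  // The cost is the vector address computation plus the target's
  // gather/scatter cost, taking any required mask into account.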
6076   return TTI.getAddressComputationCost(VectorTy) +
6077          TTI.getGatherScatterOpCost(
6078              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6079              TargetTransformInfo::TCK_RecipThroughput, I);
6080 }
6081 
6082 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6083                                                             unsigned VF) {
6084   Type *ValTy = getMemInstValueType(I);
6085   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6086   unsigned AS = getLoadStoreAddressSpace(I);
6087 
6088   auto Group = getInterleavedAccessGroup(I);
6089   assert(Group && "Failed to get an interleaved access group.");
6090 
6091   unsigned InterleaveFactor = Group->getFactor();
6092   auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor);
6093 
6094   // Holds the indices of existing members in an interleaved load group.
6095   // An interleaved store group doesn't need this as it doesn't allow gaps.
6096   SmallVector<unsigned, 4> Indices;
6097   if (isa<LoadInst>(I)) {
6098     for (unsigned i = 0; i < InterleaveFactor; i++)
6099       if (Group->getMember(i))
6100         Indices.push_back(i);
6101   }
6102 
6103   // Calculate the cost of the whole interleaved group.
6104   bool UseMaskForGaps =
6105       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6106   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6107       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6108       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6109 
6110   if (Group->isReverse()) {
6111     // TODO: Add support for reversed masked interleaved access.
6112     assert(!Legal->isMaskRequired(I) &&
6113            "Reverse masked interleaved access not supported.");
6114     Cost += Group->getNumMembers() *
6115             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6116   }
6117   return Cost;
6118 }
6119 
6120 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6121                                                               unsigned VF) {
6122   // Calculate the scalar cost only. The vectorization cost should be ready
6123   // at this point.
6124   if (VF == 1) {
6125     Type *ValTy = getMemInstValueType(I);
6126     const Align Alignment = getLoadStoreAlignment(I);
6127     unsigned AS = getLoadStoreAddressSpace(I);
6128 
6129     return TTI.getAddressComputationCost(ValTy) +
6130            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6131                                TTI::TCK_RecipThroughput, I);
6132   }
6133   return getWideningCost(I, VF);
6134 }
6135 
6136 LoopVectorizationCostModel::VectorizationCostTy
6137 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
6138   // If we know that this instruction will remain uniform, check the cost of
6139   // the scalar version.
6140   if (isUniformAfterVectorization(I, VF))
6141     VF = 1;
6142 
6143   if (VF > 1 && isProfitableToScalarize(I, VF))
6144     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6145 
6146   // Forced scalars do not have any scalarization overhead.
6147   auto ForcedScalar = ForcedScalars.find(VF);
6148   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
6149     auto InstSet = ForcedScalar->second;
6150     if (InstSet.count(I))
6151       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
6152   }
6153 
6154   Type *VectorTy;
6155   unsigned C = getInstructionCost(I, VF, VectorTy);
6156 
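  // The widened type counts as scalarized if target legalization splits it
  // into at least VF parts, i.e. one (or more) part per lane.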
6157   bool TypeNotScalarized =
6158       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
6159   return VectorizationCostTy(C, TypeNotScalarized);
6160 }
6161 
6162 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6163                                                               unsigned VF) {
6164 
6165   if (VF == 1)
6166     return 0;
6167 
6168   unsigned Cost = 0;
6169   Type *RetTy = ToVectorTy(I->getType(), VF);
6170   if (!RetTy->isVoidTy() &&
6171       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6172     Cost += TTI.getScalarizationOverhead(
6173         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false);
6174 
6175   // Some targets keep addresses scalar.
6176   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6177     return Cost;
6178 
6179   // Some targets support efficient element stores.
6180   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6181     return Cost;
6182 
6183   // Collect operands to consider.
6184   CallInst *CI = dyn_cast<CallInst>(I);
6185   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6186 
6187   // Skip operands that do not require extraction/scalarization and do not incur
6188   // any overhead.
6189   return Cost + TTI.getOperandsScalarizationOverhead(
6190                     filterExtractingOperands(Ops, VF), VF);
6191 }
6192 
6193 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6194   if (VF == 1)
6195     return;
6196   NumPredStores = 0;
6197   for (BasicBlock *BB : TheLoop->blocks()) {
6198     // For each instruction in the old loop.
6199     for (Instruction &I : *BB) {
6200       Value *Ptr = getLoadStorePointerOperand(&I);
6201       if (!Ptr)
6202         continue;
6203 
6204       // TODO: We should generate better code and update the cost model for
6205       // predicated uniform stores. Today they are treated as any other
6206       // predicated store (see added test cases in
6207       // invariant-store-vectorization.ll).
6208       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6209         NumPredStores++;
6210 
6211       if (Legal->isUniform(Ptr) &&
6212           // Conditional loads and stores should be scalarized and predicated.
6213           // isScalarWithPredication cannot be used here since masked
6214           // gather/scatters are not considered scalar with predication.
6215           !Legal->blockNeedsPredication(I.getParent())) {
6216         // TODO: Avoid replicating loads and stores instead of
6217         // relying on instcombine to remove them.
6218         // Load: Scalar load + broadcast
6219         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6220         unsigned Cost = getUniformMemOpCost(&I, VF);
6221         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6222         continue;
6223       }
6224 
6225       // We assume that widening is the best solution when possible.
6226       if (memoryInstructionCanBeWidened(&I, VF)) {
6227         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6228         int ConsecutiveStride =
6229                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6230         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6231                "Expected consecutive stride.");
6232         InstWidening Decision =
6233             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6234         setWideningDecision(&I, VF, Decision, Cost);
6235         continue;
6236       }
6237 
6238       // Choose between Interleaving, Gather/Scatter or Scalarization.
6239       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6240       unsigned NumAccesses = 1;
6241       if (isAccessInterleaved(&I)) {
6242         auto Group = getInterleavedAccessGroup(&I);
6243         assert(Group && "Failed to get an interleaved access group.");
6244 
6245         // Make one decision for the whole group.
6246         if (getWideningDecision(&I, VF) != CM_Unknown)
6247           continue;
6248 
6249         NumAccesses = Group->getNumMembers();
6250         if (interleavedAccessCanBeWidened(&I, VF))
6251           InterleaveCost = getInterleaveGroupCost(&I, VF);
6252       }
6253 
6254       unsigned GatherScatterCost =
6255           isLegalGatherOrScatter(&I)
6256               ? getGatherScatterCost(&I, VF) * NumAccesses
6257               : std::numeric_limits<unsigned>::max();
6258 
6259       unsigned ScalarizationCost =
6260           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6261 
6262       // Choose better solution for the current VF,
6263       // write down this decision and use it during vectorization.
6264       unsigned Cost;
6265       InstWidening Decision;
6266       if (InterleaveCost <= GatherScatterCost &&
6267           InterleaveCost < ScalarizationCost) {
6268         Decision = CM_Interleave;
6269         Cost = InterleaveCost;
6270       } else if (GatherScatterCost < ScalarizationCost) {
6271         Decision = CM_GatherScatter;
6272         Cost = GatherScatterCost;
6273       } else {
6274         Decision = CM_Scalarize;
6275         Cost = ScalarizationCost;
6276       }
6277       // If the instruction belongs to an interleave group, the whole group
6278       // receives the same decision. The cost is computed for the whole group
6279       // but will actually be assigned to a single instruction.
6280       if (auto Group = getInterleavedAccessGroup(&I))
6281         setWideningDecision(Group, VF, Decision, Cost);
6282       else
6283         setWideningDecision(&I, VF, Decision, Cost);
6284     }
6285   }
6286 
6287   // Make sure that any load of an address and any other address computation
6288   // remain scalar unless there is gather/scatter support. This avoids
6289   // inevitable extracts into address registers, and also has the benefit of
6290   // activating LSR more, since that pass can't optimize vectorized
6291   // addresses.
6292   if (TTI.prefersVectorizedAddressing())
6293     return;
6294 
6295   // Start with all scalar pointer uses.
6296   SmallPtrSet<Instruction *, 8> AddrDefs;
6297   for (BasicBlock *BB : TheLoop->blocks())
6298     for (Instruction &I : *BB) {
6299       Instruction *PtrDef =
6300         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6301       if (PtrDef && TheLoop->contains(PtrDef) &&
6302           getWideningDecision(&I, VF) != CM_GatherScatter)
6303         AddrDefs.insert(PtrDef);
6304     }
6305 
6306   // Add all instructions used to generate the addresses.
6307   SmallVector<Instruction *, 4> Worklist;
6308   for (auto *I : AddrDefs)
6309     Worklist.push_back(I);
6310   while (!Worklist.empty()) {
6311     Instruction *I = Worklist.pop_back_val();
6312     for (auto &Op : I->operands())
6313       if (auto *InstOp = dyn_cast<Instruction>(Op))
6314         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6315             AddrDefs.insert(InstOp).second)
6316           Worklist.push_back(InstOp);
6317   }
6318 
6319   for (auto *I : AddrDefs) {
6320     if (isa<LoadInst>(I)) {
6321       // Setting the desired widening decision should ideally be handled by
6322       // the cost functions, but since this involves finding out whether the
6323       // loaded register is involved in an address computation, the decision
6324       // is instead changed here once we know this is the case.
6325       InstWidening Decision = getWideningDecision(I, VF);
6326       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6327         // Scalarize a widened load of address.
6328         setWideningDecision(I, VF, CM_Scalarize,
6329                             (VF * getMemoryInstructionCost(I, 1)));
6330       else if (auto Group = getInterleavedAccessGroup(I)) {
6331         // Scalarize an interleave group of address loads.
6332         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6333           if (Instruction *Member = Group->getMember(I))
6334             setWideningDecision(Member, VF, CM_Scalarize,
6335                                 (VF * getMemoryInstructionCost(Member, 1)));
6336         }
6337       }
6338     } else
6339       // Make sure I gets scalarized and a cost estimate without
6340       // scalarization overhead.
6341       ForcedScalars[VF].insert(I);
6342   }
6343 }
6344 
6345 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6346                                                         unsigned VF,
6347                                                         Type *&VectorTy) {
6348   Type *RetTy = I->getType();
6349   if (canTruncateToMinimalBitwidth(I, VF))
6350     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6351   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6352   auto SE = PSE.getSE();
6353   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6354 
6355   // TODO: We need to estimate the cost of intrinsic calls.
6356   switch (I->getOpcode()) {
6357   case Instruction::GetElementPtr:
6358     // We mark this instruction as zero-cost because the cost of GEPs in
6359     // vectorized code depends on whether the corresponding memory instruction
6360     // is scalarized or not. Therefore, we handle GEPs with the memory
6361     // instruction cost.
6362     return 0;
6363   case Instruction::Br: {
6364     // In cases of scalarized and predicated instructions, there will be VF
6365     // predicated blocks in the vectorized loop. Each branch around these
6366     // blocks also requires an extract of its vector compare i1 element.
6367     bool ScalarPredicatedBB = false;
6368     BranchInst *BI = cast<BranchInst>(I);
6369     if (VF > 1 && BI->isConditional() &&
6370         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6371          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6372       ScalarPredicatedBB = true;
6373 
6374     if (ScalarPredicatedBB) {
6375       // Return cost for branches around scalarized and predicated blocks.
6376       auto *Vec_i1Ty =
6377           FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6378       return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
6379                                            false, true) +
6380               (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF));
6381     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6382       // The back-edge branch will remain, as will all scalar branches.
6383       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6384     else
6385       // This branch will be eliminated by if-conversion.
6386       return 0;
6387     // Note: We currently assume zero cost for an unconditional branch inside
6388     // a predicated block since it will become a fall-through, although we
6389     // may decide in the future to call TTI for all branches.
6390   }
6391   case Instruction::PHI: {
6392     auto *Phi = cast<PHINode>(I);
6393 
6394     // First-order recurrences are replaced by vector shuffles inside the loop.
6395     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6396     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6397       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6398                                 cast<VectorType>(VectorTy), VF - 1,
6399                                 FixedVectorType::get(RetTy, 1));
6400 
6401     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6402     // converted into select instructions. We require N - 1 selects per phi
6403     // node, where N is the number of incoming values.
6404     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6405       return (Phi->getNumIncomingValues() - 1) *
6406              TTI.getCmpSelInstrCost(
6407                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6408                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6409                  CostKind);
6410 
6411     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6412   }
6413   case Instruction::UDiv:
6414   case Instruction::SDiv:
6415   case Instruction::URem:
6416   case Instruction::SRem:
6417     // If we have a predicated instruction, it may not be executed for each
6418     // vector lane. Get the scalarization cost and scale this amount by the
6419     // probability of executing the predicated block. If the instruction is not
6420     // predicated, we fall through to the next case.
6421     if (VF > 1 && isScalarWithPredication(I)) {
6422       unsigned Cost = 0;
6423 
6424       // These instructions have a non-void type, so account for the phi nodes
6425       // that we will create. This cost is likely to be zero. The phi node
6426       // cost, if any, should be scaled by the block probability because it
6427       // models a copy at the end of each predicated block.
6428       Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6429 
6430       // The cost of the non-predicated instruction.
6431       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6432 
6433       // The cost of insertelement and extractelement instructions needed for
6434       // scalarization.
6435       Cost += getScalarizationOverhead(I, VF);
6436 
6437       // Scale the cost by the probability of executing the predicated blocks.
6438       // This assumes the predicated block for each vector lane is equally
6439       // likely.
6440       return Cost / getReciprocalPredBlockProb();
6441     }
6442     LLVM_FALLTHROUGH;
6443   case Instruction::Add:
6444   case Instruction::FAdd:
6445   case Instruction::Sub:
6446   case Instruction::FSub:
6447   case Instruction::Mul:
6448   case Instruction::FMul:
6449   case Instruction::FDiv:
6450   case Instruction::FRem:
6451   case Instruction::Shl:
6452   case Instruction::LShr:
6453   case Instruction::AShr:
6454   case Instruction::And:
6455   case Instruction::Or:
6456   case Instruction::Xor: {
6457     // Since we will replace the stride by 1, the multiplication should go away.
6458     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6459       return 0;
6460     // Certain instructions can be cheaper to vectorize if they have a constant
6461     // second vector operand. One example of this is shifts on x86.
6462     Value *Op2 = I->getOperand(1);
6463     TargetTransformInfo::OperandValueProperties Op2VP;
6464     TargetTransformInfo::OperandValueKind Op2VK =
6465         TTI.getOperandInfo(Op2, Op2VP);
6466     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6467       Op2VK = TargetTransformInfo::OK_UniformValue;
6468 
6469     SmallVector<const Value *, 4> Operands(I->operand_values());
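    // If the instruction stays scalar after vectorization, it is replicated VF
    // times; otherwise a single widened instruction is emitted.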
6470     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6471     return N * TTI.getArithmeticInstrCost(
6472                    I->getOpcode(), VectorTy, CostKind,
6473                    TargetTransformInfo::OK_AnyValue,
6474                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6475   }
6476   case Instruction::FNeg: {
6477     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6478     return N * TTI.getArithmeticInstrCost(
6479                    I->getOpcode(), VectorTy, CostKind,
6480                    TargetTransformInfo::OK_AnyValue,
6481                    TargetTransformInfo::OK_AnyValue,
6482                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6483                    I->getOperand(0), I);
6484   }
6485   case Instruction::Select: {
6486     SelectInst *SI = cast<SelectInst>(I);
6487     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6488     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6489     Type *CondTy = SI->getCondition()->getType();
6490     if (!ScalarCond)
6491       CondTy = FixedVectorType::get(CondTy, VF);
6492 
6493     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6494                                   CostKind, I);
6495   }
6496   case Instruction::ICmp:
6497   case Instruction::FCmp: {
6498     Type *ValTy = I->getOperand(0)->getType();
6499     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6500     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6501       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6502     VectorTy = ToVectorTy(ValTy, VF);
6503     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6504                                   I);
6505   }
6506   case Instruction::Store:
6507   case Instruction::Load: {
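    // If the access was decided to be scalarized, report the scalar value type
    // to the caller via VectorTy; otherwise report the widened vector type.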
6508     unsigned Width = VF;
6509     if (Width > 1) {
6510       InstWidening Decision = getWideningDecision(I, Width);
6511       assert(Decision != CM_Unknown &&
6512              "CM decision should be taken at this point");
6513       if (Decision == CM_Scalarize)
6514         Width = 1;
6515     }
6516     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6517     return getMemoryInstructionCost(I, VF);
6518   }
6519   case Instruction::ZExt:
6520   case Instruction::SExt:
6521   case Instruction::FPToUI:
6522   case Instruction::FPToSI:
6523   case Instruction::FPExt:
6524   case Instruction::PtrToInt:
6525   case Instruction::IntToPtr:
6526   case Instruction::SIToFP:
6527   case Instruction::UIToFP:
6528   case Instruction::Trunc:
6529   case Instruction::FPTrunc:
6530   case Instruction::BitCast: {
6531     // Computes the CastContextHint from a Load/Store instruction.
6532     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6533       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6534              "Expected a load or a store!");
6535 
6536       if (VF == 1 || !TheLoop->contains(I))
6537         return TTI::CastContextHint::Normal;
6538 
6539       switch (getWideningDecision(I, VF)) {
6540       case LoopVectorizationCostModel::CM_GatherScatter:
6541         return TTI::CastContextHint::GatherScatter;
6542       case LoopVectorizationCostModel::CM_Interleave:
6543         return TTI::CastContextHint::Interleave;
6544       case LoopVectorizationCostModel::CM_Scalarize:
6545       case LoopVectorizationCostModel::CM_Widen:
6546         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6547                                         : TTI::CastContextHint::Normal;
6548       case LoopVectorizationCostModel::CM_Widen_Reverse:
6549         return TTI::CastContextHint::Reversed;
6550       case LoopVectorizationCostModel::CM_Unknown:
6551         llvm_unreachable("Instr did not go through cost modelling?");
6552       }
6553 
6554       llvm_unreachable("Unhandled case!");
6555     };
6556 
6557     unsigned Opcode = I->getOpcode();
6558     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6559     // For Trunc, the context is the only user, which must be a StoreInst.
6560     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6561       if (I->hasOneUse())
6562         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6563           CCH = ComputeCCH(Store);
6564     }
6565     // For Z/Sext, the context is the operand, which must be a LoadInst.
6566     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6567              Opcode == Instruction::FPExt) {
6568       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6569         CCH = ComputeCCH(Load);
6570     }
6571 
6572     // We optimize the truncation of induction variables having constant
6573     // integer steps. The cost of these truncations is the same as the scalar
6574     // operation.
6575     if (isOptimizableIVTruncate(I, VF)) {
6576       auto *Trunc = cast<TruncInst>(I);
6577       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6578                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6579     }
6580 
6581     Type *SrcScalarTy = I->getOperand(0)->getType();
6582     Type *SrcVecTy =
6583         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6584     if (canTruncateToMinimalBitwidth(I, VF)) {
6585       // This cast is going to be shrunk. This may remove the cast or it might
6586       // turn it into a slightly different cast. For example, if MinBW == 16,
6587       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6588       //
6589       // Calculate the modified src and dest types.
6590       Type *MinVecTy = VectorTy;
6591       if (Opcode == Instruction::Trunc) {
6592         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6593         VectorTy =
6594             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6595       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6596         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6597         VectorTy =
6598             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6599       }
6600     }
6601 
6602     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6603     return N *
6604            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6605   }
6606   case Instruction::Call: {
6607     bool NeedToScalarize;
6608     CallInst *CI = cast<CallInst>(I);
6609     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6610     if (getVectorIntrinsicIDForCall(CI, TLI))
6611       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6612     return CallCost;
6613   }
6614   default:
6615     // The cost of executing VF copies of the scalar instruction. This opcode
6616     // is unknown. Assume that it is the same as 'mul'.
6617     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
6618                                            CostKind) +
6619            getScalarizationOverhead(I, VF);
6620   } // end of switch.
6621 }
6622 
6623 char LoopVectorize::ID = 0;
6624 
6625 static const char lv_name[] = "Loop Vectorization";
6626 
6627 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6628 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6629 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6630 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6631 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6632 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6633 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6634 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6635 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6636 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6637 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6638 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6639 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6640 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6641 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6642 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6643 
6644 namespace llvm {
6645 
6646 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6647 
6648 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6649                               bool VectorizeOnlyWhenForced) {
6650   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6651 }
6652 
6653 } // end namespace llvm
6654 
6655 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6656   // Check if the pointer operand of a load or store instruction is
6657   // consecutive.
6658   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6659     return Legal->isConsecutivePtr(Ptr);
6660   return false;
6661 }
6662 
6663 void LoopVectorizationCostModel::collectValuesToIgnore() {
6664   // Ignore ephemeral values.
6665   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6666 
6667   // Ignore type-promoting instructions we identified during reduction
6668   // detection.
6669   for (auto &Reduction : Legal->getReductionVars()) {
6670     RecurrenceDescriptor &RedDes = Reduction.second;
6671     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6672     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6673   }
6674   // Ignore type-casting instructions we identified during induction
6675   // detection.
6676   for (auto &Induction : Legal->getInductionVars()) {
6677     InductionDescriptor &IndDes = Induction.second;
6678     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6679     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6680   }
6681 }
6682 
6683 void LoopVectorizationCostModel::collectInLoopReductions() {
6684   // For the moment, without predicated reduction instructions, we do not
6685   // support in-loop reductions while folding the tail, and hence in those
6686   // cases all reductions are currently kept out of the loop.
6687   if (!PreferInLoopReductions || foldTailByMasking())
6688     return;
6689 
6690   for (auto &Reduction : Legal->getReductionVars()) {
6691     PHINode *Phi = Reduction.first;
6692     RecurrenceDescriptor &RdxDesc = Reduction.second;
6693 
6694     // We don't collect reductions that are type promoted (yet).
6695     if (RdxDesc.getRecurrenceType() != Phi->getType())
6696       continue;
6697 
6698     // Check that we can correctly put the reductions into the loop, by
6699     // finding the chain of operations that leads from the phi to the loop
6700     // exit value.
6701     SmallVector<Instruction *, 4> ReductionOperations =
6702         RdxDesc.getReductionOpChain(Phi, TheLoop);
6703     bool InLoop = !ReductionOperations.empty();
6704     if (InLoop)
6705       InLoopReductionChains[Phi] = ReductionOperations;
6706     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6707                       << " reduction for phi: " << *Phi << "\n");
6708   }
6709 }
6710 
6711 // TODO: we could return a pair of values that specify the max VF and
6712 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6713 // `buildVPlans(VF, VF)`. We cannot do this yet because VPlan does not
6714 // currently have a cost model that can choose which plan to execute if
6715 // more than one is generated.
6716 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6717                                  LoopVectorizationCostModel &CM) {
6718   unsigned WidestType;
6719   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6720   return WidestVectorRegBits / WidestType;
6721 }
6722 
6723 VectorizationFactor
6724 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6725   unsigned VF = UserVF;
6726   // Outer loop handling: outer loops may require CFG and instruction-level
6727   // transformations before even evaluating whether vectorization is profitable.
6728   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6729   // the vectorization pipeline.
6730   if (!OrigLoop->empty()) {
6731     // If the user doesn't provide a vectorization factor, determine a
6732     // reasonable one.
6733     if (!UserVF) {
6734       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6735       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6736 
6737       // Make sure we have a VF > 1 for stress testing.
6738       if (VPlanBuildStressTest && VF < 2) {
6739         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6740                           << "overriding computed VF.\n");
6741         VF = 4;
6742       }
6743     }
6744     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6745     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6746     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6747                       << " to build VPlans.\n");
6748     buildVPlans(VF, VF);
6749 
6750     // For VPlan build stress testing, we bail out after VPlan construction.
6751     if (VPlanBuildStressTest)
6752       return VectorizationFactor::Disabled();
6753 
6754     return {VF, 0};
6755   }
6756 
6757   LLVM_DEBUG(
6758       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6759                 "VPlan-native path.\n");
6760   return VectorizationFactor::Disabled();
6761 }
6762 
6763 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
6764                                                              unsigned UserIC) {
6765   assert(OrigLoop->empty() && "Inner loop expected.");
6766   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
6767   if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6768     return None;
6769 
6770   // Invalidate interleave groups if all blocks of loop will be predicated.
6771   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6772       !useMaskedInterleavedAccesses(*TTI)) {
6773     LLVM_DEBUG(
6774         dbgs()
6775         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6776            "which requires masked-interleaved support.\n");
6777     if (CM.InterleaveInfo.invalidateGroups())
6778       // Invalidating interleave groups also requires invalidating all decisions
6779       // based on them, which includes widening decisions and uniform and scalar
6780       // values.
6781       CM.invalidateCostModelingDecisions();
6782   }
6783 
6784   if (UserVF) {
6785     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6786     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6787     // Collect the instructions (and their associated costs) that will be more
6788     // profitable to scalarize.
6789     CM.selectUserVectorizationFactor(UserVF);
6790     CM.collectInLoopReductions();
6791     buildVPlansWithVPRecipes(UserVF, UserVF);
6792     LLVM_DEBUG(printPlans(dbgs()));
6793     return {{UserVF, 0}};
6794   }
6795 
6796   unsigned MaxVF = MaybeMaxVF.getValue();
6797   assert(MaxVF != 0 && "MaxVF is zero.");
6798 
6799   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6800     // Collect Uniform and Scalar instructions after vectorization with VF.
6801     CM.collectUniformsAndScalars(VF);
6802 
6803     // Collect the instructions (and their associated costs) that will be more
6804     // profitable to scalarize.
6805     if (VF > 1)
6806       CM.collectInstsToScalarize(VF);
6807   }
6808 
6809   CM.collectInLoopReductions();
6810 
6811   buildVPlansWithVPRecipes(1, MaxVF);
6812   LLVM_DEBUG(printPlans(dbgs()));
6813   if (MaxVF == 1)
6814     return VectorizationFactor::Disabled();
6815 
6816   // Select the optimal vectorization factor.
6817   return CM.selectVectorizationFactor(MaxVF);
6818 }
6819 
6820 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6821   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6822                     << '\n');
6823   BestVF = VF;
6824   BestUF = UF;
6825 
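  // Discard all VPlans that do not cover the chosen VF; exactly one plan
  // should remain.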
6826   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6827     return !Plan->hasVF(VF);
6828   });
6829   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6830 }
6831 
6832 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6833                                            DominatorTree *DT) {
6834   // Perform the actual loop transformation.
6835 
6836   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6837   VPCallbackILV CallbackILV(ILV);
6838 
6839   VPTransformState State{BestVF, BestUF,      LI,
6840                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6841                          &ILV,   CallbackILV};
6842   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6843   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6844   State.CanonicalIV = ILV.Induction;
6845 
6846   //===------------------------------------------------===//
6847   //
6848   // Notice: any optimization or new instruction that goes
6849   // into the code below should also be implemented in
6850   // the cost-model.
6851   //
6852   //===------------------------------------------------===//
6853 
6854   // 2. Copy and widen instructions from the old loop into the new loop.
6855   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6856   VPlans.front()->execute(&State);
6857 
6858   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6859   //    predication, updating analyses.
6860   ILV.fixVectorizedLoop();
6861 }
6862 
6863 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6864     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6865   BasicBlock *Latch = OrigLoop->getLoopLatch();
6866 
6867   // We create new control-flow for the vectorized loop, so the original
6868   // condition will be dead after vectorization if it's only used by the
6869   // branch.
6870   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6871   if (Cmp && Cmp->hasOneUse())
6872     DeadInstructions.insert(Cmp);
6873 
6874   // We create new "steps" for induction variable updates to which the original
6875   // induction variables map. An original update instruction will be dead if
6876   // all its users except the induction variable are dead.
6877   for (auto &Induction : Legal->getInductionVars()) {
6878     PHINode *Ind = Induction.first;
6879     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6880     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6881           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
6882         }))
6883       DeadInstructions.insert(IndUpdate);
6884 
6885     // We also record as "Dead" the type-casting instructions we identified
6886     // during induction analysis. We don't need any handling for them in the
6887     // vectorized loop because we have proven that, under a proper runtime
6888     // test guarding the vectorized loop, the value of the phi, and the casted
6889     // value of the phi, are the same. The last instruction in this casting chain
6890     // will get its scalar/vector/widened def from the scalar/vector/widened def
6891     // of the respective phi node. Any other casts in the induction def-use chain
6892     // have no other uses outside the phi update chain, and will be ignored.
6893     InductionDescriptor &IndDes = Induction.second;
6894     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6895     DeadInstructions.insert(Casts.begin(), Casts.end());
6896   }
6897 }
6898 
6899 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6900 
6901 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6902 
6903 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6904                                         Instruction::BinaryOps BinOp) {
6905   // When unrolling and the VF is 1, we only need to add a simple scalar.
6906   Type *Ty = Val->getType();
6907   assert(!Ty->isVectorTy() && "Val must be a scalar");
6908 
6909   if (Ty->isFloatingPointTy()) {
6910     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6911 
6912     // Floating point operations had to be 'fast' to enable the unrolling.
6913     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6914     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6915   }
6916   Constant *C = ConstantInt::get(Ty, StartIdx);
6917   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6918 }
6919 
6920 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6921   SmallVector<Metadata *, 4> MDs;
6922   // Reserve first location for self reference to the LoopID metadata node.
6923   MDs.push_back(nullptr);
6924   bool IsUnrollMetadata = false;
6925   MDNode *LoopID = L->getLoopID();
6926   if (LoopID) {
6927     // First find existing loop unrolling disable metadata.
6928     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6929       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6930       if (MD) {
6931         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6932         IsUnrollMetadata =
6933             S && S->getString().startswith("llvm.loop.unroll.disable");
6934       }
6935       MDs.push_back(LoopID->getOperand(i));
6936     }
6937   }
6938 
6939   if (!IsUnrollMetadata) {
6940     // Add runtime unroll disable metadata.
6941     LLVMContext &Context = L->getHeader()->getContext();
6942     SmallVector<Metadata *, 1> DisableOperands;
6943     DisableOperands.push_back(
6944         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6945     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6946     MDs.push_back(DisableNode);
6947     MDNode *NewLoopID = MDNode::get(Context, MDs);
6948     // Set operand 0 to refer to the loop id itself.
6949     NewLoopID->replaceOperandWith(0, NewLoopID);
6950     L->setLoopID(NewLoopID);
6951   }
6952 }
6953 
6954 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6955     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6956   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6957   bool PredicateAtRangeStart = Predicate(Range.Start);
6958 
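  // Walk the powers of two in the range and clamp Range.End at the first VF
  // whose decision differs from the decision taken at Range.Start.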
6959   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6960     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6961       Range.End = TmpVF;
6962       break;
6963     }
6964 
6965   return PredicateAtRangeStart;
6966 }
6967 
6968 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6969 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6970 /// of VF's starting at a given VF and extending it as much as possible. Each
6971 /// vectorization decision can potentially shorten this sub-range during
6972 /// buildVPlan().
6973 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6974   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6975     VFRange SubRange = {VF, MaxVF + 1};
6976     VPlans.push_back(buildVPlan(SubRange));
6977     VF = SubRange.End;
6978   }
6979 }
6980 
6981 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6982                                          VPlanPtr &Plan) {
6983   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6984 
6985   // Look for cached value.
6986   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6987   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6988   if (ECEntryIt != EdgeMaskCache.end())
6989     return ECEntryIt->second;
6990 
6991   VPValue *SrcMask = createBlockInMask(Src, Plan);
6992 
6993   // The terminator has to be a branch inst!
6994   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6995   assert(BI && "Unexpected terminator found");
6996 
6997   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6998     return EdgeMaskCache[Edge] = SrcMask;
6999 
7000   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
7001   assert(EdgeMask && "No Edge Mask found for condition");
7002 
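  // If Dst is reached via the false successor, the edge is taken when the
  // branch condition is false, so negate the condition to form the edge mask.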
7003   if (BI->getSuccessor(0) != Dst)
7004     EdgeMask = Builder.createNot(EdgeMask);
7005 
7006   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7007     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7008 
7009   return EdgeMaskCache[Edge] = EdgeMask;
7010 }
7011 
7012 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7013   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7014 
7015   // Look for cached value.
7016   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7017   if (BCEntryIt != BlockMaskCache.end())
7018     return BCEntryIt->second;
7019 
7020   // All-one mask is modelled as no-mask following the convention for masked
7021   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7022   VPValue *BlockMask = nullptr;
7023 
7024   if (OrigLoop->getHeader() == BB) {
7025     if (!CM.blockNeedsPredication(BB))
7026       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7027 
7028     // Introduce the early-exit compare IV <= BTC to form header block mask.
7029     // This is used instead of IV < TC because TC may wrap, unlike BTC.
7030     // Start by constructing the desired canonical IV.
7031     VPValue *IV = nullptr;
7032     if (Legal->getPrimaryInduction())
7033       IV = Plan->getVPValue(Legal->getPrimaryInduction());
7034     else {
7035       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7036       Builder.getInsertBlock()->appendRecipe(IVRecipe);
7037       IV = IVRecipe->getVPValue();
7038     }
7039     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7040     bool TailFolded = !CM.isScalarEpilogueAllowed();
7041     if (TailFolded && CM.TTI.emitGetActiveLaneMask())
7042       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
7043     else
7044       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7045     return BlockMaskCache[BB] = BlockMask;
7046   }
7047 
7048   // This is the block mask. We OR all incoming edges.
7049   for (auto *Predecessor : predecessors(BB)) {
7050     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7051     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7052       return BlockMaskCache[BB] = EdgeMask;
7053 
7054     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7055       BlockMask = EdgeMask;
7056       continue;
7057     }
7058 
7059     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7060   }
7061 
7062   return BlockMaskCache[BB] = BlockMask;
7063 }
7064 
7065 VPWidenMemoryInstructionRecipe *
7066 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7067                                   VPlanPtr &Plan) {
7068   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7069          "Must be called with either a load or store");
7070 
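  // Decide, for each VF in the range, whether this memory instruction will be
  // widened (possibly as part of an interleave group) rather than scalarized.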
7071   auto willWiden = [&](unsigned VF) -> bool {
7072     if (VF == 1)
7073       return false;
7074     LoopVectorizationCostModel::InstWidening Decision =
7075         CM.getWideningDecision(I, VF);
7076     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7077            "CM decision should be taken at this point.");
7078     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7079       return true;
7080     if (CM.isScalarAfterVectorization(I, VF) ||
7081         CM.isProfitableToScalarize(I, VF))
7082       return false;
7083     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7084   };
7085 
7086   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7087     return nullptr;
7088 
7089   VPValue *Mask = nullptr;
7090   if (Legal->isMaskRequired(I))
7091     Mask = createBlockInMask(I->getParent(), Plan);
7092 
7093   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7094   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7095     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7096 
7097   StoreInst *Store = cast<StoreInst>(I);
7098   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7099   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7100 }
7101 
7102 VPWidenIntOrFpInductionRecipe *
7103 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7104   // Check if this is an integer or fp induction. If so, build the recipe that
7105   // produces its scalar and vector values.
7106   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7107   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7108       II.getKind() == InductionDescriptor::IK_FpInduction)
7109     return new VPWidenIntOrFpInductionRecipe(Phi);
7110 
7111   return nullptr;
7112 }
7113 
7114 VPWidenIntOrFpInductionRecipe *
7115 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7116                                                 VFRange &Range) const {
7117   // Optimize the special case where the source is a constant integer
7118   // induction variable. Notice that we can only optimize the 'trunc' case
7119   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7120   // (c) other casts depend on pointer size.
7121 
7122   // Determine whether \p K is a truncation based on an induction variable that
7123   // can be optimized.
7124   auto isOptimizableIVTruncate =
7125       [&](Instruction *K) -> std::function<bool(unsigned)> {
7126     return
7127         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
7128   };
7129 
7130   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7131           isOptimizableIVTruncate(I), Range))
7132     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7133                                              I);
7134   return nullptr;
7135 }
7136 
7137 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7138   // We know that all PHIs in non-header blocks are converted into selects, so
7139   // we don't have to worry about the insertion order and we can just use the
7140   // builder. At this point we generate the predication tree. There may be
7141   // duplications since this is a simple recursive scan, but future
7142   // optimizations will clean it up.
7143 
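  // Collect each incoming value together with the mask of its incoming edge;
  // the mask is omitted when the edge mask is all-one (nullptr).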
7144   SmallVector<VPValue *, 2> Operands;
7145   unsigned NumIncoming = Phi->getNumIncomingValues();
7146   for (unsigned In = 0; In < NumIncoming; In++) {
7147     VPValue *EdgeMask =
7148       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7149     assert((EdgeMask || NumIncoming == 1) &&
7150            "Multiple predecessors with one having a full mask");
7151     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7152     if (EdgeMask)
7153       Operands.push_back(EdgeMask);
7154   }
7155   return new VPBlendRecipe(Phi, Operands);
7156 }
7157 
7158 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7159                                                    VPlan &Plan) const {
7160 
7161   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7162       [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); },
7163       Range);
7164 
7165   if (IsPredicated)
7166     return nullptr;
7167 
7168   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7169   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7170              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7171     return nullptr;
7172 
7173   auto willWiden = [&](unsigned VF) -> bool {
7174     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7175     // The following case may be scalarized depending on the VF.
7176     // The flag shows whether we use an intrinsic or a usual call for the
7177     // vectorized version of the instruction.
7178     // Is it beneficial to perform the intrinsic call compared to the lib call?
7179     bool NeedToScalarize = false;
7180     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7181     bool UseVectorIntrinsic =
7182         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7183     return UseVectorIntrinsic || !NeedToScalarize;
7184   };
7185 
7186   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7187     return nullptr;
7188 
7189   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7190 }
7191 
7192 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7193   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7194          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7195   // The instruction should be widened, unless it is scalar after
7196   // vectorization, scalarization is profitable, or it is predicated.
7197   auto WillScalarize = [this, I](unsigned VF) -> bool {
7198     return CM.isScalarAfterVectorization(I, VF) ||
7199            CM.isProfitableToScalarize(I, VF) ||
7200            CM.isScalarWithPredication(I, VF);
7201   };
7202   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7203                                                              Range);
7204 }
7205 
7206 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7207   auto IsVectorizableOpcode = [](unsigned Opcode) {
7208     switch (Opcode) {
7209     case Instruction::Add:
7210     case Instruction::And:
7211     case Instruction::AShr:
7212     case Instruction::BitCast:
7213     case Instruction::FAdd:
7214     case Instruction::FCmp:
7215     case Instruction::FDiv:
7216     case Instruction::FMul:
7217     case Instruction::FNeg:
7218     case Instruction::FPExt:
7219     case Instruction::FPToSI:
7220     case Instruction::FPToUI:
7221     case Instruction::FPTrunc:
7222     case Instruction::FRem:
7223     case Instruction::FSub:
7224     case Instruction::ICmp:
7225     case Instruction::IntToPtr:
7226     case Instruction::LShr:
7227     case Instruction::Mul:
7228     case Instruction::Or:
7229     case Instruction::PtrToInt:
7230     case Instruction::SDiv:
7231     case Instruction::Select:
7232     case Instruction::SExt:
7233     case Instruction::Shl:
7234     case Instruction::SIToFP:
7235     case Instruction::SRem:
7236     case Instruction::Sub:
7237     case Instruction::Trunc:
7238     case Instruction::UDiv:
7239     case Instruction::UIToFP:
7240     case Instruction::URem:
7241     case Instruction::Xor:
7242     case Instruction::ZExt:
7243       return true;
7244     }
7245     return false;
7246   };
7247 
7248   if (!IsVectorizableOpcode(I->getOpcode()))
7249     return nullptr;
7250 
7251   // Success: widen this instruction.
7252   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7253 }
7254 
7255 VPBasicBlock *VPRecipeBuilder::handleReplication(
7256     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7257     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7258     VPlanPtr &Plan) {
7259   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7260       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
7261       Range);
7262 
7263   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7264       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
7265 
7266   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7267                                        IsUniform, IsPredicated);
7268   setRecipe(I, Recipe);
7269 
7270   // Find if I uses a predicated instruction. If so, it will use its scalar
7271   // value. Avoid hoisting the insert-element which packs the scalar value into
7272   // a vector value, as that happens iff all users use the vector value.
7273   for (auto &Op : I->operands())
7274     if (auto *PredInst = dyn_cast<Instruction>(Op))
7275       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7276         PredInst2Recipe[PredInst]->setAlsoPack(false);
7277 
7278   // Finalize the recipe for Instr, first if it is not predicated.
7279   if (!IsPredicated) {
7280     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7281     VPBB->appendRecipe(Recipe);
7282     return VPBB;
7283   }
7284   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7285   assert(VPBB->getSuccessors().empty() &&
7286          "VPBB has successors when handling predicated replication.");
7287   // Record predicated instructions for above packing optimizations.
7288   PredInst2Recipe[I] = Recipe;
7289   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7290   VPBlockUtils::insertBlockAfter(Region, VPBB);
7291   auto *RegSucc = new VPBasicBlock();
7292   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7293   return RegSucc;
7294 }
7295 
7296 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7297                                                       VPRecipeBase *PredRecipe,
7298                                                       VPlanPtr &Plan) {
7299   // Instructions marked for predication are replicated and placed under an
7300   // if-then construct to prevent side-effects.
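  //
  // For example (illustrative), a predicated load %l ends up, roughly, in a
  // triangle of the form:
  //   pred.load.entry:    BRANCH-ON-MASK of the block-in mask
  //   pred.load.if:       the replicated, predicated %l
  //   pred.load.continue: PHI recipe blending in the value produced by %l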
7301 
7302   // Generate recipes to compute the block mask for this region.
7303   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7304 
7305   // Build the triangular if-then region.
7306   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7307   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7308   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7309   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7310   auto *PHIRecipe =
7311       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7312   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7313   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7314   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7315 
7316   // Note: first set Entry as region entry and then connect successors starting
7317   // from it in order, to propagate the "parent" of each VPBasicBlock.
7318   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7319   VPBlockUtils::connectBlocks(Pred, Exit);
7320 
7321   return Region;
7322 }
7323 
7324 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7325                                                       VFRange &Range,
7326                                                       VPlanPtr &Plan) {
7327   // First, check for specific widening recipes that deal with calls, memory
7328   // operations, inductions and Phi nodes.
7329   if (auto *CI = dyn_cast<CallInst>(Instr))
7330     return tryToWidenCall(CI, Range, *Plan);
7331 
7332   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7333     return tryToWidenMemory(Instr, Range, Plan);
7334 
7335   VPRecipeBase *Recipe;
7336   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7337     if (Phi->getParent() != OrigLoop->getHeader())
7338       return tryToBlend(Phi, Plan);
7339     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7340       return Recipe;
7341     return new VPWidenPHIRecipe(Phi);
7342   }
7343 
7344   if (isa<TruncInst>(Instr) &&
7345       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7346     return Recipe;
7347 
7348   if (!shouldWiden(Instr, Range))
7349     return nullptr;
7350 
7351   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7352     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7353                                 OrigLoop);
7354 
7355   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7356     bool InvariantCond =
7357         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7358     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7359                                    InvariantCond);
7360   }
7361 
7362   return tryToWiden(Instr, *Plan);
7363 }
7364 
7365 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7366                                                         unsigned MaxVF) {
7367   assert(OrigLoop->empty() && "Inner loop expected.");
7368 
7369   // Collect conditions feeding internal conditional branches; they need to be
7370   // represented in VPlan for it to model masking.
7371   SmallPtrSet<Value *, 1> NeedDef;
7372 
7373   auto *Latch = OrigLoop->getLoopLatch();
7374   for (BasicBlock *BB : OrigLoop->blocks()) {
7375     if (BB == Latch)
7376       continue;
7377     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7378     if (Branch && Branch->isConditional())
7379       NeedDef.insert(Branch->getCondition());
7380   }
7381 
  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for it to model early-exit
  // masking.
7384   // Also, both the Phi and the live-out instruction of each reduction are
7385   // required in order to introduce a select between them in VPlan.
7386   if (CM.foldTailByMasking()) {
7387     if (Legal->getPrimaryInduction())
7388       NeedDef.insert(Legal->getPrimaryInduction());
7389     for (auto &Reduction : Legal->getReductionVars()) {
7390       NeedDef.insert(Reduction.first);
7391       NeedDef.insert(Reduction.second.getLoopExitInstr());
7392     }
7393   }
7394 
7395   // Collect instructions from the original loop that will become trivially dead
7396   // in the vectorized loop. We don't need to vectorize these instructions. For
7397   // example, original induction update instructions can become dead because we
7398   // separately emit induction "steps" when generating code for the new loop.
7399   // Similarly, we create a new latch condition when setting up the structure
7400   // of the new loop, so the old one can become dead.
7401   SmallPtrSet<Instruction *, 4> DeadInstructions;
7402   collectTriviallyDeadInstructions(DeadInstructions);
7403 
7404   // Add assume instructions we need to drop to DeadInstructions, to prevent
7405   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7407   // control flow is preserved, we should keep them.
7408   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7409   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7410 
7411   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7412   // Dead instructions do not need sinking. Remove them from SinkAfter.
7413   for (Instruction *I : DeadInstructions)
7414     SinkAfter.erase(I);
7415 
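  // Build one VPlan per maximal sub-range of VFs sharing the same widening
  // decisions: each call below may clamp SubRange.End, so a range of, e.g.,
  // [2, 16] might (illustratively) be split into plans for VF={2,4} and
  // VF={8,16}.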
7416   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7417     VFRange SubRange = {VF, MaxVF + 1};
7418     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7419                                              DeadInstructions, SinkAfter));
7420     VF = SubRange.End;
7421   }
7422 }
7423 
7424 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7425     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7426     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7427     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7428 
7429   // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7432   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7433 
7434   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7435 
7436   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7437 
7438   // ---------------------------------------------------------------------------
7439   // Pre-construction: record ingredients whose recipes we'll need to further
7440   // process after constructing the initial VPlan.
7441   // ---------------------------------------------------------------------------
7442 
7443   // Mark instructions we'll need to sink later and their targets as
7444   // ingredients whose recipe we'll need to record.
7445   for (auto &Entry : SinkAfter) {
7446     RecipeBuilder.recordRecipeOf(Entry.first);
7447     RecipeBuilder.recordRecipeOf(Entry.second);
7448   }
7449   for (auto &Reduction : CM.getInLoopReductionChains()) {
7450     PHINode *Phi = Reduction.first;
7451     RecurrenceDescriptor::RecurrenceKind Kind =
7452         Legal->getReductionVars()[Phi].getRecurrenceKind();
7453     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7454 
7455     RecipeBuilder.recordRecipeOf(Phi);
7456     for (auto &R : ReductionOperations) {
7457       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
7460       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7461           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7462         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7463       }
7464     }
7465   }
7466 
7467   // For each interleave group which is relevant for this (possibly trimmed)
7468   // Range, add it to the set of groups to be later applied to the VPlan and add
7469   // placeholders for its members' Recipes which we'll be replacing with a
7470   // single VPInterleaveRecipe.
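  // E.g. (illustratively), for an interleave group of factor 2 over loads of
  // A[2*i] and A[2*i+1], both loads' widening recipes are later replaced by a
  // single VPInterleaveRecipe at the group's insertion position.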
7471   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7472     auto applyIG = [IG, this](unsigned VF) -> bool {
7473       return (VF >= 2 && // Query is illegal for VF == 1
7474               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7475                   LoopVectorizationCostModel::CM_Interleave);
7476     };
7477     if (!getDecisionAndClampRange(applyIG, Range))
7478       continue;
7479     InterleaveGroups.insert(IG);
7480     for (unsigned i = 0; i < IG->getFactor(); i++)
7481       if (Instruction *Member = IG->getMember(i))
7482         RecipeBuilder.recordRecipeOf(Member);
  }
7484 
7485   // ---------------------------------------------------------------------------
7486   // Build initial VPlan: Scan the body of the loop in a topological order to
7487   // visit each basic block after having visited its predecessor basic blocks.
7488   // ---------------------------------------------------------------------------
7489 
7490   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7491   auto Plan = std::make_unique<VPlan>();
7492   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7493   Plan->setEntry(VPBB);
7494 
7495   // Represent values that will have defs inside VPlan.
7496   for (Value *V : NeedDef)
7497     Plan->addVPValue(V);
7498 
7499   // Scan the body of the loop in a topological order to visit each basic block
7500   // after having visited its predecessor basic blocks.
7501   LoopBlocksDFS DFS(OrigLoop);
7502   DFS.perform(LI);
7503 
7504   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients, which will fill a new VPBasicBlock.
7507     unsigned VPBBsForBB = 0;
7508     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7509     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7510     VPBB = FirstVPBBForBB;
7511     Builder.setInsertPoint(VPBB);
7512 
7513     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7515     for (Instruction &I : BB->instructionsWithoutDebug()) {
7516       Instruction *Instr = &I;
7517 
7518       // First filter out irrelevant instructions, to ensure no recipes are
7519       // built for them.
7520       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7521         continue;
7522 
7523       if (auto Recipe =
7524               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7525         RecipeBuilder.setRecipe(Instr, Recipe);
7526         VPBB->appendRecipe(Recipe);
7527         continue;
7528       }
7529 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7532       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7533           Instr, Range, VPBB, PredInst2Recipe, Plan);
7534       if (NextVPBB != VPBB) {
7535         VPBB = NextVPBB;
7536         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7537                                     : "");
7538       }
7539     }
7540   }
7541 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7545   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7546   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7547   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7548   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7549   delete PreEntry;
7550 
7551   // ---------------------------------------------------------------------------
7552   // Transform initial VPlan: Apply previously taken decisions, in order, to
7553   // bring the VPlan to its final state.
7554   // ---------------------------------------------------------------------------
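  //
  // The transformations below cover: applying sink-after constraints,
  // replacing interleave-group members' recipes with a VPInterleaveRecipe,
  // adjusting recipes for in-loop reductions, and introducing selects for
  // reductions when folding the tail by masking.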
7555 
7556   // Apply Sink-After legal constraints.
7557   for (auto &Entry : SinkAfter) {
7558     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7559     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7560     Sink->moveAfter(Target);
7561   }
7562 
7563   // Interleave memory: for each Interleave Group we marked earlier as relevant
7564   // for this VPlan, replace the Recipes widening its memory instructions with a
7565   // single VPInterleaveRecipe at its insertion point.
7566   for (auto IG : InterleaveGroups) {
7567     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7568         RecipeBuilder.getRecipe(IG->getInsertPos()));
7569     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7570         ->insertBefore(Recipe);
7571 
7572     for (unsigned i = 0; i < IG->getFactor(); ++i)
7573       if (Instruction *Member = IG->getMember(i)) {
7574         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7575       }
7576   }
7577 
7578   // Adjust the recipes for any inloop reductions.
7579   if (Range.Start > 1)
7580     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
7581 
7582   // Finally, if tail is folded by masking, introduce selects between the phi
7583   // and the live-out instruction of each reduction, at the end of the latch.
7584   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7585     Builder.setInsertPoint(VPBB);
7586     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7587     for (auto &Reduction : Legal->getReductionVars()) {
7588       assert(!CM.isInLoopReduction(Reduction.first) &&
7589              "Didn't expect inloop tail folded reduction yet!");
7590       VPValue *Phi = Plan->getVPValue(Reduction.first);
7591       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7592       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7593     }
7594   }
7595 
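  // Name the plan after the VFs it covers; e.g. (illustratively)
  // "Initial VPlan for VF={4,8},UF>=1".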
7596   std::string PlanName;
7597   raw_string_ostream RSO(PlanName);
7598   unsigned VF = Range.Start;
7599   Plan->addVF(VF);
7600   RSO << "Initial VPlan for VF={" << VF;
7601   for (VF *= 2; VF < Range.End; VF *= 2) {
7602     Plan->addVF(VF);
7603     RSO << "," << VF;
7604   }
7605   RSO << "},UF>=1";
7606   RSO.flush();
7607   Plan->setName(PlanName);
7608 
7609   return Plan;
7610 }
7611 
7612 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7613   // Outer loop handling: They may require CFG and instruction level
7614   // transformations before even evaluating whether vectorization is profitable.
7615   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7616   // the vectorization pipeline.
  assert(!OrigLoop->empty() && "Outer loop expected.");
7618   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7619 
7620   // Create new empty VPlan
7621   auto Plan = std::make_unique<VPlan>();
7622 
7623   // Build hierarchical CFG
7624   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7625   HCFGBuilder.buildHierarchicalCFG();
7626 
7627   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7628     Plan->addVF(VF);
7629 
7630   if (EnableVPlanPredication) {
7631     VPlanPredicator VPP(*Plan);
7632     VPP.predicate();
7633 
7634     // Avoid running transformation to recipes until masked code generation in
7635     // VPlan-native path is in place.
7636     return Plan;
7637   }
7638 
7639   SmallPtrSet<Instruction *, 1> DeadInstructions;
7640   VPlanTransforms::VPInstructionsToVPRecipes(
7641       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7642   return Plan;
7643 }
7644 
7645 // Adjust the recipes for any inloop reductions. The chain of instructions
7646 // leading from the loop exit instr to the phi need to be converted to
7647 // reductions, with one operand being vector and the other being the scalar
7648 // reduction chain.
7649 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7650     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7651   for (auto &Reduction : CM.getInLoopReductionChains()) {
7652     PHINode *Phi = Reduction.first;
7653     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7654     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7655 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
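    // For example (illustrative), given the chain
    //   %phi -> %add1 = add %phi, %a -> %add2 = add %add1, %b
    // the Chain is %phi when visiting %add1 and %add1 when visiting %add2, so
    // %a and %b become the respective vector operands.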
7660     Instruction *Chain = Phi;
7661     for (Instruction *R : ReductionOperations) {
7662       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
7663       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
7664 
7665       VPValue *ChainOp = Plan->getVPValue(Chain);
7666       unsigned FirstOpId;
7667       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7668           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7669         assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC &&
7670                "Expected to replace a VPWidenSelectSC");
7671         FirstOpId = 1;
7672       } else {
7673         assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7674                "Expected to replace a VPWidenSC");
7675         FirstOpId = 0;
7676       }
7677       unsigned VecOpId =
7678           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
7679       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
7680 
7681       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
7682           &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI);
7683       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
7684       WidenRecipe->eraseFromParent();
7685 
7686       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7687           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7688         VPRecipeBase *CompareRecipe =
7689             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
7690         assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7691                "Expected to replace a VPWidenSC");
7692         CompareRecipe->eraseFromParent();
7693       }
7694       Chain = R;
7695     }
7696   }
7697 }
7698 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7703 
7704 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7705     Value *V, const VPIteration &Instance) {
7706   return ILV.getOrCreateScalarValue(V, Instance);
7707 }
7708 
7709 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7710                                VPSlotTracker &SlotTracker) const {
7711   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7712   IG->getInsertPos()->printAsOperand(O, false);
7713   O << ", ";
7714   getAddr()->printAsOperand(O, SlotTracker);
7715   VPValue *Mask = getMask();
7716   if (Mask) {
7717     O << ", ";
7718     Mask->printAsOperand(O, SlotTracker);
7719   }
7720   for (unsigned i = 0; i < IG->getFactor(); ++i)
7721     if (Instruction *I = IG->getMember(i))
7722       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7723 }
7724 
7725 void VPWidenCallRecipe::execute(VPTransformState &State) {
7726   State.ILV->widenCallInstruction(Ingredient, User, State);
7727 }
7728 
7729 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7730   State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
7731 }
7732 
7733 void VPWidenRecipe::execute(VPTransformState &State) {
7734   State.ILV->widenInstruction(Ingredient, User, State);
7735 }
7736 
7737 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7738   State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
7739                       IsIndexLoopInvariant, State);
7740 }
7741 
7742 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7743   assert(!State.Instance && "Int or FP induction being replicated.");
7744   State.ILV->widenIntOrFpInduction(IV, Trunc);
7745 }
7746 
7747 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7748   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7749 }
7750 
7751 void VPBlendRecipe::execute(VPTransformState &State) {
7752   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7753   // We know that all PHIs in non-header blocks are converted into
7754   // selects, so we don't have to worry about the insertion order and we
7755   // can just use the builder.
7756   // At this point we generate the predication tree. There may be
7757   // duplications since this is a simple recursive scan, but future
7758   // optimizations will clean it up.
7759 
7760   unsigned NumIncoming = getNumIncomingValues();
7761 
7762   // Generate a sequence of selects of the form:
7763   // SELECT(Mask3, In3,
7764   //        SELECT(Mask2, In2,
7765   //               SELECT(Mask1, In1,
7766   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, take their value from In0.
7769   InnerLoopVectorizer::VectorParts Entry(State.UF);
7770   for (unsigned In = 0; In < NumIncoming; ++In) {
7771     for (unsigned Part = 0; Part < State.UF; ++Part) {
7772       // We might have single edge PHIs (blocks) - use an identity
7773       // 'select' for the first PHI operand.
7774       Value *In0 = State.get(getIncomingValue(In), Part);
7775       if (In == 0)
7776         Entry[Part] = In0; // Initialize with the first incoming value.
7777       else {
7778         // Select between the current value and the previous incoming edge
7779         // based on the incoming mask.
7780         Value *Cond = State.get(getMask(In), Part);
7781         Entry[Part] =
7782             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7783       }
7784     }
7785   }
7786   for (unsigned Part = 0; Part < State.UF; ++Part)
7787     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7788 }
7789 
7790 void VPInterleaveRecipe::execute(VPTransformState &State) {
7791   assert(!State.Instance && "Interleave group being replicated.");
7792   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7793 }
7794 
7795 void VPReductionRecipe::execute(VPTransformState &State) {
7796   assert(!State.Instance && "Reduction being replicated.");
7797   for (unsigned Part = 0; Part < State.UF; ++Part) {
7798     unsigned Kind = RdxDesc->getRecurrenceKind();
7799     Value *NewVecOp = State.get(VecOp, Part);
7800     Value *NewRed =
7801         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
7802     Value *PrevInChain = State.get(ChainOp, Part);
7803     Value *NextInChain;
7804     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7805         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7806       NextInChain =
7807           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
7808                          NewRed, PrevInChain);
7809     } else {
7810       NextInChain = State.Builder.CreateBinOp(
7811           (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain);
7812     }
7813     State.ValueMap.setVectorValue(I, Part, NextInChain);
7814   }
7815 }
7816 
7817 void VPReplicateRecipe::execute(VPTransformState &State) {
7818   if (State.Instance) { // Generate a single instance.
7819     State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
7820                                     IsPredicated, State);
7821     // Insert scalar instance packing it into a vector.
7822     if (AlsoPack && State.VF > 1) {
7823       // If we're constructing lane 0, initialize to start from undef.
7824       if (State.Instance->Lane == 0) {
7825         Value *Undef = UndefValue::get(
7826             FixedVectorType::get(Ingredient->getType(), State.VF));
7827         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7828       }
7829       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7830     }
7831     return;
7832   }
7833 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
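  // E.g. (illustratively), with UF=2 and VF=4 a non-uniform instruction gets
  // 8 scalar copies (one per part and lane), whereas a uniform one gets only
  // 2 (lane 0 of each part).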
7837   unsigned EndLane = IsUniform ? 1 : State.VF;
7838   for (unsigned Part = 0; Part < State.UF; ++Part)
7839     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7840       State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
7841                                       IsPredicated, State);
7842 }
7843 
7844 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7845   assert(State.Instance && "Branch on Mask works only on single instance.");
7846 
7847   unsigned Part = State.Instance->Part;
7848   unsigned Lane = State.Instance->Lane;
7849 
7850   Value *ConditionBit = nullptr;
7851   VPValue *BlockInMask = getMask();
7852   if (BlockInMask) {
7853     ConditionBit = State.get(BlockInMask, Part);
7854     if (ConditionBit->getType()->isVectorTy())
7855       ConditionBit = State.Builder.CreateExtractElement(
7856           ConditionBit, State.Builder.getInt32(Lane));
7857   } else // Block in mask is all-one.
7858     ConditionBit = State.Builder.getTrue();
7859 
7860   // Replace the temporary unreachable terminator with a new conditional branch,
7861   // whose two destinations will be set later when they are created.
7862   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7863   assert(isa<UnreachableInst>(CurrentTerminator) &&
7864          "Expected to replace unreachable terminator with conditional branch.");
7865   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7866   CondBr->setSuccessor(0, nullptr);
7867   ReplaceInstWithInst(CurrentTerminator, CondBr);
7868 }
7869 
7870 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7871   assert(State.Instance && "Predicated instruction PHI works per instance.");
7872   Instruction *ScalarPredInst = cast<Instruction>(
7873       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7874   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7875   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7876   assert(PredicatingBB && "Predicated block has no single predecessor.");
7877 
7878   // By current pack/unpack logic we need to generate only a single phi node: if
7879   // a vector value for the predicated instruction exists at this point it means
7880   // the instruction has vector users only, and a phi for the vector value is
7881   // needed. In this case the recipe of the predicated instruction is marked to
7882   // also do that packing, thereby "hoisting" the insert-element sequence.
7883   // Otherwise, a phi node for the scalar value is needed.
7884   unsigned Part = State.Instance->Part;
7885   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7886     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7887     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7888     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7889     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7890     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7891     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7892   } else {
7893     Type *PredInstType = PredInst->getType();
7894     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7895     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7896     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7897     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7898   }
7899 }
7900 
7901 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7902   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
7903   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
7904                                         getMask());
7905 }
7906 
// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code-size, 2) predication-related compiler options, 3) loop
// hints forcing predication, and 4) a TTI hook that analyzes whether the loop
// is suitable for predication.
7911 static ScalarEpilogueLowering getScalarEpilogueLowering(
7912     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7913     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7914     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7915     LoopVectorizationLegality &LVL) {
7916   bool OptSize =
7917       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7918                                                      PGSOQueryType::IRPass);
7919   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7920   // don't look at hints or options, and don't request a scalar epilogue.
7921   if (OptSize)
7922     return CM_ScalarEpilogueNotAllowedOptSize;
7923 
7924   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7925                               !PreferPredicateOverEpilog;
7926 
7927   // 2) Next, if disabling predication is requested on the command line, honour
7928   // this and request a scalar epilogue.
7929   if (PredicateOptDisabled)
7930     return CM_ScalarEpilogueAllowed;
7931 
  // 3) and 4) Finally, if enabling predication is requested on the command
  // line or with a loop hint, or if the TTI hook indicates this is profitable,
  // request predication.
7935   if (PreferPredicateOverEpilog ||
7936       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7937       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7938                                         LVL.getLAI()) &&
7939        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7940     return CM_ScalarEpilogueNotNeededUsePredicate;
7941 
7942   return CM_ScalarEpilogueAllowed;
7943 }
7944 
7945 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
7947 // VPlan-to-VPlan transformations from the very beginning without modifying the
7948 // input LLVM IR.
7949 static bool processLoopInVPlanNativePath(
7950     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7951     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7952     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7953     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7954     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7955 
7956   if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
7957     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
7958     return false;
7959   }
7960   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7961   Function *F = L->getHeader()->getParent();
7962   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7963 
7964   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7965       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7966 
7967   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7968                                 &Hints, IAI);
7969   // Use the planner for outer loop vectorization.
7970   // TODO: CM is not used at this point inside the planner. Turn CM into an
7971   // optional argument if we don't need it in the future.
7972   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
7973 
7974   // Get user vectorization factor.
7975   const unsigned UserVF = Hints.getWidth();
7976 
7977   // Plan how to best vectorize, return the best VF and its cost.
7978   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7979 
7980   // If we are stress testing VPlan builds, do not attempt to generate vector
7981   // code. Masked vector code generation support will follow soon.
7982   // Also, do not attempt to vectorize if no vector code will be produced.
7983   if (VPlanBuildStressTest || EnableVPlanPredication ||
7984       VectorizationFactor::Disabled() == VF)
7985     return false;
7986 
7987   LVP.setBestPlan(VF.Width, 1);
7988 
7989   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7990                          &CM, BFI, PSI);
7991   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7992                     << L->getHeader()->getParent()->getName() << "\"\n");
7993   LVP.executePlan(LB, DT);
7994 
7995   // Mark the loop as already vectorized to avoid vectorizing again.
7996   Hints.setAlreadyVectorized();
7997 
7998   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
7999   return true;
8000 }
8001 
8002 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8003     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8004                                !EnableLoopInterleaving),
8005       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8006                               !EnableLoopVectorization) {}
8007 
8008 bool LoopVectorizePass::processLoop(Loop *L) {
8009   assert((EnableVPlanNativePath || L->empty()) &&
8010          "VPlan-native path is not enabled. Only process inner loops.");
8011 
8012 #ifndef NDEBUG
8013   const std::string DebugLocStr = getDebugLocString(L);
8014 #endif /* NDEBUG */
8015 
8016   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8017                     << L->getHeader()->getParent()->getName() << "\" from "
8018                     << DebugLocStr << "\n");
8019 
8020   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8021 
8022   LLVM_DEBUG(
8023       dbgs() << "LV: Loop hints:"
8024              << " force="
8025              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8026                      ? "disabled"
8027                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8028                             ? "enabled"
8029                             : "?"))
8030              << " width=" << Hints.getWidth()
8031              << " unroll=" << Hints.getInterleave() << "\n");
8032 
8033   // Function containing loop
8034   Function *F = L->getHeader()->getParent();
8035 
8036   // Looking at the diagnostic output is the only way to determine if a loop
8037   // was vectorized (other than looking at the IR or machine code), so it
8038   // is important to generate an optimization remark for each loop. Most of
8039   // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
8043 
8044   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8045     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8046     return false;
8047   }
8048 
8049   PredicatedScalarEvolution PSE(*SE, *L);
8050 
8051   // Check if it is legal to vectorize the loop.
8052   LoopVectorizationRequirements Requirements(*ORE);
8053   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8054                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8055   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8056     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8057     Hints.emitRemarkWithHints();
8058     return false;
8059   }
8060 
8061   // Check the function attributes and profiles to find out if this function
8062   // should be optimized for size.
8063   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8064       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8065 
8066   // Entrance to the VPlan-native vectorization path. Outer loops are processed
8067   // here. They may require CFG and instruction level transformations before
8068   // even evaluating whether vectorization is profitable. Since we cannot modify
8069   // the incoming IR, we need to build VPlan upfront in the vectorization
8070   // pipeline.
8071   if (!L->empty())
8072     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
8073                                         ORE, BFI, PSI, Hints);
8074 
8075   assert(L->empty() && "Inner loop expected.");
8076 
8077   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8078   // count by optimizing for size, to minimize overheads.
8079   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
8080   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
8081     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8082                       << "This loop is worth vectorizing only if no scalar "
8083                       << "iteration overheads are incurred.");
8084     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8085       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8086     else {
8087       LLVM_DEBUG(dbgs() << "\n");
8088       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
8089     }
8090   }
8091 
8092   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem like it can be correct -- what if the loop
  // is an integer loop and the vector instructions selected are purely integer
  // vector instructions?
8096   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
8097     reportVectorizationFailure(
8098         "Can't vectorize when the NoImplicitFloat attribute is used",
8099         "loop not vectorized due to NoImplicitFloat attribute",
8100         "NoImplicitFloat", ORE, L);
8101     Hints.emitRemarkWithHints();
8102     return false;
8103   }
8104 
8105   // Check if the target supports potentially unsafe FP vectorization.
8106   // FIXME: Add a check for the type of safety issue (denormal, signaling)
8107   // for the target we're vectorizing for, to make sure none of the
8108   // additional fp-math flags can help.
8109   if (Hints.isPotentiallyUnsafe() &&
8110       TTI->isFPVectorizationPotentiallyUnsafe()) {
8111     reportVectorizationFailure(
8112         "Potentially unsafe FP op prevents vectorization",
8113         "loop not vectorized due to unsafe FP support.",
8114         "UnsafeFP", ORE, L);
8115     Hints.emitRemarkWithHints();
8116     return false;
8117   }
8118 
8119   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
8120   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
8121 
8122   // If an override option has been passed in for interleaved accesses, use it.
8123   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
8124     UseInterleaved = EnableInterleavedMemAccesses;
8125 
8126   // Analyze interleaved memory accesses.
8127   if (UseInterleaved) {
8128     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
8129   }
8130 
8131   // Use the cost model.
8132   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
8133                                 F, &Hints, IAI);
8134   CM.collectValuesToIgnore();
8135 
8136   // Use the planner for vectorization.
8137   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
8138 
8139   // Get user vectorization factor and interleave count.
8140   unsigned UserVF = Hints.getWidth();
8141   unsigned UserIC = Hints.getInterleave();
8142 
8143   // Plan how to best vectorize, return the best VF and its cost.
8144   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
8145 
8146   VectorizationFactor VF = VectorizationFactor::Disabled();
8147   unsigned IC = 1;
8148 
8149   if (MaybeVF) {
8150     VF = *MaybeVF;
8151     // Select the interleave count.
8152     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
8153   }
8154 
8155   // Identify the diagnostic messages that should be produced.
8156   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8157   bool VectorizeLoop = true, InterleaveLoop = true;
8158   if (Requirements.doesNotMeet(F, L, Hints)) {
8159     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
8160                          "requirements.\n");
8161     Hints.emitRemarkWithHints();
8162     return false;
8163   }
8164 
8165   if (VF.Width == 1) {
8166     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8167     VecDiagMsg = std::make_pair(
8168         "VectorizationNotBeneficial",
8169         "the cost-model indicates that vectorization is not beneficial");
8170     VectorizeLoop = false;
8171   }
8172 
8173   if (!MaybeVF && UserIC > 1) {
8174     // Tell the user interleaving was avoided up-front, despite being explicitly
8175     // requested.
8176     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
8177                          "interleaving should be avoided up front\n");
8178     IntDiagMsg = std::make_pair(
8179         "InterleavingAvoided",
8180         "Ignoring UserIC, because interleaving was avoided up front");
8181     InterleaveLoop = false;
8182   } else if (IC == 1 && UserIC <= 1) {
8183     // Tell the user interleaving is not beneficial.
8184     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8185     IntDiagMsg = std::make_pair(
8186         "InterleavingNotBeneficial",
8187         "the cost-model indicates that interleaving is not beneficial");
8188     InterleaveLoop = false;
8189     if (UserIC == 1) {
8190       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8191       IntDiagMsg.second +=
8192           " and is explicitly disabled or interleave count is set to 1";
8193     }
8194   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
8196     LLVM_DEBUG(
8197         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
8198     IntDiagMsg = std::make_pair(
8199         "InterleavingBeneficialButDisabled",
8200         "the cost-model indicates that interleaving is beneficial "
8201         "but is explicitly disabled or interleave count is set to 1");
8202     InterleaveLoop = false;
8203   }
8204 
8205   // Override IC if user provided an interleave count.
8206   IC = UserIC > 0 ? UserIC : IC;
8207 
8208   // Emit diagnostic messages, if any.
8209   const char *VAPassName = Hints.vectorizeAnalysisPassName();
8210   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
8212     ORE->emit([&]() {
8213       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
8214                                       L->getStartLoc(), L->getHeader())
8215              << VecDiagMsg.second;
8216     });
8217     ORE->emit([&]() {
8218       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8219                                       L->getStartLoc(), L->getHeader())
8220              << IntDiagMsg.second;
8221     });
8222     return false;
8223   } else if (!VectorizeLoop && InterleaveLoop) {
8224     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8225     ORE->emit([&]() {
8226       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
8227                                         L->getStartLoc(), L->getHeader())
8228              << VecDiagMsg.second;
8229     });
8230   } else if (VectorizeLoop && !InterleaveLoop) {
8231     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8232                       << ") in " << DebugLocStr << '\n');
8233     ORE->emit([&]() {
8234       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8235                                         L->getStartLoc(), L->getHeader())
8236              << IntDiagMsg.second;
8237     });
8238   } else if (VectorizeLoop && InterleaveLoop) {
8239     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8240                       << ") in " << DebugLocStr << '\n');
8241     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8242   }
8243 
8244   LVP.setBestPlan(VF.Width, IC);
8245 
8246   using namespace ore;
8247   bool DisableRuntimeUnroll = false;
8248   MDNode *OrigLoopID = L->getLoopID();
8249 
8250   if (!VectorizeLoop) {
8251     assert(IC > 1 && "interleave count should not be 1 or 0");
8252     // If we decided that it is not legal to vectorize the loop, then
8253     // interleave it.
8254     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
8255                                BFI, PSI);
8256     LVP.executePlan(Unroller, DT);
8257 
8258     ORE->emit([&]() {
8259       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8260                                 L->getHeader())
8261              << "interleaved loop (interleaved count: "
8262              << NV("InterleaveCount", IC) << ")";
8263     });
8264   } else {
8265     // If we decided that it is *legal* to vectorize the loop, then do it.
8266     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
8267                            &LVL, &CM, BFI, PSI);
8268     LVP.executePlan(LB, DT);
8269     ++LoopsVectorized;
8270 
8271     // Add metadata to disable runtime unrolling a scalar loop when there are
8272     // no runtime checks about strides and memory. A scalar loop that is
8273     // rarely used is not worth unrolling.
8274     if (!LB.areSafetyChecksAdded())
8275       DisableRuntimeUnroll = true;
8276 
8277     // Report the vectorization decision.
8278     ORE->emit([&]() {
8279       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
8280                                 L->getHeader())
8281              << "vectorized loop (vectorization width: "
8282              << NV("VectorizationFactor", VF.Width)
8283              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
8284     });
8285   }
8286 
8287   Optional<MDNode *> RemainderLoopID =
8288       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
8289                                       LLVMLoopVectorizeFollowupEpilogue});
8290   if (RemainderLoopID.hasValue()) {
8291     L->setLoopID(RemainderLoopID.getValue());
8292   } else {
8293     if (DisableRuntimeUnroll)
8294       AddRuntimeUnrollDisableMetaData(L);
8295 
8296     // Mark the loop as already vectorized to avoid vectorizing again.
8297     Hints.setAlreadyVectorized();
8298   }
8299 
8300   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8301   return true;
8302 }
8303 
8304 LoopVectorizeResult LoopVectorizePass::runImpl(
8305     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8306     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8307     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
8308     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8309     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
8310   SE = &SE_;
8311   LI = &LI_;
8312   TTI = &TTI_;
8313   DT = &DT_;
8314   BFI = &BFI_;
8315   TLI = TLI_;
8316   AA = &AA_;
8317   AC = &AC_;
8318   GetLAA = &GetLAA_;
8319   DB = &DB_;
8320   ORE = &ORE_;
8321   PSI = PSI_;
8322 
8323   // Don't attempt if
8324   // 1. the target claims to have no vector registers, and
8325   // 2. interleaving won't help ILP.
8326   //
8327   // The second condition is necessary because, even if the target has no
8328   // vector registers, loop vectorization may still enable scalar
8329   // interleaving.
8330   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8331       TTI->getMaxInterleaveFactor(1) < 2)
8332     return LoopVectorizeResult(false, false);
8333 
8334   bool Changed = false, CFGChanged = false;
8335 
8336   // The vectorizer requires loops to be in simplified form.
8337   // Since simplification may add new inner loops, it has to run before the
8338   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
8340   // vectorized.
8341   for (auto &L : *LI)
8342     Changed |= CFGChanged |=
8343         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8344 
8345   // Build up a worklist of inner-loops to vectorize. This is necessary as
8346   // the act of vectorizing or partially unrolling a loop creates new loops
8347   // and can invalidate iterators across the loops.
8348   SmallVector<Loop *, 8> Worklist;
8349 
8350   for (Loop *L : *LI)
8351     collectSupportedLoops(*L, LI, ORE, Worklist);
8352 
8353   LoopsAnalyzed += Worklist.size();
8354 
8355   // Now walk the identified inner loops.
8356   while (!Worklist.empty()) {
8357     Loop *L = Worklist.pop_back_val();
8358 
8359     // For the inner loops we actually process, form LCSSA to simplify the
8360     // transform.
8361     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8362 
8363     Changed |= CFGChanged |= processLoop(L);
8364   }
8365 
8366   // Process each loop nest in the function.
8367   return LoopVectorizeResult(Changed, CFGChanged);
8368 }
8369 
8370 PreservedAnalyses LoopVectorizePass::run(Function &F,
8371                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
8414 }
8415