1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
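// For example, with a vector width of four, a scalar loop that conceptually
// executes
//   for (i = 0; i < n; i += 1) a[i] = b[i] + 1;
// is (roughly) rewritten so that each iteration computes a[i..i+3] from
// b[i..i+3] with single SIMD instructions and increments i by four, with any
// left-over iterations handled by the scalar remainder (epilogue) loop.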
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
// Indicates that an epilogue is undesired and that predication is preferred.
182 // This means that the vectorizer will try to fold the loop-tail (epilogue)
183 // into the loop and predicate the loop body accordingly.
184 static cl::opt<bool> PreferPredicateOverEpilog(
185     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
186     cl::desc("Indicate that an epilogue is undesired, predication should be "
187              "used instead."));
188 
189 static cl::opt<bool> MaximizeBandwidth(
190     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting the vectorization factor, "
             "which will be determined by the smallest type in the loop."));
193 
194 static cl::opt<bool> EnableInterleavedMemAccesses(
195     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
196     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
197 
198 /// An interleave-group may need masking if it resides in a block that needs
199 /// predication, or in order to mask away gaps.
200 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
201     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
202     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
203 
204 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
205     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
207              "below this number"));
208 
209 static cl::opt<unsigned> ForceTargetNumScalarRegs(
210     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
211     cl::desc("A flag that overrides the target's number of scalar registers."));
212 
213 static cl::opt<unsigned> ForceTargetNumVectorRegs(
214     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
215     cl::desc("A flag that overrides the target's number of vector registers."));
216 
217 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
218     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's max interleave factor for "
220              "scalar loops."));
221 
222 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
223     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
224     cl::desc("A flag that overrides the target's max interleave factor for "
225              "vectorized loops."));
226 
227 static cl::opt<unsigned> ForceTargetInstructionCost(
228     "force-target-instruction-cost", cl::init(0), cl::Hidden,
229     cl::desc("A flag that overrides the target's expected cost for "
230              "an instruction to a single constant value. Mostly "
231              "useful for getting consistent testing."));
232 
233 static cl::opt<unsigned> SmallLoopCost(
234     "small-loop-cost", cl::init(20), cl::Hidden,
235     cl::desc(
236         "The cost of a loop that is considered 'small' by the interleaver."));
237 
238 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
239     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
240     cl::desc("Enable the use of the block frequency analysis to access PGO "
241              "heuristics minimizing code growth in cold regions and being more "
242              "aggressive in hot regions."));
243 
244 // Runtime interleave loops for load/store throughput.
245 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
246     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
247     cl::desc(
248         "Enable runtime interleaving until load/store ports are saturated"));
249 
250 /// The number of stores in a loop that are allowed to need predication.
251 static cl::opt<unsigned> NumberOfStoresToPredicate(
252     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
253     cl::desc("Max number of stores to be predicated behind an if."));
254 
255 static cl::opt<bool> EnableIndVarRegisterHeur(
256     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
257     cl::desc("Count the induction variable only once when interleaving"));
258 
259 static cl::opt<bool> EnableCondStoresVectorization(
260     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));
262 
263 static cl::opt<unsigned> MaxNestedScalarReductionIC(
264     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
265     cl::desc("The maximum interleave count to use when interleaving a scalar "
266              "reduction in a nested loop."));
267 
268 static cl::opt<bool>
269     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
270                            cl::Hidden,
271                            cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
273 
274 static cl::opt<bool> PreferPredicatedReductionSelect(
275     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
276     cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));
278 
279 cl::opt<bool> EnableVPlanNativePath(
280     "enable-vplan-native-path", cl::init(false), cl::Hidden,
281     cl::desc("Enable VPlan-native vectorization path with "
282              "support for outer loop vectorization."));
283 
284 // FIXME: Remove this switch once we have divergence analysis. Currently we
285 // assume divergent non-backedge branches when this switch is true.
286 cl::opt<bool> EnableVPlanPredication(
287     "enable-vplan-predication", cl::init(false), cl::Hidden,
288     cl::desc("Enable VPlan-native vectorization path predicator with "
289              "support for outer loop vectorization."));
290 
291 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
293 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
294 // verification of the H-CFGs built.
295 static cl::opt<bool> VPlanBuildStressTest(
296     "vplan-build-stress-test", cl::init(false), cl::Hidden,
297     cl::desc(
298         "Build VPlan for every supported loop nest in the function and bail "
299         "out right after the build (stress test the VPlan H-CFG construction "
300         "in the VPlan-native vectorization path)."));
301 
302 cl::opt<bool> llvm::EnableLoopInterleaving(
303     "interleave-loops", cl::init(true), cl::Hidden,
304     cl::desc("Enable loop interleaving in Loop vectorization passes"));
305 cl::opt<bool> llvm::EnableLoopVectorization(
306     "vectorize-loops", cl::init(true), cl::Hidden,
307     cl::desc("Run the Loop vectorization passes"));
308 
309 /// A helper function that returns the type of loaded or stored value.
310 static Type *getMemInstValueType(Value *I) {
311   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
312          "Expected Load or Store instruction");
313   if (auto *LI = dyn_cast<LoadInst>(I))
314     return LI->getType();
315   return cast<StoreInst>(I)->getValueOperand()->getType();
316 }
317 
318 /// A helper function that returns true if the given type is irregular. The
319 /// type is irregular if its allocated size doesn't equal the store size of an
320 /// element of the corresponding vector type at the given vectorization factor.
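/// For example, with a common X86-64 data layout, x86_fp80 has a type size of
/// 80 bits but an alloc size of 128 bits, so an array of x86_fp80 would have
/// padding between elements and the type would be considered irregular.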
321 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
322   assert(!VF.Scalable && "scalable vectors not yet supported.");
323   // Determine if an array of VF elements of type Ty is "bitcast compatible"
324   // with a <VF x Ty> vector.
325   if (VF.isVector()) {
326     auto *VectorTy = VectorType::get(Ty, VF);
327     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
328   }
329 
330   // If the vectorization factor is one, we just check if an array of type Ty
331   // requires padding between elements.
332   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
333 }
334 
335 /// A helper function that returns the reciprocal of the block probability of
336 /// predicated blocks. If we return X, we are assuming the predicated block
337 /// will execute once for every X iterations of the loop header.
338 ///
339 /// TODO: We should use actual block probability here, if available. Currently,
340 ///       we always assume predicated blocks have a 50% chance of executing.
341 static unsigned getReciprocalPredBlockProb() { return 2; }
342 
343 /// A helper function that adds a 'fast' flag to floating-point operations.
344 static Value *addFastMathFlag(Value *V) {
345   if (isa<FPMathOperator>(V))
346     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
347   return V;
348 }
349 
350 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
351   if (isa<FPMathOperator>(V))
352     cast<Instruction>(V)->setFastMathFlags(FMF);
353   return V;
354 }
355 
356 /// A helper function that returns an integer or floating-point constant with
357 /// value C.
358 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
359   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
360                            : ConstantFP::get(Ty, C);
361 }
362 
363 /// Returns "best known" trip count for the specified loop \p L as defined by
364 /// the following procedure:
365 ///   1) Returns exact trip count if it is known.
366 ///   2) Returns expected trip count according to profile data if any.
367 ///   3) Returns upper bound estimate if it is known.
368 ///   4) Returns None if all of the above failed.
369 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
370   // Check if exact trip count is known.
371   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
372     return ExpectedTC;
373 
374   // Check if there is an expected trip count available from profile data.
375   if (LoopVectorizeWithBlockFrequency)
376     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
377       return EstimatedTC;
378 
379   // Check if upper bound estimate is known.
380   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
381     return ExpectedTC;
382 
383   return None;
384 }
385 
386 namespace llvm {
387 
388 /// InnerLoopVectorizer vectorizes loops which contain only one basic
389 /// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or into
/// multiple scalar copies. This class also implements the following
/// features:
392 /// * It inserts an epilogue loop for handling loops that don't have iteration
393 ///   counts that are known to be a multiple of the vectorization factor.
394 /// * It handles the code generation for reduction variables.
395 /// * Scalarization (implementation using scalars) of un-vectorizable
396 ///   instructions.
397 /// InnerLoopVectorizer does not perform any vectorization-legality
398 /// checks, and relies on the caller to check for the different legality
399 /// aspects. The InnerLoopVectorizer relies on the
400 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
402 class InnerLoopVectorizer {
403 public:
404   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
405                       LoopInfo *LI, DominatorTree *DT,
406                       const TargetLibraryInfo *TLI,
407                       const TargetTransformInfo *TTI, AssumptionCache *AC,
408                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
409                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
410                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
411                       ProfileSummaryInfo *PSI)
412       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
413         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
414         Builder(PSE.getSE()->getContext()),
415         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
416         BFI(BFI), PSI(PSI) {
417     // Query this against the original loop and save it here because the profile
418     // of the original loop header may change as the transformation happens.
419     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
420         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
421   }
422 
423   virtual ~InnerLoopVectorizer() = default;
424 
425   /// Create a new empty loop that will contain vectorized instructions later
426   /// on, while the old loop will be used as the scalar remainder. Control flow
427   /// is generated around the vectorized (and scalar epilogue) loops consisting
428   /// of various checks and bypasses. Return the pre-header block of the new
429   /// loop.
430   BasicBlock *createVectorizedLoopSkeleton();
431 
432   /// Widen a single instruction within the innermost loop.
433   void widenInstruction(Instruction &I, VPUser &Operands,
434                         VPTransformState &State);
435 
436   /// Widen a single call instruction within the innermost loop.
437   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
438                             VPTransformState &State);
439 
440   /// Widen a single select instruction within the innermost loop.
441   void widenSelectInstruction(SelectInst &I, VPUser &Operands,
442                               bool InvariantCond, VPTransformState &State);
443 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
445   void fixVectorizedLoop();
446 
447   // Return true if any runtime check is added.
448   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
449 
450   /// A type for vectorized values in the new loop. Each value from the
451   /// original loop, when vectorized, is represented by UF vector values in the
452   /// new unrolled loop, where UF is the unroll factor.
453   using VectorParts = SmallVector<Value *, 2>;
454 
455   /// Vectorize a single GetElementPtrInst based on information gathered and
456   /// decisions taken during planning.
457   void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
458                 ElementCount VF, bool IsPtrLoopInvariant,
459                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
460 
461   /// Vectorize a single PHINode in a block. This method handles the induction
462   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
463   /// arbitrary length vectors.
464   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
465 
466   /// A helper function to scalarize a single Instruction in the innermost loop.
467   /// Generates a sequence of scalar instances for each lane between \p MinLane
468   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
469   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
470   /// Instr's operands.
471   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
472                             const VPIteration &Instance, bool IfPredicateInstr,
473                             VPTransformState &State);
474 
475   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
476   /// is provided, the integer induction variable will first be truncated to
477   /// the corresponding type.
478   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
479 
480   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
481   /// vector or scalar value on-demand if one is not yet available. When
482   /// vectorizing a loop, we visit the definition of an instruction before its
483   /// uses. When visiting the definition, we either vectorize or scalarize the
484   /// instruction, creating an entry for it in the corresponding map. (In some
485   /// cases, such as induction variables, we will create both vector and scalar
486   /// entries.) Then, as we encounter uses of the definition, we derive values
487   /// for each scalar or vector use unless such a value is already available.
488   /// For example, if we scalarize a definition and one of its uses is vector,
489   /// we build the required vector on-demand with an insertelement sequence
490   /// when visiting the use. Otherwise, if the use is scalar, we can use the
491   /// existing scalar definition.
492   ///
493   /// Return a value in the new loop corresponding to \p V from the original
494   /// loop at unroll index \p Part. If the value has already been vectorized,
495   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
496   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
497   /// a new vector value on-demand by inserting the scalar values into a vector
498   /// with an insertelement sequence. If the value has been neither vectorized
499   /// nor scalarized, it must be loop invariant, so we simply broadcast the
500   /// value into a vector.
501   Value *getOrCreateVectorValue(Value *V, unsigned Part);
502 
503   /// Return a value in the new loop corresponding to \p V from the original
504   /// loop at unroll and vector indices \p Instance. If the value has been
505   /// vectorized but not scalarized, the necessary extractelement instruction
506   /// will be generated.
507   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
508 
509   /// Construct the vector value of a scalarized value \p V one lane at a time.
510   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
511 
512   /// Try to vectorize interleaved access group \p Group with the base address
513   /// given in \p Addr, optionally masking the vector operations if \p
514   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
515   /// values in the vectorized loop.
516   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
517                                 VPTransformState &State, VPValue *Addr,
518                                 VPValue *BlockInMask = nullptr);
519 
520   /// Vectorize Load and Store instructions with the base address given in \p
521   /// Addr, optionally masking the vector operations if \p BlockInMask is
522   /// non-null. Use \p State to translate given VPValues to IR values in the
523   /// vectorized loop.
524   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
525                                   VPValue *Addr, VPValue *StoredValue,
526                                   VPValue *BlockInMask);
527 
528   /// Set the debug location in the builder using the debug location in
529   /// the instruction.
530   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
531 
532   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
534 
535 protected:
536   friend class LoopVectorizationPlanner;
537 
538   /// A small list of PHINodes.
539   using PhiVector = SmallVector<PHINode *, 4>;
540 
541   /// A type for scalarized values in the new loop. Each value from the
542   /// original loop, when scalarized, is represented by UF x VF scalar values
543   /// in the new unrolled loop, where UF is the unroll factor and VF is the
544   /// vectorization factor.
545   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
546 
547   /// Set up the values of the IVs correctly when exiting the vector loop.
548   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
549                     Value *CountRoundDown, Value *EndValue,
550                     BasicBlock *MiddleBlock);
551 
552   /// Create a new induction variable inside L.
553   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
554                                    Value *Step, Instruction *DL);
555 
556   /// Handle all cross-iteration phis in the header.
557   void fixCrossIterationPHIs();
558 
559   /// Fix a first-order recurrence. This is the second phase of vectorizing
560   /// this phi node.
561   void fixFirstOrderRecurrence(PHINode *Phi);
562 
563   /// Fix a reduction cross-iteration phi. This is the second phase of
564   /// vectorizing this phi node.
565   void fixReduction(PHINode *Phi);
566 
567   /// Clear NSW/NUW flags from reduction instructions if necessary.
568   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
569 
  /// The loop exit block may have single-value PHI nodes with some
  /// incoming value. While vectorizing, we only handled real values
  /// that were defined inside the loop, and we should have one value for
573   /// each predecessor of its parent basic block. See PR14725.
574   void fixLCSSAPHIs();
575 
576   /// Iteratively sink the scalarized operands of a predicated instruction into
577   /// the block that was created for it.
578   void sinkScalarOperands(Instruction *PredInst);
579 
580   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
581   /// represented as.
582   void truncateToMinimalBitwidths();
583 
584   /// Create a broadcast instruction. This method generates a broadcast
585   /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...,
  /// which is needed because each iteration in the loop corresponds to a SIMD
  /// element.
589   virtual Value *getBroadcastInstrs(Value *V);
590 
591   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
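  /// For example, for a 4-element Val = <%x, %x, %x, %x>, StartIdx = 0 and
  /// Step = 1, the result is <%x, %x + 1, %x + 2, %x + 3>.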
594   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
595                                Instruction::BinaryOps Opcode =
596                                Instruction::BinaryOpsEnd);
597 
598   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
599   /// variable on which to base the steps, \p Step is the size of the step, and
600   /// \p EntryVal is the value from the original loop that maps to the steps.
601   /// Note that \p EntryVal doesn't have to be an induction variable - it
602   /// can also be a truncate instruction.
603   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
604                         const InductionDescriptor &ID);
605 
606   /// Create a vector induction phi node based on an existing scalar one. \p
607   /// EntryVal is the value from the original loop that maps to the vector phi
608   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
609   /// truncate instruction, instead of widening the original IV, we widen a
610   /// version of the IV truncated to \p EntryVal's type.
611   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
612                                        Value *Step, Instruction *EntryVal);
613 
614   /// Returns true if an instruction \p I should be scalarized instead of
615   /// vectorized for the chosen vectorization factor.
616   bool shouldScalarizeInstruction(Instruction *I) const;
617 
618   /// Returns true if we should generate a scalar version of \p IV.
619   bool needsScalarInduction(Instruction *IV) const;
620 
621   /// If there is a cast involved in the induction variable \p ID, which should
622   /// be ignored in the vectorized loop body, this function records the
623   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
624   /// cast. We had already proved that the casted Phi is equal to the uncasted
625   /// Phi in the vectorized loop (under a runtime guard), and therefore
626   /// there is no need to vectorize the cast - the same value can be used in the
627   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
630   ///
631   /// \p EntryVal is the value from the original loop that maps to the vector
632   /// phi node and is used to distinguish what is the IV currently being
633   /// processed - original one (if \p EntryVal is a phi corresponding to the
634   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
636   /// latter case \p EntryVal is a TruncInst and we must not record anything for
637   /// that IV, but it's error-prone to expect callers of this routine to care
638   /// about that, hence this explicit parameter.
639   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
640                                              const Instruction *EntryVal,
641                                              Value *VectorLoopValue,
642                                              unsigned Part,
643                                              unsigned Lane = UINT_MAX);
644 
645   /// Generate a shuffle sequence that will reverse the vector Vec.
646   virtual Value *reverseVector(Value *Vec);
647 
648   /// Returns (and creates if needed) the original loop trip count.
649   Value *getOrCreateTripCount(Loop *NewLoop);
650 
651   /// Returns (and creates if needed) the trip count of the widened loop.
652   Value *getOrCreateVectorTripCount(Loop *NewLoop);
653 
654   /// Returns a bitcasted value to the requested vector type.
655   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
656   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
657                                 const DataLayout &DL);
658 
659   /// Emit a bypass check to see if the vector trip count is zero, including if
660   /// it overflows.
661   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
662 
663   /// Emit a bypass check to see if all of the SCEV assumptions we've
664   /// had to make are correct.
665   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
666 
667   /// Emit bypass checks to check any memory assumptions we may have made.
668   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
669 
670   /// Compute the transformed value of Index at offset StartValue using step
671   /// StepValue.
672   /// For integer induction, returns StartValue + Index * StepValue.
673   /// For pointer induction, returns StartValue[Index * StepValue].
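  /// For example, for an integer induction with StartValue 7 and StepValue 3,
  /// an Index of 2 is transformed to 7 + 2 * 3 = 13.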
674   /// FIXME: The newly created binary instructions should contain nsw/nuw
675   /// flags, which can be found from the original scalar operations.
676   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
677                               const DataLayout &DL,
678                               const InductionDescriptor &ID) const;
679 
680   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
681   /// vector loop preheader, middle block and scalar preheader. Also
682   /// allocate a loop object for the new vector loop and return it.
683   Loop *createVectorLoopSkeleton(StringRef Prefix);
684 
685   /// Create new phi nodes for the induction variables to resume iteration count
686   /// in the scalar epilogue, from where the vectorized loop left off (given by
687   /// \p VectorTripCount).
688   void createInductionResumeValues(Loop *L, Value *VectorTripCount);
689 
690   /// Complete the loop skeleton by adding debug MDs, creating appropriate
691   /// conditional branches in the middle block, preparing the builder and
692   /// running the verifier. Take in the vector loop \p L as argument, and return
693   /// the preheader of the completed vector loop.
694   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
695 
696   /// Add additional metadata to \p To that was not present on \p Orig.
697   ///
698   /// Currently this is used to add the noalias annotations based on the
699   /// inserted memchecks.  Use this for instructions that are *cloned* into the
700   /// vector loop.
701   void addNewMetadata(Instruction *To, const Instruction *Orig);
702 
703   /// Add metadata from one instruction to another.
704   ///
705   /// This includes both the original MDs from \p From and additional ones (\see
706   /// addNewMetadata).  Use this for *newly created* instructions in the vector
707   /// loop.
708   void addMetadata(Instruction *To, Instruction *From);
709 
710   /// Similar to the previous function but it adds the metadata to a
711   /// vector of instructions.
712   void addMetadata(ArrayRef<Value *> To, Instruction *From);
713 
714   /// The original loop.
715   Loop *OrigLoop;
716 
717   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
718   /// dynamic knowledge to simplify SCEV expressions and converts them to a
719   /// more usable form.
720   PredicatedScalarEvolution &PSE;
721 
722   /// Loop Info.
723   LoopInfo *LI;
724 
725   /// Dominator Tree.
726   DominatorTree *DT;
727 
728   /// Alias Analysis.
729   AAResults *AA;
730 
731   /// Target Library Info.
732   const TargetLibraryInfo *TLI;
733 
734   /// Target Transform Info.
735   const TargetTransformInfo *TTI;
736 
737   /// Assumption Cache.
738   AssumptionCache *AC;
739 
740   /// Interface to emit optimization remarks.
741   OptimizationRemarkEmitter *ORE;
742 
743   /// LoopVersioning.  It's only set up (non-null) if memchecks were
744   /// used.
745   ///
746   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
748   std::unique_ptr<LoopVersioning> LVer;
749 
750   /// The vectorization SIMD factor to use. Each vector will have this many
751   /// vector elements.
752   ElementCount VF;
753 
754   /// The vectorization unroll factor to use. Each scalar is vectorized to this
755   /// many different vector instructions.
756   unsigned UF;
757 
  /// The builder that we use.
759   IRBuilder<> Builder;
760 
761   // --- Vectorization state ---
762 
763   /// The vector-loop preheader.
764   BasicBlock *LoopVectorPreHeader;
765 
766   /// The scalar-loop preheader.
767   BasicBlock *LoopScalarPreHeader;
768 
769   /// Middle Block between the vector and the scalar.
770   BasicBlock *LoopMiddleBlock;
771 
772   /// The ExitBlock of the scalar loop.
773   BasicBlock *LoopExitBlock;
774 
775   /// The vector loop body.
776   BasicBlock *LoopVectorBody;
777 
778   /// The scalar loop body.
779   BasicBlock *LoopScalarBody;
780 
781   /// A list of all bypass blocks. The first block is the entry of the loop.
782   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
783 
784   /// The new Induction variable which was added to the new block.
785   PHINode *Induction = nullptr;
786 
787   /// The induction variable of the old basic block.
788   PHINode *OldInduction = nullptr;
789 
790   /// Maps values from the original loop to their corresponding values in the
791   /// vectorized loop. A key value can map to either vector values, scalar
792   /// values or both kinds of values, depending on whether the key was
793   /// vectorized and scalarized.
794   VectorizerValueMap VectorLoopValueMap;
795 
796   /// Store instructions that were predicated.
797   SmallVector<Instruction *, 4> PredicatedInstructions;
798 
799   /// Trip count of the original loop.
800   Value *TripCount = nullptr;
801 
802   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
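  /// (e.g., with TripCount 17, VF 4 and UF 2, this is 17 - 17 % 8 = 16).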
803   Value *VectorTripCount = nullptr;
804 
805   /// The legality analysis.
806   LoopVectorizationLegality *Legal;
807 
  /// The profitability analysis.
809   LoopVectorizationCostModel *Cost;
810 
811   // Record whether runtime checks are added.
812   bool AddedSafetyChecks = false;
813 
814   // Holds the end values for each induction variable. We save the end values
815   // so we can later fix-up the external users of the induction variables.
816   DenseMap<PHINode *, Value *> IVEndValues;
817 
818   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
819   // fixed up at the end of vector code generation.
820   SmallVector<PHINode *, 8> OrigPHIsToFix;
821 
822   /// BFI and PSI are used to check for profile guided size optimizations.
823   BlockFrequencyInfo *BFI;
824   ProfileSummaryInfo *PSI;
825 
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
828   bool OptForSizeBasedOnProfile;
829 };
830 
831 class InnerLoopUnroller : public InnerLoopVectorizer {
832 public:
833   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
834                     LoopInfo *LI, DominatorTree *DT,
835                     const TargetLibraryInfo *TLI,
836                     const TargetTransformInfo *TTI, AssumptionCache *AC,
837                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
838                     LoopVectorizationLegality *LVL,
839                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
840                     ProfileSummaryInfo *PSI)
841       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
842                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
843                             BFI, PSI) {}
844 
845 private:
846   Value *getBroadcastInstrs(Value *V) override;
847   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
848                        Instruction::BinaryOps Opcode =
849                        Instruction::BinaryOpsEnd) override;
850   Value *reverseVector(Value *Vec) override;
851 };
852 
853 } // end namespace llvm
854 
/// Look for a meaningful debug location on the instruction or its
856 /// operands.
857 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
858   if (!I)
859     return I;
860 
861   DebugLoc Empty;
862   if (I->getDebugLoc() != Empty)
863     return I;
864 
865   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
866     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
867       if (OpInst->getDebugLoc() != Empty)
868         return OpInst;
869   }
870 
871   return I;
872 }
873 
874 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
875   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
876     const DILocation *DIL = Inst->getDebugLoc();
877     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
878         !isa<DbgInfoIntrinsic>(Inst)) {
879       assert(!VF.Scalable && "scalable vectors not yet supported.");
880       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.Min);
881       if (NewDIL)
882         B.SetCurrentDebugLocation(NewDIL.getValue());
883       else
884         LLVM_DEBUG(dbgs()
885                    << "Failed to create new discriminator: "
886                    << DIL->getFilename() << " Line: " << DIL->getLine());
887     }
888     else
889       B.SetCurrentDebugLocation(DIL);
890   } else
891     B.SetCurrentDebugLocation(DebugLoc());
892 }
893 
894 /// Write a record \p DebugMsg about vectorization failure to the debug
895 /// output stream. If \p I is passed, it is an instruction that prevents
896 /// vectorization.
897 #ifndef NDEBUG
898 static void debugVectorizationFailure(const StringRef DebugMsg,
899     Instruction *I) {
900   dbgs() << "LV: Not vectorizing: " << DebugMsg;
901   if (I != nullptr)
902     dbgs() << " " << *I;
903   else
904     dbgs() << '.';
905   dbgs() << '\n';
906 }
907 #endif
908 
909 /// Create an analysis remark that explains why vectorization failed
910 ///
911 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
912 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
913 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
914 /// the location of the remark.  \return the remark object that can be
915 /// streamed to.
916 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
917     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
918   Value *CodeRegion = TheLoop->getHeader();
919   DebugLoc DL = TheLoop->getStartLoc();
920 
921   if (I) {
922     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
924     // using the loop's.
925     if (I->getDebugLoc())
926       DL = I->getDebugLoc();
927   }
928 
929   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
930   R << "loop not vectorized: ";
931   return R;
932 }
933 
934 namespace llvm {
935 
936 void reportVectorizationFailure(const StringRef DebugMsg,
937     const StringRef OREMsg, const StringRef ORETag,
938     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
939   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
940   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
941   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
942                 ORETag, TheLoop, I) << OREMsg);
943 }
944 
945 } // end namespace llvm
946 
947 #ifndef NDEBUG
948 /// \return string containing a file name and a line # for the given loop.
949 static std::string getDebugLocString(const Loop *L) {
950   std::string Result;
951   if (L) {
952     raw_string_ostream OS(Result);
953     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
954       LoopDbgLoc.print(OS);
955     else
956       // Just print the module name.
957       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
958     OS.flush();
959   }
960   return Result;
961 }
962 #endif
963 
964 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
965                                          const Instruction *Orig) {
966   // If the loop was versioned with memchecks, add the corresponding no-alias
967   // metadata.
968   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
969     LVer->annotateInstWithNoAlias(To, Orig);
970 }
971 
972 void InnerLoopVectorizer::addMetadata(Instruction *To,
973                                       Instruction *From) {
974   propagateMetadata(To, From);
975   addNewMetadata(To, From);
976 }
977 
978 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
979                                       Instruction *From) {
980   for (Value *V : To) {
981     if (Instruction *I = dyn_cast<Instruction>(V))
982       addMetadata(I, From);
983   }
984 }
985 
986 namespace llvm {
987 
// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
990 enum ScalarEpilogueLowering {
991 
992   // The default: allowing scalar epilogues.
993   CM_ScalarEpilogueAllowed,
994 
995   // Vectorization with OptForSize: don't allow epilogues.
996   CM_ScalarEpilogueNotAllowedOptSize,
997 
  // A special case of vectorization with OptForSize: loops with a very small
999   // trip count are considered for vectorization under OptForSize, thereby
1000   // making sure the cost of their loop body is dominant, free of runtime
1001   // guards and scalar iteration overheads.
1002   CM_ScalarEpilogueNotAllowedLowTripLoop,
1003 
1004   // Loop hint predicate indicating an epilogue is undesired.
1005   CM_ScalarEpilogueNotNeededUsePredicate
1006 };
1007 
1008 /// LoopVectorizationCostModel - estimates the expected speedups due to
1009 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for
1011 /// a number of reasons. In this class we mainly attempt to predict the
1012 /// expected speedup/slowdowns due to the supported instruction set. We use the
1013 /// TargetTransformInfo to query the different backends for the cost of
1014 /// different operations.
1015 class LoopVectorizationCostModel {
1016 public:
1017   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1018                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1019                              LoopVectorizationLegality *Legal,
1020                              const TargetTransformInfo &TTI,
1021                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1022                              AssumptionCache *AC,
1023                              OptimizationRemarkEmitter *ORE, const Function *F,
1024                              const LoopVectorizeHints *Hints,
1025                              InterleavedAccessInfo &IAI)
1026       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1027         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1028         Hints(Hints), InterleaveInfo(IAI) {}
1029 
1030   /// \return An upper bound for the vectorization factor, or None if
1031   /// vectorization and interleaving should be avoided up front.
1032   Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
1033 
1034   /// \return True if runtime checks are required for vectorization, and false
1035   /// otherwise.
1036   bool runtimeChecksRequired();
1037 
1038   /// \return The most profitable vectorization factor and the cost of that VF.
1039   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1040   /// then this vectorization factor will be selected if vectorization is
1041   /// possible.
1042   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1043 
1044   /// Setup cost-based decisions for user vectorization factor.
1045   void selectUserVectorizationFactor(ElementCount UserVF) {
1046     collectUniformsAndScalars(UserVF);
1047     collectInstsToScalarize(UserVF);
1048   }
1049 
1050   /// \return The size (in bits) of the smallest and widest types in the code
1051   /// that needs to be vectorized. We ignore values that remain scalar such as
1052   /// 64 bit loop indices.
1053   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1054 
1055   /// \return The desired interleave count.
1056   /// If interleave count has been specified by metadata it will be returned.
1057   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1058   /// are the selected vectorization factor and the cost of the selected VF.
1059   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1060 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function makes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1068   void setCostBasedWideningDecision(ElementCount VF);
1069 
1070   /// A struct that represents some properties of the register usage
1071   /// of a loop.
1072   struct RegisterUsage {
1073     /// Holds the number of loop invariant values that are used in the loop.
1074     /// The key is ClassID of target-provided register class.
1075     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1076     /// Holds the maximum number of concurrent live intervals in the loop.
1077     /// The key is ClassID of target-provided register class.
1078     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1079   };
1080 
1081   /// \return Returns information about the register usages of the loop for the
1082   /// given vectorization factors.
1083   SmallVector<RegisterUsage, 8>
1084   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1085 
1086   /// Collect values we want to ignore in the cost model.
1087   void collectValuesToIgnore();
1088 
1089   /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1091   void collectInLoopReductions();
1092 
1093   /// \returns The smallest bitwidth each instruction can be represented with.
1094   /// The vector equivalents of these instructions should be truncated to this
1095   /// type.
1096   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1097     return MinBWs;
1098   }
1099 
1100   /// \returns True if it is more profitable to scalarize instruction \p I for
1101   /// vectorization factor \p VF.
1102   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1103     assert(VF.isVector() &&
1104            "Profitable to scalarize relevant only for VF > 1.");
1105 
1106     // Cost model is not run in the VPlan-native path - return conservative
1107     // result until this changes.
1108     if (EnableVPlanNativePath)
1109       return false;
1110 
1111     auto Scalars = InstsToScalarize.find(VF);
1112     assert(Scalars != InstsToScalarize.end() &&
1113            "VF not yet analyzed for scalarization profitability");
1114     return Scalars->second.find(I) != Scalars->second.end();
1115   }
1116 
1117   /// Returns true if \p I is known to be uniform after vectorization.
1118   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1119     if (VF.isScalar())
1120       return true;
1121 
1122     // Cost model is not run in the VPlan-native path - return conservative
1123     // result until this changes.
1124     if (EnableVPlanNativePath)
1125       return false;
1126 
1127     auto UniformsPerVF = Uniforms.find(VF);
1128     assert(UniformsPerVF != Uniforms.end() &&
1129            "VF not yet analyzed for uniformity");
1130     return UniformsPerVF->second.count(I);
1131   }
1132 
1133   /// Returns true if \p I is known to be scalar after vectorization.
1134   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1135     if (VF.isScalar())
1136       return true;
1137 
1138     // Cost model is not run in the VPlan-native path - return conservative
1139     // result until this changes.
1140     if (EnableVPlanNativePath)
1141       return false;
1142 
1143     auto ScalarsPerVF = Scalars.find(VF);
1144     assert(ScalarsPerVF != Scalars.end() &&
1145            "Scalar values are not calculated for VF");
1146     return ScalarsPerVF->second.count(I);
1147   }
1148 
1149   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1150   /// for vectorization factor \p VF.
1151   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1152     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1153            !isProfitableToScalarize(I, VF) &&
1154            !isScalarAfterVectorization(I, VF);
1155   }
1156 
1157   /// Decision that was taken during cost calculation for memory instruction.
1158   enum InstWidening {
1159     CM_Unknown,
1160     CM_Widen,         // For consecutive accesses with stride +1.
1161     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1162     CM_Interleave,
1163     CM_GatherScatter,
1164     CM_Scalarize
1165   };
1166 
1167   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1168   /// instruction \p I and vector width \p VF.
1169   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1170                            unsigned Cost) {
1171     assert(VF.isVector() && "Expected VF >=2");
1172     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1173   }
1174 
1175   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1176   /// interleaving group \p Grp and vector width \p VF.
1177   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1178                            ElementCount VF, InstWidening W, unsigned Cost) {
1179     assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1181     /// But the cost will be assigned to one instruction only.
1182     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1183       if (auto *I = Grp->getMember(i)) {
1184         if (Grp->getInsertPos() == I)
1185           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1186         else
1187           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1188       }
1189     }
1190   }
1191 
1192   /// Return the cost model decision for the given instruction \p I and vector
1193   /// width \p VF. Return CM_Unknown if this instruction did not pass
1194   /// through the cost modeling.
1195   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1196     assert(!VF.Scalable && "scalable vectors not yet supported.");
1197     assert(VF.isVector() && "Expected VF >=2");
1198 
1199     // Cost model is not run in the VPlan-native path - return conservative
1200     // result until this changes.
1201     if (EnableVPlanNativePath)
1202       return CM_GatherScatter;
1203 
1204     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1205     auto Itr = WideningDecisions.find(InstOnVF);
1206     if (Itr == WideningDecisions.end())
1207       return CM_Unknown;
1208     return Itr->second.first;
1209   }
1210 
1211   /// Return the vectorization cost for the given instruction \p I and vector
1212   /// width \p VF.
1213   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1214     assert(VF.isVector() && "Expected VF >=2");
1215     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1216     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1217            "The cost is not calculated");
1218     return WideningDecisions[InstOnVF].second;
1219   }
1220 
1221   /// Return True if instruction \p I is an optimizable truncate whose operand
1222   /// is an induction variable. Such a truncate will be removed by adding a new
1223   /// induction variable with the destination type.
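  /// For example, in a loop like
  ///   for (int64_t i = 0; i < n; ++i)
  ///     A[i] = (int32_t)i;
  /// the 64-to-32-bit truncate of the induction variable can usually be
  /// replaced by a new 32-bit induction variable, provided the truncate is not
  /// already free for the target.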
1224   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1225     // If the instruction is not a truncate, return false.
1226     auto *Trunc = dyn_cast<TruncInst>(I);
1227     if (!Trunc)
1228       return false;
1229 
1230     // Get the source and destination types of the truncate.
1231     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1232     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1233 
1234     // If the truncate is free for the given types, return false. Replacing a
1235     // free truncate with an induction variable would add an induction variable
1236     // update instruction to each iteration of the loop. We exclude from this
1237     // check the primary induction variable since it will need an update
1238     // instruction regardless.
1239     Value *Op = Trunc->getOperand(0);
1240     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1241       return false;
1242 
1243     // If the truncated value is not an induction variable, return false.
1244     return Legal->isInductionPhi(Op);
1245   }
1246 
1247   /// Collects the instructions to scalarize for each predicated instruction in
1248   /// the loop.
1249   void collectInstsToScalarize(ElementCount VF);
1250 
1251   /// Collect Uniform and Scalar values for the given \p VF.
1252   /// The sets depend on CM decision for Load/Store instructions
1253   /// that may be vectorized as interleave, gather-scatter or scalarized.
1254   void collectUniformsAndScalars(ElementCount VF) {
1255     // Do the analysis once.
1256     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1257       return;
1258     setCostBasedWideningDecision(VF);
1259     collectLoopUniforms(VF);
1260     collectLoopScalars(VF);
1261   }
1262 
1263   /// Returns true if the target machine supports masked store operation
1264   /// for the given \p DataType and kind of access to \p Ptr.
1265   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1266     return Legal->isConsecutivePtr(Ptr) &&
1267            TTI.isLegalMaskedStore(DataType, Alignment);
1268   }
1269 
1270   /// Returns true if the target machine supports masked load operation
1271   /// for the given \p DataType and kind of access to \p Ptr.
1272   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1273     return Legal->isConsecutivePtr(Ptr) &&
1274            TTI.isLegalMaskedLoad(DataType, Alignment);
1275   }
1276 
1277   /// Returns true if the target machine supports masked scatter operation
1278   /// for the given \p DataType.
1279   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1280     return TTI.isLegalMaskedScatter(DataType, Alignment);
1281   }
1282 
1283   /// Returns true if the target machine supports masked gather operation
1284   /// for the given \p DataType.
1285   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1286     return TTI.isLegalMaskedGather(DataType, Alignment);
1287   }
1288 
1289   /// Returns true if the target machine can represent \p V as a masked gather
1290   /// or scatter operation.
1291   bool isLegalGatherOrScatter(Value *V) {
1292     bool LI = isa<LoadInst>(V);
1293     bool SI = isa<StoreInst>(V);
1294     if (!LI && !SI)
1295       return false;
1296     auto *Ty = getMemInstValueType(V);
1297     Align Align = getLoadStoreAlignment(V);
1298     return (LI && isLegalMaskedGather(Ty, Align)) ||
1299            (SI && isLegalMaskedScatter(Ty, Align));
1300   }
1301 
1302   /// Returns true if \p I is an instruction that will be scalarized with
1303   /// predication. Such instructions include conditional stores and
1304   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1307   bool isScalarWithPredication(Instruction *I,
1308                                ElementCount VF = ElementCount::getFixed(1));
1309 
1310   // Returns true if \p I is an instruction that will be predicated either
1311   // through scalar predication or masked load/store or masked gather/scatter.
1312   // Superset of instructions that return true for isScalarWithPredication.
1313   bool isPredicatedInst(Instruction *I) {
1314     if (!blockNeedsPredication(I->getParent()))
1315       return false;
1316     // Loads and stores that need some form of masked operation are predicated
1317     // instructions.
1318     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1319       return Legal->isMaskRequired(I);
1320     return isScalarWithPredication(I);
1321   }
1322 
1323   /// Returns true if \p I is a memory instruction with consecutive memory
1324   /// access that can be widened.
1325   bool
1326   memoryInstructionCanBeWidened(Instruction *I,
1327                                 ElementCount VF = ElementCount::getFixed(1));
1328 
1329   /// Returns true if \p I is a memory instruction in an interleaved-group
1330   /// of memory accesses that can be vectorized with wide vector loads/stores
1331   /// and shuffles.
1332   bool
1333   interleavedAccessCanBeWidened(Instruction *I,
1334                                 ElementCount VF = ElementCount::getFixed(1));
1335 
1336   /// Check if \p Instr belongs to any interleaved access group.
1337   bool isAccessInterleaved(Instruction *Instr) {
1338     return InterleaveInfo.isInterleaved(Instr);
1339   }
1340 
1341   /// Get the interleaved access group that \p Instr belongs to.
1342   const InterleaveGroup<Instruction> *
1343   getInterleavedAccessGroup(Instruction *Instr) {
1344     return InterleaveInfo.getInterleaveGroup(Instr);
1345   }
1346 
1347   /// Returns true if an interleaved group requires a scalar iteration
1348   /// to handle accesses with gaps, and there is nothing preventing us from
1349   /// creating a scalar epilogue.
1350   bool requiresScalarEpilogue() const {
1351     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1352   }
1353 
  /// Returns true if a scalar epilogue is allowed, i.e. it is not prohibited
  /// by optsize or a loop hint annotation.
1356   bool isScalarEpilogueAllowed() const {
1357     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1358   }
1359 
1360   /// Returns true if all loop blocks should be masked to fold tail loop.
1361   bool foldTailByMasking() const { return FoldTailByMasking; }
1362 
1363   bool blockNeedsPredication(BasicBlock *BB) {
1364     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1365   }
1366 
1367   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1368   /// nodes to the chain of instructions representing the reductions. Uses a
1369   /// MapVector to ensure deterministic iteration order.
1370   using ReductionChainMap =
1371       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1372 
1373   /// Return the chain of instructions representing an inloop reduction.
1374   const ReductionChainMap &getInLoopReductionChains() const {
1375     return InLoopReductionChains;
1376   }
1377 
1378   /// Returns true if the Phi is part of an inloop reduction.
1379   bool isInLoopReduction(PHINode *Phi) const {
1380     return InLoopReductionChains.count(Phi);
1381   }
1382 
1383   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1384   /// with factor VF.  Return the cost of the instruction, including
1385   /// scalarization overhead if it's needed.
1386   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1387 
  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1393   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1394                              bool &NeedToScalarize);
1395 
1396   /// Invalidates decisions already taken by the cost model.
1397   void invalidateCostModelingDecisions() {
1398     WideningDecisions.clear();
1399     Uniforms.clear();
1400     Scalars.clear();
1401   }
1402 
1403 private:
1404   unsigned NumPredStores = 0;
1405 
1406   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1407   /// than zero. One is returned if vectorization should best be avoided due
1408   /// to cost.
1409   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1410 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1418   using VectorizationCostTy = std::pair<unsigned, bool>;
1419 
1420   /// Returns the expected execution cost. The unit of the cost does
1421   /// not matter because we use the 'cost' units to compare different
1422   /// vector widths. The cost that is returned is *not* normalized by
1423   /// the factor width.
1424   VectorizationCostTy expectedCost(ElementCount VF);
1425 
1426   /// Returns the execution time cost of an instruction for a given vector
1427   /// width. Vector width of one means scalar.
1428   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1429 
1430   /// The cost-computation logic from getInstructionCost which provides
1431   /// the vector type as an output parameter.
1432   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1433 
1434   /// Calculate vectorization cost of memory instruction \p I.
1435   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1436 
1437   /// The cost computation for scalarized memory instruction.
1438   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1439 
1440   /// The cost computation for interleaving group of memory instructions.
1441   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1442 
1443   /// The cost computation for Gather/Scatter instruction.
1444   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1445 
1446   /// The cost computation for widening instruction \p I with consecutive
1447   /// memory access.
1448   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1449 
  /// The cost calculation for Load/Store instruction \p I with uniform pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element).
1454   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1455 
1456   /// Estimate the overhead of scalarizing an instruction. This is a
1457   /// convenience wrapper for the type-based getScalarizationOverhead API.
1458   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1459 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1462   bool isConsecutiveLoadOrStore(Instruction *I);
1463 
1464   /// Returns true if an artificially high cost for emulated masked memrefs
1465   /// should be used.
1466   bool useEmulatedMaskMemRefHack(Instruction *I);
1467 
1468   /// Map of scalar integer values to the smallest bitwidth they can be legally
1469   /// represented as. The vector equivalents of these values should be truncated
1470   /// to this type.
1471   MapVector<Instruction *, uint64_t> MinBWs;
1472 
1473   /// A type representing the costs for instructions if they were to be
1474   /// scalarized rather than vectorized. The entries are Instruction-Cost
1475   /// pairs.
1476   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1477 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1480   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1481 
1482   /// Records whether it is allowed to have the original scalar loop execute at
1483   /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not evenly divisible by the
  /// VF, or as a peel-loop to handle gaps in interleave-groups.
1487   /// Under optsize and when the trip count is very small we don't allow any
1488   /// iterations to execute in the scalar loop.
1489   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1490 
1491   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1492   bool FoldTailByMasking = false;
1493 
1494   /// A map holding scalar costs for different vectorization factors. The
1495   /// presence of a cost for an instruction in the mapping indicates that the
1496   /// instruction will be scalarized when vectorizing with the associated
1497   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1498   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1499 
1500   /// Holds the instructions known to be uniform after vectorization.
1501   /// The data is collected per VF.
1502   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1503 
1504   /// Holds the instructions known to be scalar after vectorization.
1505   /// The data is collected per VF.
1506   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1507 
1508   /// Holds the instructions (address computations) that are forced to be
1509   /// scalarized.
1510   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1511 
1512   /// PHINodes of the reductions that should be expanded in-loop along with
1513   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1515   ReductionChainMap InLoopReductionChains;
1516 
1517   /// Returns the expected difference in cost from scalarizing the expression
1518   /// feeding a predicated instruction \p PredInst. The instructions to
1519   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1520   /// non-negative return value implies the expression will be scalarized.
1521   /// Currently, only single-use chains are considered for scalarization.
1522   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1523                               ElementCount VF);
1524 
1525   /// Collect the instructions that are uniform after vectorization. An
1526   /// instruction is uniform if we represent it with a single scalar value in
1527   /// the vectorized loop corresponding to each vector iteration. Examples of
1528   /// uniform instructions include pointer operands of consecutive or
1529   /// interleaved memory accesses. Note that although uniformity implies an
1530   /// instruction will be scalar, the reverse is not true. In general, a
1531   /// scalarized instruction will be represented by VF scalar values in the
1532   /// vectorized loop, each corresponding to an iteration of the original
1533   /// scalar loop.
1534   void collectLoopUniforms(ElementCount VF);
1535 
1536   /// Collect the instructions that are scalar after vectorization. An
1537   /// instruction is scalar if it is known to be uniform or will be scalarized
1538   /// during vectorization. Non-uniform scalarized instructions will be
1539   /// represented by VF values in the vectorized loop, each corresponding to an
1540   /// iteration of the original scalar loop.
1541   void collectLoopScalars(ElementCount VF);
1542 
  /// Keeps cost model vectorization decisions and costs for instructions.
1544   /// Right now it is used for memory instructions only.
1545   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1546                                 std::pair<InstWidening, unsigned>>;
1547 
1548   DecisionList WideningDecisions;
1549 
1550   /// Returns true if \p V is expected to be vectorized and it needs to be
1551   /// extracted.
1552   bool needsExtract(Value *V, ElementCount VF) const {
1553     Instruction *I = dyn_cast<Instruction>(V);
1554     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1555         TheLoop->isLoopInvariant(I))
1556       return false;
1557 
1558     // Assume we can vectorize V (and hence we need extraction) if the
1559     // scalars are not computed yet. This can happen, because it is called
1560     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1561     // the scalars are collected. That should be a safe assumption in most
1562     // cases, because we check if the operands have vectorizable types
1563     // beforehand in LoopVectorizationLegality.
1564     return Scalars.find(VF) == Scalars.end() ||
1565            !isScalarAfterVectorization(I, VF);
  }
1567 
1568   /// Returns a range containing only operands needing to be extracted.
1569   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1570                                                    ElementCount VF) {
1571     return SmallVector<Value *, 4>(make_filter_range(
1572         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1573   }
1574 
1575 public:
1576   /// The loop that we evaluate.
1577   Loop *TheLoop;
1578 
1579   /// Predicated scalar evolution analysis.
1580   PredicatedScalarEvolution &PSE;
1581 
1582   /// Loop Info analysis.
1583   LoopInfo *LI;
1584 
1585   /// Vectorization legality.
1586   LoopVectorizationLegality *Legal;
1587 
1588   /// Vector target information.
1589   const TargetTransformInfo &TTI;
1590 
1591   /// Target Library Info.
1592   const TargetLibraryInfo *TLI;
1593 
1594   /// Demanded bits analysis.
1595   DemandedBits *DB;
1596 
1597   /// Assumption cache.
1598   AssumptionCache *AC;
1599 
1600   /// Interface to emit optimization remarks.
1601   OptimizationRemarkEmitter *ORE;
1602 
1603   const Function *TheFunction;
1604 
1605   /// Loop Vectorize Hint.
1606   const LoopVectorizeHints *Hints;
1607 
1608   /// The interleave access information contains groups of interleaved accesses
1609   /// with the same stride and close to each other.
1610   InterleavedAccessInfo &InterleaveInfo;
1611 
1612   /// Values to ignore in the cost model.
1613   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1614 
1615   /// Values to ignore in the cost model when VF > 1.
1616   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1617 };
1618 
1619 } // end namespace llvm
1620 
1621 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1622 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
1625 // explicit. Interleave hints are not allowed either. These limitations will be
1626 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1628 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1629 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1630 // provides *explicit vectorization hints* (LV can bypass legal checks and
1631 // assume that vectorization is legal). However, both hints are implemented
1632 // using the same metadata (llvm.loop.vectorize, processed by
1633 // LoopVectorizeHints). This will be fixed in the future when the native IR
1634 // representation for pragma 'omp simd' is introduced.
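// For example, an outer loop annotated with
//   #pragma omp simd simdlen(8)
// or
//   #pragma clang loop vectorize(enable) vectorize_width(8)
// is treated here as explicitly vectorized with a vector length of 8.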
1635 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1636                                    OptimizationRemarkEmitter *ORE) {
1637   assert(!OuterLp->empty() && "This is not an outer loop");
1638   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1639 
1640   // Only outer loops with an explicit vectorization hint are supported.
1641   // Unannotated outer loops are ignored.
1642   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1643     return false;
1644 
1645   Function *Fn = OuterLp->getHeader()->getParent();
1646   if (!Hints.allowVectorization(Fn, OuterLp,
1647                                 true /*VectorizeOnlyWhenForced*/)) {
1648     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1649     return false;
1650   }
1651 
1652   if (Hints.getInterleave() > 1) {
1653     // TODO: Interleave support is future work.
1654     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1655                          "outer loops.\n");
1656     Hints.emitRemarkWithHints();
1657     return false;
1658   }
1659 
1660   return true;
1661 }
1662 
1663 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1664                                   OptimizationRemarkEmitter *ORE,
1665                                   SmallVectorImpl<Loop *> &V) {
1666   // Collect inner loops and outer loops without irreducible control flow. For
1667   // now, only collect outer loops that have explicit vectorization hints. If we
1668   // are stress testing the VPlan H-CFG construction, we collect the outermost
1669   // loop of every loop nest.
1670   if (L.empty() || VPlanBuildStressTest ||
1671       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1672     LoopBlocksRPO RPOT(&L);
1673     RPOT.perform(LI);
1674     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1675       V.push_back(&L);
1676       // TODO: Collect inner loops inside marked outer loops in case
1677       // vectorization fails for the outer loop. Do not invoke
1678       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1679       // already known to be reducible. We can use an inherited attribute for
1680       // that.
1681       return;
1682     }
1683   }
1684   for (Loop *InnerL : L)
1685     collectSupportedLoops(*InnerL, LI, ORE, V);
1686 }
1687 
1688 namespace {
1689 
1690 /// The LoopVectorize Pass.
1691 struct LoopVectorize : public FunctionPass {
1692   /// Pass identification, replacement for typeid
1693   static char ID;
1694 
1695   LoopVectorizePass Impl;
1696 
1697   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1698                          bool VectorizeOnlyWhenForced = false)
1699       : FunctionPass(ID),
1700         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1701     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1702   }
1703 
1704   bool runOnFunction(Function &F) override {
1705     if (skipFunction(F))
1706       return false;
1707 
1708     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1709     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1710     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1711     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1712     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1713     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1714     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1715     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1716     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1717     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1718     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1719     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1720     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1721 
1722     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1723         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1724 
1725     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1726                         GetLAA, *ORE, PSI).MadeAnyChange;
1727   }
1728 
1729   void getAnalysisUsage(AnalysisUsage &AU) const override {
1730     AU.addRequired<AssumptionCacheTracker>();
1731     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1732     AU.addRequired<DominatorTreeWrapperPass>();
1733     AU.addRequired<LoopInfoWrapperPass>();
1734     AU.addRequired<ScalarEvolutionWrapperPass>();
1735     AU.addRequired<TargetTransformInfoWrapperPass>();
1736     AU.addRequired<AAResultsWrapperPass>();
1737     AU.addRequired<LoopAccessLegacyAnalysis>();
1738     AU.addRequired<DemandedBitsWrapperPass>();
1739     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1740     AU.addRequired<InjectTLIMappingsLegacy>();
1741 
1742     // We currently do not preserve loopinfo/dominator analyses with outer loop
1743     // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
1745     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1746     if (!EnableVPlanNativePath) {
1747       AU.addPreserved<LoopInfoWrapperPass>();
1748       AU.addPreserved<DominatorTreeWrapperPass>();
1749     }
1750 
1751     AU.addPreserved<BasicAAWrapperPass>();
1752     AU.addPreserved<GlobalsAAWrapperPass>();
1753     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1754   }
1755 };
1756 
1757 } // end anonymous namespace
1758 
1759 //===----------------------------------------------------------------------===//
1760 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1761 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1762 //===----------------------------------------------------------------------===//
1763 
1764 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1768   Instruction *Instr = dyn_cast<Instruction>(V);
1769   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1770                      (!Instr ||
1771                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1772   // Place the code for broadcasting invariant variables in the new preheader.
1773   IRBuilder<>::InsertPointGuard Guard(Builder);
1774   if (SafeToHoist)
1775     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1776 
1777   // Broadcast the scalar into all locations in the vector.
1778   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1779 
1780   return Shuf;
1781 }
1782 
1783 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1784     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1785   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1786          "Expected either an induction phi-node or a truncate of it!");
1787   Value *Start = II.getStartValue();
1788 
1789   // Construct the initial value of the vector IV in the vector loop preheader
1790   auto CurrIP = Builder.saveIP();
1791   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1792   if (isa<TruncInst>(EntryVal)) {
1793     assert(Start->getType()->isIntegerTy() &&
1794            "Truncation requires an integer type");
1795     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1796     Step = Builder.CreateTrunc(Step, TruncType);
1797     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1798   }
1799   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1800   Value *SteppedStart =
1801       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1802 
1803   // We create vector phi nodes for both integer and floating-point induction
1804   // variables. Here, we determine the kind of arithmetic we will perform.
1805   Instruction::BinaryOps AddOp;
1806   Instruction::BinaryOps MulOp;
1807   if (Step->getType()->isIntegerTy()) {
1808     AddOp = Instruction::Add;
1809     MulOp = Instruction::Mul;
1810   } else {
1811     AddOp = II.getInductionOpcode();
1812     MulOp = Instruction::FMul;
1813   }
1814 
1815   // Multiply the vectorization factor by the step using integer or
1816   // floating-point arithmetic as appropriate.
1817   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF.Min);
1818   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1819 
1820   // Create a vector splat to use in the induction update.
1821   //
1822   // FIXME: If the step is non-constant, we create the vector splat with
1823   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1824   //        handle a constant vector splat.
1825   assert(!VF.Scalable && "scalable vectors not yet supported.");
1826   Value *SplatVF = isa<Constant>(Mul)
1827                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1828                        : Builder.CreateVectorSplat(VF, Mul);
1829   Builder.restoreIP(CurrIP);
1830 
1831   // We may need to add the step a number of times, depending on the unroll
1832   // factor. The last of those goes into the PHI.
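  // For example, with UF = 2 the chain is roughly:
  //   part 0 uses %vec.ind,
  //   part 1 uses %step.add     = %vec.ind + splat(VF * Step),
  //   and %vec.ind.next         = %step.add + splat(VF * Step) feeds the phi.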
1833   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1834                                     &*LoopVectorBody->getFirstInsertionPt());
1835   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1836   Instruction *LastInduction = VecInd;
1837   for (unsigned Part = 0; Part < UF; ++Part) {
1838     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1839 
1840     if (isa<TruncInst>(EntryVal))
1841       addMetadata(LastInduction, EntryVal);
1842     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1843 
1844     LastInduction = cast<Instruction>(addFastMathFlag(
1845         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1846     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1847   }
1848 
1849   // Move the last step to the end of the latch block. This ensures consistent
1850   // placement of all induction updates.
1851   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1852   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1853   auto *ICmp = cast<Instruction>(Br->getCondition());
1854   LastInduction->moveBefore(ICmp);
1855   LastInduction->setName("vec.ind.next");
1856 
1857   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1858   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1859 }
1860 
1861 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1862   return Cost->isScalarAfterVectorization(I, VF) ||
1863          Cost->isProfitableToScalarize(I, VF);
1864 }
1865 
1866 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1867   if (shouldScalarizeInstruction(IV))
1868     return true;
1869   auto isScalarInst = [&](User *U) -> bool {
1870     auto *I = cast<Instruction>(U);
1871     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1872   };
1873   return llvm::any_of(IV->users(), isScalarInst);
1874 }
1875 
1876 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1877     const InductionDescriptor &ID, const Instruction *EntryVal,
1878     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1879   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1880          "Expected either an induction phi-node or a truncate of it!");
1881 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1888   if (isa<TruncInst>(EntryVal))
1889     return;
1890 
1891   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1892   if (Casts.empty())
1893     return;
1894   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any exist) have no uses outside the
1896   // induction update chain itself.
1897   Instruction *CastInst = *Casts.begin();
1898   if (Lane < UINT_MAX)
1899     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1900   else
1901     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1902 }
1903 
1904 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1905   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1906          "Primary induction variable must have an integer type");
1907 
1908   auto II = Legal->getInductionVars().find(IV);
1909   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1910 
1911   auto ID = II->second;
1912   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1913 
1914   // The value from the original loop to which we are mapping the new induction
1915   // variable.
1916   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1917 
1918   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1919 
1920   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1922   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1923     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1924            "Induction step should be loop invariant");
1925     if (PSE.getSE()->isSCEVable(IV->getType())) {
1926       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1927       return Exp.expandCodeFor(Step, Step->getType(),
1928                                LoopVectorPreHeader->getTerminator());
1929     }
1930     return cast<SCEVUnknown>(Step)->getValue();
1931   };
1932 
1933   // The scalar value to broadcast. This is derived from the canonical
1934   // induction variable. If a truncation type is given, truncate the canonical
1935   // induction variable and step. Otherwise, derive these values from the
1936   // induction descriptor.
1937   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1938     Value *ScalarIV = Induction;
1939     if (IV != OldInduction) {
1940       ScalarIV = IV->getType()->isIntegerTy()
1941                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1942                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1943                                           IV->getType());
1944       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1945       ScalarIV->setName("offset.idx");
1946     }
1947     if (Trunc) {
1948       auto *TruncType = cast<IntegerType>(Trunc->getType());
1949       assert(Step->getType()->isIntegerTy() &&
1950              "Truncation requires an integer step");
1951       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1952       Step = Builder.CreateTrunc(Step, TruncType);
1953     }
1954     return ScalarIV;
1955   };
1956 
1957   // Create the vector values from the scalar IV, in the absence of creating a
1958   // vector IV.
1959   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1960     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1961     for (unsigned Part = 0; Part < UF; ++Part) {
1962       assert(!VF.Scalable && "scalable vectors not yet supported.");
1963       Value *EntryPart = getStepVector(Broadcasted, VF.Min * Part, Step,
1964                                        ID.getInductionOpcode());
1965       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1966       if (Trunc)
1967         addMetadata(EntryPart, Trunc);
1968       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1969     }
1970   };
1971 
1972   // Now do the actual transformations, and start with creating the step value.
1973   Value *Step = CreateStepValue(ID.getStep());
1974   if (VF.isZero() || VF.isScalar()) {
1975     Value *ScalarIV = CreateScalarIV(Step);
1976     CreateSplatIV(ScalarIV, Step);
1977     return;
1978   }
1979 
1980   // Determine if we want a scalar version of the induction variable. This is
1981   // true if the induction variable itself is not widened, or if it has at
1982   // least one user in the loop that is not widened.
1983   auto NeedsScalarIV = needsScalarInduction(EntryVal);
1984   if (!NeedsScalarIV) {
1985     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1986     return;
1987   }
1988 
1989   // Try to create a new independent vector induction variable. If we can't
1990   // create the phi node, we will splat the scalar induction variable in each
1991   // loop iteration.
1992   if (!shouldScalarizeInstruction(EntryVal)) {
1993     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1994     Value *ScalarIV = CreateScalarIV(Step);
1995     // Create scalar steps that can be used by instructions we will later
1996     // scalarize. Note that the addition of the scalar steps will not increase
1997     // the number of instructions in the loop in the common case prior to
1998     // InstCombine. We will be trading one vector extract for each scalar step.
1999     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2000     return;
2001   }
2002 
2003   // All IV users are scalar instructions, so only emit a scalar IV, not a
2004   // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2005   // predicate used by the masked loads/stores.
2006   Value *ScalarIV = CreateScalarIV(Step);
2007   if (!Cost->isScalarEpilogueAllowed())
2008     CreateSplatIV(ScalarIV, Step);
2009   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2010 }
2011 
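// Returns a vector in which each lane of Val has been advanced by a multiple
// of Step. For example, with StartIdx = 0, an integer Step %s and a 4-element
// Val, the result is roughly:
//   %induction = add %Val, mul(<0, 1, 2, 3>, splat(%s))
// For floating-point inductions the same pattern is emitted with fmul and the
// given FAdd/FSub opcode.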
2012 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2013                                           Instruction::BinaryOps BinOp) {
2014   // Create and check the types.
2015   auto *ValVTy = cast<VectorType>(Val->getType());
2016   int VLen = ValVTy->getNumElements();
2017 
2018   Type *STy = Val->getType()->getScalarType();
2019   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2020          "Induction Step must be an integer or FP");
2021   assert(Step->getType() == STy && "Step has wrong type");
2022 
2023   SmallVector<Constant *, 8> Indices;
2024 
2025   if (STy->isIntegerTy()) {
2026     // Create a vector of consecutive numbers from zero to VF.
2027     for (int i = 0; i < VLen; ++i)
2028       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2029 
2030     // Add the consecutive indices to the vector value.
2031     Constant *Cv = ConstantVector::get(Indices);
2032     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2033     Step = Builder.CreateVectorSplat(VLen, Step);
2034     assert(Step->getType() == Val->getType() && "Invalid step vec");
2035     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2036     // which can be found from the original scalar operations.
2037     Step = Builder.CreateMul(Cv, Step);
2038     return Builder.CreateAdd(Val, Step, "induction");
2039   }
2040 
2041   // Floating point induction.
2042   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2043          "Binary Opcode should be specified for FP induction");
2044   // Create a vector of consecutive numbers from zero to VF.
2045   for (int i = 0; i < VLen; ++i)
2046     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2047 
2048   // Add the consecutive indices to the vector value.
2049   Constant *Cv = ConstantVector::get(Indices);
2050 
2051   Step = Builder.CreateVectorSplat(VLen, Step);
2052 
2053   // Floating point operations had to be 'fast' to enable the induction.
2054   FastMathFlags Flags;
2055   Flags.setFast();
2056 
2057   Value *MulOp = Builder.CreateFMul(Cv, Step);
2058   if (isa<Instruction>(MulOp))
2059     // Have to check, MulOp may be a constant
2060     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2061 
2062   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2063   if (isa<Instruction>(BOp))
2064     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2065   return BOp;
2066 }
2067 
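// Computes the per-lane scalar values of an induction for each unroll part.
// For example, with UF = 2, VF.Min = 4 and a non-uniform EntryVal, part 0
// produces ScalarIV + {0,1,2,3} * Step and part 1 produces
// ScalarIV + {4,5,6,7} * Step; a uniform EntryVal only materializes lane 0 of
// each part.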
2068 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2069                                            Instruction *EntryVal,
2070                                            const InductionDescriptor &ID) {
2071   // We shouldn't have to build scalar steps if we aren't vectorizing.
2072   assert(VF.isVector() && "VF should be greater than one");
2073   assert(!VF.Scalable &&
2074          "the code below assumes a fixed number of elements at compile time");
2075   // Get the value type and ensure it and the step have the same integer type.
2076   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2077   assert(ScalarIVTy == Step->getType() &&
2078          "Val and Step should have the same type");
2079 
2080   // We build scalar steps for both integer and floating-point induction
2081   // variables. Here, we determine the kind of arithmetic we will perform.
2082   Instruction::BinaryOps AddOp;
2083   Instruction::BinaryOps MulOp;
2084   if (ScalarIVTy->isIntegerTy()) {
2085     AddOp = Instruction::Add;
2086     MulOp = Instruction::Mul;
2087   } else {
2088     AddOp = ID.getInductionOpcode();
2089     MulOp = Instruction::FMul;
2090   }
2091 
2092   // Determine the number of scalars we need to generate for each unroll
2093   // iteration. If EntryVal is uniform, we only need to generate the first
2094   // lane. Otherwise, we generate all VF values.
2095   unsigned Lanes =
2096       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2097           ? 1
2098           : VF.Min;
2099   // Compute the scalar steps and save the results in VectorLoopValueMap.
2100   for (unsigned Part = 0; Part < UF; ++Part) {
2101     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2102       auto *StartIdx =
2103           getSignedIntOrFpConstant(ScalarIVTy, VF.Min * Part + Lane);
2104       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2105       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2106       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2107       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2108     }
2109   }
2110 }
2111 
2112 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2113   assert(V != Induction && "The new induction variable should not be used.");
2114   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2115   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2116 
2117   // If we have a stride that is replaced by one, do it here. Defer this for
2118   // the VPlan-native path until we start running Legal checks in that path.
2119   if (!EnableVPlanNativePath && Legal->hasStride(V))
2120     V = ConstantInt::get(V->getType(), 1);
2121 
2122   // If we have a vector mapped to this value, return it.
2123   if (VectorLoopValueMap.hasVectorValue(V, Part))
2124     return VectorLoopValueMap.getVectorValue(V, Part);
2125 
2126   // If the value has not been vectorized, check if it has been scalarized
2127   // instead. If it has been scalarized, and we actually need the value in
2128   // vector form, we will construct the vector values on demand.
2129   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2130     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2131 
2132     // If we've scalarized a value, that value should be an instruction.
2133     auto *I = cast<Instruction>(V);
2134 
2135     // If we aren't vectorizing, we can just copy the scalar map values over to
2136     // the vector map.
2137     if (VF == 1) {
2138       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2139       return ScalarValue;
2140     }
2141 
2142     // Get the last scalar instruction we generated for V and Part. If the value
2143     // is known to be uniform after vectorization, this corresponds to lane zero
2144     // of the Part unroll iteration. Otherwise, the last instruction is the one
2145     // we created for the last vector lane of the Part unroll iteration.
2146     assert(!VF.Scalable && "scalable vectors not yet supported.");
2147     unsigned LastLane =
2148         Cost->isUniformAfterVectorization(I, VF) ? 0 : VF.Min - 1;
2149     auto *LastInst = cast<Instruction>(
2150         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2151 
2152     // Set the insert point after the last scalarized instruction. This ensures
2153     // the insertelement sequence will directly follow the scalar definitions.
2154     auto OldIP = Builder.saveIP();
2155     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2156     Builder.SetInsertPoint(&*NewIP);
2157 
2158     // However, if we are vectorizing, we need to construct the vector values.
2159     // If the value is known to be uniform after vectorization, we can just
2160     // broadcast the scalar value corresponding to lane zero for each unroll
2161     // iteration. Otherwise, we construct the vector values using insertelement
2162     // instructions. Since the resulting vectors are stored in
2163     // VectorLoopValueMap, we will only generate the insertelements once.
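    // For example, with VF.Min = 4 a non-uniform scalarized value is packed
    // lane by lane, roughly:
    //   %v0 = insertelement <4 x T> undef, T %s0, i32 0
    //   %v1 = insertelement <4 x T> %v0,   T %s1, i32 1
    //   ... and so on for lanes 2 and 3.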
2164     Value *VectorValue = nullptr;
2165     if (Cost->isUniformAfterVectorization(I, VF)) {
2166       VectorValue = getBroadcastInstrs(ScalarValue);
2167       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2168     } else {
2169       // Initialize packing with insertelements to start from undef.
2170       assert(!VF.Scalable && "VF is assumed to be non scalable.");
2171       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2172       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2173       for (unsigned Lane = 0; Lane < VF.Min; ++Lane)
2174         packScalarIntoVectorValue(V, {Part, Lane});
2175       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2176     }
2177     Builder.restoreIP(OldIP);
2178     return VectorValue;
2179   }
2180 
2181   // If this scalar is unknown, assume that it is a constant or that it is
2182   // loop invariant. Broadcast V and save the value for future uses.
2183   Value *B = getBroadcastInstrs(V);
2184   VectorLoopValueMap.setVectorValue(V, Part, B);
2185   return B;
2186 }
2187 
2188 Value *
2189 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2190                                             const VPIteration &Instance) {
2191   // If the value is not an instruction contained in the loop, it should
2192   // already be scalar.
2193   if (OrigLoop->isLoopInvariant(V))
2194     return V;
2195 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2199 
2200   // If the value from the original loop has not been vectorized, it is
2201   // represented by UF x VF scalar values in the new loop. Return the requested
2202   // scalar value.
2203   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2204     return VectorLoopValueMap.getScalarValue(V, Instance);
2205 
2206   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2207   // for the given unroll part. If this entry is not a vector type (i.e., the
2208   // vectorization factor is one), there is no need to generate an
2209   // extractelement instruction.
2210   auto *U = getOrCreateVectorValue(V, Instance.Part);
2211   if (!U->getType()->isVectorTy()) {
2212     assert(VF == 1 && "Value not scalarized has non-vector type");
2213     return U;
2214   }
2215 
2216   // Otherwise, the value from the original loop has been vectorized and is
2217   // represented by UF vector values. Extract and return the requested scalar
2218   // value from the appropriate vector lane.
2219   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2220 }
2221 
2222 void InnerLoopVectorizer::packScalarIntoVectorValue(
2223     Value *V, const VPIteration &Instance) {
2224   assert(V != Induction && "The new induction variable should not be used.");
2225   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2226   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2227 
2228   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2229   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2230   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2231                                             Builder.getInt32(Instance.Lane));
2232   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2233 }
2234 
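// Reverses the lanes of a fixed-width vector. For example, for VF.Min = 4 the
// shuffle mask below is <3, 2, 1, 0>, i.e. roughly:
//   %reverse = shufflevector <4 x T> %Vec, undef, <i32 3, i32 2, i32 1, i32 0>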
2235 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2236   assert(Vec->getType()->isVectorTy() && "Invalid type");
2237   assert(!VF.Scalable && "Cannot reverse scalable vectors");
2238   SmallVector<int, 8> ShuffleMask;
2239   for (unsigned i = 0; i < VF.Min; ++i)
2240     ShuffleMask.push_back(VF.Min - i - 1);
2241 
2242   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2243                                      ShuffleMask, "reverse");
2244 }
2245 
2246 // Return whether we allow using masked interleave-groups (for dealing with
2247 // strided loads/stores that reside in predicated blocks, or for dealing
2248 // with gaps).
2249 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2250   // If an override option has been passed in for interleaved accesses, use it.
2251   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2252     return EnableMaskedInterleavedMemAccesses;
2253 
2254   return TTI.enableMaskedInterleavedAccessVectorization();
2255 }
2256 
2257 // Try to vectorize the interleave group that \p Instr belongs to.
2258 //
2259 // E.g. Translate following interleaved load group (factor = 3):
2260 //   for (i = 0; i < N; i+=3) {
2261 //     R = Pic[i];             // Member of index 0
2262 //     G = Pic[i+1];           // Member of index 1
2263 //     B = Pic[i+2];           // Member of index 2
2264 //     ... // do something to R, G, B
2265 //   }
2266 // To:
2267 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2268 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2269 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2270 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2271 //
2272 // Or translate following interleaved store group (factor = 3):
2273 //   for (i = 0; i < N; i+=3) {
2274 //     ... do something to R, G, B
2275 //     Pic[i]   = R;           // Member of index 0
2276 //     Pic[i+1] = G;           // Member of index 1
2277 //     Pic[i+2] = B;           // Member of index 2
2278 //   }
2279 // To:
2280 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2281 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2282 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2283 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2284 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2285 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2286     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2287     VPValue *Addr, VPValue *BlockInMask) {
2288   Instruction *Instr = Group->getInsertPos();
2289   const DataLayout &DL = Instr->getModule()->getDataLayout();
2290 
2291   // Prepare for the vector type of the interleaved load/store.
2292   Type *ScalarTy = getMemInstValueType(Instr);
2293   unsigned InterleaveFactor = Group->getFactor();
2294   assert(!VF.Scalable && "scalable vectors not yet supported.");
2295   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2296 
2297   // Prepare for the new pointers.
2298   SmallVector<Value *, 2> AddrParts;
2299   unsigned Index = Group->getIndex(Instr);
2300 
2301   // TODO: extend the masked interleaved-group support to reversed access.
2302   assert((!BlockInMask || !Group->isReverse()) &&
2303          "Reversed masked interleave-group not supported.");
2304 
2305   // If the group is reverse, adjust the index to refer to the last vector lane
2306   // instead of the first. We adjust the index from the first vector lane,
2307   // rather than directly getting the pointer for lane VF - 1, because the
2308   // pointer operand of the interleaved access is supposed to be uniform. For
2309   // uniform instructions, we're only required to generate a value for the
2310   // first vector lane in each unroll iteration.
2311   assert(!VF.Scalable &&
2312          "scalable vector reverse operation is not implemented");
2313   if (Group->isReverse())
2314     Index += (VF.Min - 1) * Group->getFactor();
2315 
2316   for (unsigned Part = 0; Part < UF; Part++) {
2317     Value *AddrPart = State.get(Addr, {Part, 0});
2318     setDebugLocFromInst(Builder, AddrPart);
2319 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2331 
2332     bool InBounds = false;
2333     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2334       InBounds = gep->isInBounds();
2335     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2336     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2337 
2338     // Cast to the vector pointer type.
2339     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2340     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2341     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2342   }
2343 
2344   setDebugLocFromInst(Builder, Instr);
2345   Value *UndefVec = UndefValue::get(VecTy);
2346 
2347   Value *MaskForGaps = nullptr;
2348   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2349     assert(!VF.Scalable && "scalable vectors not yet supported.");
2350     MaskForGaps = createBitMaskForGaps(Builder, VF.Min, *Group);
2351     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2352   }
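  // For example, for a group with factor 3 whose member at index 2 is missing
  // and VF.Min = 4, MaskForGaps is roughly
  //   <1,1,0, 1,1,0, 1,1,0, 1,1,0>
  // so lanes that correspond to the gap are never accessed.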
2353 
2354   // Vectorize the interleaved load group.
2355   if (isa<LoadInst>(Instr)) {
2356     // For each unroll part, create a wide load for the group.
2357     SmallVector<Value *, 2> NewLoads;
2358     for (unsigned Part = 0; Part < UF; Part++) {
2359       Instruction *NewLoad;
2360       if (BlockInMask || MaskForGaps) {
2361         assert(useMaskedInterleavedAccesses(*TTI) &&
2362                "masked interleaved groups are not allowed.");
2363         Value *GroupMask = MaskForGaps;
2364         if (BlockInMask) {
2365           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2366           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2367           assert(!VF.Scalable && "scalable vectors not yet supported.");
2368           Value *ShuffledMask = Builder.CreateShuffleVector(
2369               BlockInMaskPart, Undefs,
2370               createReplicatedMask(InterleaveFactor, VF.Min),
2371               "interleaved.mask");
2372           GroupMask = MaskForGaps
2373                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2374                                                 MaskForGaps)
2375                           : ShuffledMask;
2376         }
2377         NewLoad =
2378             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2379                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2382         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2383                                             Group->getAlign(), "wide.vec");
2384       Group->addMetadata(NewLoad);
2385       NewLoads.push_back(NewLoad);
2386     }
2387 
2388     // For each member in the group, shuffle out the appropriate data from the
2389     // wide loads.
2390     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2391       Instruction *Member = Group->getMember(I);
2392 
2393       // Skip the gaps in the group.
2394       if (!Member)
2395         continue;
2396 
2397       assert(!VF.Scalable && "scalable vectors not yet supported.");
2398       auto StrideMask = createStrideMask(I, InterleaveFactor, VF.Min);
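      // For example (assuming VF.Min == 4 and an interleave factor of 2),
      // member 0 uses the stride mask <0,2,4,6> and member 1 uses <1,3,5,7>,
      // i.e. every InterleaveFactor-th element of the wide load.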
2399       for (unsigned Part = 0; Part < UF; Part++) {
2400         Value *StridedVec = Builder.CreateShuffleVector(
2401             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2402 
        // If this member has a different type, cast the result to that type.
2404         if (Member->getType() != ScalarTy) {
2405           assert(!VF.Scalable && "VF is assumed to be non scalable.");
2406           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2407           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2408         }
2409 
2410         if (Group->isReverse())
2411           StridedVec = reverseVector(StridedVec);
2412 
2413         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2414       }
2415     }
2416     return;
2417   }
2418 
  // The sub-vector type for the current instruction.
2420   assert(!VF.Scalable && "VF is assumed to be non scalable.");
2421   auto *SubVT = VectorType::get(ScalarTy, VF);
2422 
2423   // Vectorize the interleaved store group.
2424   for (unsigned Part = 0; Part < UF; Part++) {
2425     // Collect the stored vector from each member.
2426     SmallVector<Value *, 4> StoredVecs;
2427     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2429       Instruction *Member = Group->getMember(i);
2430       assert(Member && "Fail to get a member from an interleaved store group");
2431 
2432       Value *StoredVec = getOrCreateVectorValue(
2433           cast<StoreInst>(Member)->getValueOperand(), Part);
2434       if (Group->isReverse())
2435         StoredVec = reverseVector(StoredVec);
2436 
      // If this member has a different type, cast it to a unified type.
2439       if (StoredVec->getType() != SubVT)
2440         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2441 
2442       StoredVecs.push_back(StoredVec);
2443     }
2444 
2445     // Concatenate all vectors into a wide vector.
2446     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2447 
2448     // Interleave the elements in the wide vector.
2449     assert(!VF.Scalable && "scalable vectors not yet supported.");
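    // As an illustration (assuming VF.Min == 4 and an interleave factor of
    // 2), the interleave mask is <0,4,1,5,2,6,3,7>, so the concatenated
    // vector <a0,a1,a2,a3,b0,b1,b2,b3> is stored as <a0,b0,a1,b1,a2,b2,a3,b3>.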
2450     Value *IVec = Builder.CreateShuffleVector(
2451         WideVec, UndefVec, createInterleaveMask(VF.Min, InterleaveFactor),
2452         "interleaved.vec");
2453 
2454     Instruction *NewStoreInstr;
2455     if (BlockInMask) {
2456       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2457       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2458       Value *ShuffledMask = Builder.CreateShuffleVector(
2459           BlockInMaskPart, Undefs,
2460           createReplicatedMask(InterleaveFactor, VF.Min), "interleaved.mask");
2461       NewStoreInstr = Builder.CreateMaskedStore(
2462           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2463     }
2464     else
2465       NewStoreInstr =
2466           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2467 
2468     Group->addMetadata(NewStoreInstr);
2469   }
2470 }
2471 
2472 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2473                                                      VPTransformState &State,
2474                                                      VPValue *Addr,
2475                                                      VPValue *StoredValue,
2476                                                      VPValue *BlockInMask) {
2477   // Attempt to issue a wide load.
2478   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2479   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2480 
2481   assert((LI || SI) && "Invalid Load/Store instruction");
2482   assert((!SI || StoredValue) && "No stored value provided for widened store");
2483   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2484 
2485   LoopVectorizationCostModel::InstWidening Decision =
2486       Cost->getWideningDecision(Instr, VF);
2487   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2488           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2489           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2490          "CM decision is not to widen the memory instruction");
2491 
2492   Type *ScalarDataTy = getMemInstValueType(Instr);
2493 
2494   assert(!VF.Scalable && "scalable vectors not yet supported.");
2495   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2496   const Align Alignment = getLoadStoreAlignment(Instr);
2497 
2498   // Determine if the pointer operand of the access is either consecutive or
2499   // reverse consecutive.
2500   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2501   bool ConsecutiveStride =
2502       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2503   bool CreateGatherScatter =
2504       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2505 
2506   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2507   // gather/scatter. Otherwise Decision should have been to Scalarize.
2508   assert((ConsecutiveStride || CreateGatherScatter) &&
2509          "The instruction should be scalarized");
2510   (void)ConsecutiveStride;
2511 
2512   VectorParts BlockInMaskParts(UF);
2513   bool isMaskRequired = BlockInMask;
2514   if (isMaskRequired)
2515     for (unsigned Part = 0; Part < UF; ++Part)
2516       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2517 
2518   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2519     // Calculate the pointer for the specific unroll-part.
2520     GetElementPtrInst *PartPtr = nullptr;
2521 
2522     bool InBounds = false;
2523     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2524       InBounds = gep->isInBounds();
2525 
2526     if (Reverse) {
2527       // If the address is consecutive but reversed, then the
2528       // wide store needs to start at the last vector element.
2529       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2530           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.Min)));
2531       PartPtr->setIsInBounds(InBounds);
2532       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2533           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.Min)));
2534       PartPtr->setIsInBounds(InBounds);
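      // E.g. (hypothetical values: VF.Min == 4, Part == 1) the two GEPs above
      // offset the pointer by -4 and then by -3 elements, so the wide access
      // covers elements [-7, -4] relative to Ptr, matching the reversed order
      // of the scalar iterations of this part.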
2535       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2536         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2537     } else {
2538       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2539           ScalarDataTy, Ptr, Builder.getInt32(Part * VF.Min)));
2540       PartPtr->setIsInBounds(InBounds);
2541     }
2542 
2543     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2544     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2545   };
2546 
2547   // Handle Stores:
2548   if (SI) {
2549     setDebugLocFromInst(Builder, SI);
2550 
2551     for (unsigned Part = 0; Part < UF; ++Part) {
2552       Instruction *NewSI = nullptr;
2553       Value *StoredVal = State.get(StoredValue, Part);
2554       if (CreateGatherScatter) {
2555         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2556         Value *VectorGep = State.get(Addr, Part);
2557         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2558                                             MaskPart);
2559       } else {
2560         if (Reverse) {
2561           // If we store to reverse consecutive memory locations, then we need
2562           // to reverse the order of elements in the stored value.
2563           StoredVal = reverseVector(StoredVal);
2564           // We don't want to update the value in the map as it might be used in
2565           // another expression. So don't call resetVectorValue(StoredVal).
2566         }
2567         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2568         if (isMaskRequired)
2569           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2570                                             BlockInMaskParts[Part]);
2571         else
2572           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2573       }
2574       addMetadata(NewSI, SI);
2575     }
2576     return;
2577   }
2578 
2579   // Handle loads.
2580   assert(LI && "Must have a load instruction");
2581   setDebugLocFromInst(Builder, LI);
2582   for (unsigned Part = 0; Part < UF; ++Part) {
2583     Value *NewLI;
2584     if (CreateGatherScatter) {
2585       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2586       Value *VectorGep = State.get(Addr, Part);
2587       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2588                                          nullptr, "wide.masked.gather");
2589       addMetadata(NewLI, LI);
2590     } else {
2591       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2592       if (isMaskRequired)
2593         NewLI = Builder.CreateMaskedLoad(
2594             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2595             "wide.masked.load");
2596       else
2597         NewLI =
2598             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2599 
2600       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2601       addMetadata(NewLI, LI);
2602       if (Reverse)
2603         NewLI = reverseVector(NewLI);
2604     }
2605     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2606   }
2607 }
2608 
2609 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2610                                                const VPIteration &Instance,
2611                                                bool IfPredicateInstr,
2612                                                VPTransformState &State) {
2613   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2614 
2615   setDebugLocFromInst(Builder, Instr);
2616 
  // Does this instruction return a value?
2618   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2619 
2620   Instruction *Cloned = Instr->clone();
2621   if (!IsVoidRetTy)
2622     Cloned->setName(Instr->getName() + ".cloned");
2623 
2624   // Replace the operands of the cloned instructions with their scalar
2625   // equivalents in the new loop.
2626   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2627     auto *NewOp = State.get(User.getOperand(op), Instance);
2628     Cloned->setOperand(op, NewOp);
2629   }
2630   addNewMetadata(Cloned, Instr);
2631 
2632   // Place the cloned scalar in the new loop.
2633   Builder.Insert(Cloned);
2634 
2635   // Add the cloned scalar to the scalar map entry.
2636   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2637 
2638   // If we just cloned a new assumption, add it the assumption cache.
2639   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2640     if (II->getIntrinsicID() == Intrinsic::assume)
2641       AC->registerAssumption(II);
2642 
2643   // End if-block.
2644   if (IfPredicateInstr)
2645     PredicatedInstructions.push_back(Cloned);
2646 }
2647 
2648 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2649                                                       Value *End, Value *Step,
2650                                                       Instruction *DL) {
2651   BasicBlock *Header = L->getHeader();
2652   BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible that no latch exists
  // yet. If so, use the header, as this will be a single-block loop.
2655   if (!Latch)
2656     Latch = Header;
2657 
2658   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2659   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2660   setDebugLocFromInst(Builder, OldInst);
2661   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2662 
2663   Builder.SetInsertPoint(Latch->getTerminator());
2664   setDebugLocFromInst(Builder, OldInst);
2665 
2666   // Create i+1 and fill the PHINode.
2667   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2668   Induction->addIncoming(Start, L->getLoopPreheader());
2669   Induction->addIncoming(Next, Latch);
2670   // Create the compare.
2671   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2672   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2673 
2674   // Now we have two terminators. Remove the old one from the block.
2675   Latch->getTerminator()->eraseFromParent();
2676 
2677   return Induction;
2678 }
2679 
2680 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2681   if (TripCount)
2682     return TripCount;
2683 
2684   assert(L && "Create Trip Count for null loop.");
2685   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2686   // Find the loop boundaries.
2687   ScalarEvolution *SE = PSE.getSE();
2688   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2689   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2690          "Invalid loop count");
2691 
2692   Type *IdxTy = Legal->getWidestInductionType();
2693   assert(IdxTy && "No type for induction");
2694 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count here is if the
  // induction variable was signed, and as such it will not overflow. In that
  // case truncation is legal.
2700   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2701       IdxTy->getPrimitiveSizeInBits())
2702     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2703   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2704 
2705   // Get the total trip count from the count by adding 1.
2706   const SCEV *ExitCount = SE->getAddExpr(
2707       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2708 
2709   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2710 
2711   // Expand the trip count and place the new instructions in the preheader.
2712   // Notice that the pre-header does not change, only the loop body.
2713   SCEVExpander Exp(*SE, DL, "induction");
2714 
2715   // Count holds the overall loop count (N).
2716   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2717                                 L->getLoopPreheader()->getTerminator());
2718 
2719   if (TripCount->getType()->isPointerTy())
2720     TripCount =
2721         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2722                                     L->getLoopPreheader()->getTerminator());
2723 
2724   return TripCount;
2725 }
2726 
2727 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2728   if (VectorTripCount)
2729     return VectorTripCount;
2730 
2731   Value *TC = getOrCreateTripCount(L);
2732   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2733 
2734   Type *Ty = TC->getType();
2735   // This is where we can make the step a runtime constant.
2736   assert(!VF.Scalable && "scalable vectorization is not supported yet");
2737   Constant *Step = ConstantInt::get(Ty, VF.Min * UF);
2738 
2739   // If the tail is to be folded by masking, round the number of iterations N
2740   // up to a multiple of Step instead of rounding down. This is done by first
2741   // adding Step-1 and then rounding down. Note that it's ok if this addition
2742   // overflows: the vector induction variable will eventually wrap to zero given
2743   // that it starts at zero and its Step is a power of two; the loop will then
2744   // exit, with the last early-exit vector comparison also producing all-true.
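  // For example (assuming VF.Min * UF == 8 and a trip count of 10), the
  // rounded-up count is 10 + 7 = 17 and the vector trip count computed below
  // becomes 17 - (17 % 8) = 16, so two masked vector iterations cover all 10
  // original iterations.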
2745   if (Cost->foldTailByMasking()) {
2746     assert(isPowerOf2_32(VF.Min * UF) &&
2747            "VF*UF must be a power of 2 when folding tail by masking");
2748     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF.Min * UF - 1),
2749                            "n.rnd.up");
2750   }
2751 
2752   // Now we need to generate the expression for the part of the loop that the
2753   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2754   // iterations are not required for correctness, or N - Step, otherwise. Step
2755   // is equal to the vectorization factor (number of SIMD elements) times the
2756   // unroll factor (number of SIMD instructions).
2757   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2758 
2759   // If there is a non-reversed interleaved group that may speculatively access
2760   // memory out-of-bounds, we need to ensure that there will be at least one
2761   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2762   // the trip count, we set the remainder to be equal to the step. If the step
2763   // does not evenly divide the trip count, no adjustment is necessary since
2764   // there will already be scalar iterations. Note that the minimum iterations
2765   // check ensures that N >= Step.
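  // For instance (hypothetical values: Step == 8, trip count == 16), R would
  // be 0, so it is bumped to 8 and the vector loop runs 16 - 8 = 8 iterations,
  // leaving 8 iterations for the scalar epilogue.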
2766   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2767     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2768     R = Builder.CreateSelect(IsZero, Step, R);
2769   }
2770 
2771   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2772 
2773   return VectorTripCount;
2774 }
2775 
2776 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2777                                                    const DataLayout &DL) {
2778   // Verify that V is a vector type with same number of elements as DstVTy.
2779   assert(isa<FixedVectorType>(DstVTy) &&
2780          "Vector type is assumed to be fixed width.");
2781   unsigned VF = DstVTy->getNumElements();
2782   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2784   Type *SrcElemTy = SrcVecTy->getElementType();
2785   Type *DstElemTy = DstVTy->getElementType();
2786   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2787          "Vector elements must have same size");
2788 
2789   // Do a direct cast if element types are castable.
2790   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2791     return Builder.CreateBitOrPointerCast(V, DstVTy);
2792   }
  // V cannot be directly cast to the desired vector type.
  // This may happen when V is a floating point vector but DstVTy is a vector
  // of pointers, or vice-versa. Handle it with a two-step bitcast using an
  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
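  // As a sketch (assuming 64-bit pointers), casting <2 x double> to a vector
  // of pointers goes <2 x double> -> <2 x i64> -> <2 x i8*>: a bitcast
  // followed by an inttoptr, since a direct bitcast between FP and pointer
  // vectors is not legal.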
2797   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2798          "Only one type should be a pointer type");
2799   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2800          "Only one type should be a floating point type");
2801   Type *IntTy =
2802       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2803   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2804   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2805   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2806 }
2807 
2808 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2809                                                          BasicBlock *Bypass) {
2810   Value *Count = getOrCreateTripCount(L);
2811   // Reuse existing vector loop preheader for TC checks.
2812   // Note that new preheader block is generated for vector loop.
2813   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2814   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2815 
2816   // Generate code to check if the loop's trip count is less than VF * UF, or
2817   // equal to it in case a scalar epilogue is required; this implies that the
2818   // vector trip count is zero. This check also covers the case where adding one
2819   // to the backedge-taken count overflowed leading to an incorrect trip count
2820   // of zero. In this case we will also jump to the scalar loop.
2821   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2822                                           : ICmpInst::ICMP_ULT;
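  // For example (assuming VF.Min == 4 and UF == 2), the check below branches
  // to the scalar loop when Count < 8, or when Count <= 8 if a scalar
  // epilogue iteration must be left over.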
2823 
2824   // If tail is to be folded, vector loop takes care of all iterations.
2825   Value *CheckMinIters = Builder.getFalse();
2826   if (!Cost->foldTailByMasking()) {
2827     assert(!VF.Scalable && "scalable vectors not yet supported.");
2828     CheckMinIters = Builder.CreateICmp(
2829         P, Count, ConstantInt::get(Count->getType(), VF.Min * UF),
2830         "min.iters.check");
2831   }
2832   // Create new preheader for vector loop.
2833   LoopVectorPreHeader =
2834       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2835                  "vector.ph");
2836 
2837   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2838                                DT->getNode(Bypass)->getIDom()) &&
2839          "TC check is expected to dominate Bypass");
2840 
2841   // Update dominator for Bypass & LoopExit.
2842   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2843   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2844 
2845   ReplaceInstWithInst(
2846       TCCheckBlock->getTerminator(),
2847       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2848   LoopBypassBlocks.push_back(TCCheckBlock);
2849 }
2850 
2851 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2852   // Reuse existing vector loop preheader for SCEV checks.
2853   // Note that new preheader block is generated for vector loop.
2854   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2855 
  // Generate the code that checks the SCEV assumptions we made.
2857   // We want the new basic block to start at the first instruction in a
2858   // sequence of instructions that form a check.
2859   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2860                    "scev.check");
2861   Value *SCEVCheck = Exp.expandCodeForPredicate(
2862       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2863 
2864   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2865     if (C->isZero())
2866       return;
2867 
2868   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2869            (OptForSizeBasedOnProfile &&
2870             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2871          "Cannot SCEV check stride or overflow when optimizing for size");
2872 
2873   SCEVCheckBlock->setName("vector.scevcheck");
2874   // Create new preheader for vector loop.
2875   LoopVectorPreHeader =
2876       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2877                  nullptr, "vector.ph");
2878 
2879   // Update dominator only if this is first RT check.
2880   if (LoopBypassBlocks.empty()) {
2881     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2882     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2883   }
2884 
2885   ReplaceInstWithInst(
2886       SCEVCheckBlock->getTerminator(),
2887       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2888   LoopBypassBlocks.push_back(SCEVCheckBlock);
2889   AddedSafetyChecks = true;
2890 }
2891 
2892 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2893   // VPlan-native path does not do any analysis for runtime checks currently.
2894   if (EnableVPlanNativePath)
2895     return;
2896 
2897   // Reuse existing vector loop preheader for runtime memory checks.
2898   // Note that new preheader block is generated for vector loop.
2899   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2900 
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
2904   auto *LAI = Legal->getLAI();
2905   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2906   if (!RtPtrChecking.Need)
2907     return;
2908   Instruction *FirstCheckInst;
2909   Instruction *MemRuntimeCheck;
2910   std::tie(FirstCheckInst, MemRuntimeCheck) =
2911       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2912                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2913   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2914                             "claimed checks are required");
2915 
2916   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2917     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2918            "Cannot emit memory checks when optimizing for size, unless forced "
2919            "to vectorize.");
2920     ORE->emit([&]() {
2921       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2922                                         L->getStartLoc(), L->getHeader())
2923              << "Code-size may be reduced by not forcing "
2924                 "vectorization, or by source-code modifications "
2925                 "eliminating the need for runtime checks "
2926                 "(e.g., adding 'restrict').";
2927     });
2928   }
2929 
2930   MemCheckBlock->setName("vector.memcheck");
2931   // Create new preheader for vector loop.
2932   LoopVectorPreHeader =
2933       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2934                  "vector.ph");
2935 
2936   // Update dominator only if this is first RT check.
2937   if (LoopBypassBlocks.empty()) {
2938     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2939     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2940   }
2941 
2942   ReplaceInstWithInst(
2943       MemCheckBlock->getTerminator(),
2944       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2945   LoopBypassBlocks.push_back(MemCheckBlock);
2946   AddedSafetyChecks = true;
2947 
2948   // We currently don't use LoopVersioning for the actual loop cloning but we
2949   // still use it to add the noalias metadata.
2950   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2951                                           PSE.getSE());
2952   LVer->prepareNoAliasMetadata();
2953 }
2954 
2955 Value *InnerLoopVectorizer::emitTransformedIndex(
2956     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2957     const InductionDescriptor &ID) const {
2958 
2959   SCEVExpander Exp(*SE, DL, "induction");
2960   auto Step = ID.getStep();
2961   auto StartValue = ID.getStartValue();
2962   assert(Index->getType() == Step->getType() &&
2963          "Index type does not match StepValue type");
2964 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
2971   auto CreateAdd = [&B](Value *X, Value *Y) {
2972     assert(X->getType() == Y->getType() && "Types don't match!");
2973     if (auto *CX = dyn_cast<ConstantInt>(X))
2974       if (CX->isZero())
2975         return Y;
2976     if (auto *CY = dyn_cast<ConstantInt>(Y))
2977       if (CY->isZero())
2978         return X;
2979     return B.CreateAdd(X, Y);
2980   };
2981 
2982   auto CreateMul = [&B](Value *X, Value *Y) {
2983     assert(X->getType() == Y->getType() && "Types don't match!");
2984     if (auto *CX = dyn_cast<ConstantInt>(X))
2985       if (CX->isOne())
2986         return Y;
2987     if (auto *CY = dyn_cast<ConstantInt>(Y))
2988       if (CY->isOne())
2989         return X;
2990     return B.CreateMul(X, Y);
2991   };
2992 
2993   // Get a suitable insert point for SCEV expansion. For blocks in the vector
2994   // loop, choose the end of the vector loop header (=LoopVectorBody), because
2995   // the DomTree is not kept up-to-date for additional blocks generated in the
2996   // vector loop. By using the header as insertion point, we guarantee that the
2997   // expanded instructions dominate all their uses.
2998   auto GetInsertPoint = [this, &B]() {
2999     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3000     if (InsertBB != LoopVectorBody &&
3001         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3002       return LoopVectorBody->getTerminator();
3003     return &*B.GetInsertPoint();
3004   };
3005   switch (ID.getKind()) {
3006   case InductionDescriptor::IK_IntInduction: {
3007     assert(Index->getType() == StartValue->getType() &&
3008            "Index type does not match StartValue type");
3009     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3010       return B.CreateSub(StartValue, Index);
3011     auto *Offset = CreateMul(
3012         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3013     return CreateAdd(StartValue, Offset);
3014   }
3015   case InductionDescriptor::IK_PtrInduction: {
3016     assert(isa<SCEVConstant>(Step) &&
3017            "Expected constant step for pointer induction");
3018     return B.CreateGEP(
3019         StartValue->getType()->getPointerElementType(), StartValue,
3020         CreateMul(Index,
3021                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3022   }
3023   case InductionDescriptor::IK_FpInduction: {
3024     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3025     auto InductionBinOp = ID.getInductionBinOp();
3026     assert(InductionBinOp &&
3027            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3028             InductionBinOp->getOpcode() == Instruction::FSub) &&
3029            "Original bin op should be defined for FP induction");
3030 
3031     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3032 
3033     // Floating point operations had to be 'fast' to enable the induction.
3034     FastMathFlags Flags;
3035     Flags.setFast();
3036 
3037     Value *MulExp = B.CreateFMul(StepValue, Index);
3038     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
3040       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3041 
3042     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3043                                "induction");
3044     if (isa<Instruction>(BOp))
3045       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3046 
3047     return BOp;
3048   }
3049   case InductionDescriptor::IK_NoInduction:
3050     return nullptr;
3051   }
3052   llvm_unreachable("invalid enum");
3053 }
3054 
3055 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3056   LoopScalarBody = OrigLoop->getHeader();
3057   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3058   LoopExitBlock = OrigLoop->getExitBlock();
3059   assert(LoopExitBlock && "Must have an exit block");
3060   assert(LoopVectorPreHeader && "Invalid loop structure");
3061 
3062   LoopMiddleBlock =
3063       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3064                  LI, nullptr, Twine(Prefix) + "middle.block");
3065   LoopScalarPreHeader =
3066       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3067                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo, since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3071   LoopVectorBody =
3072       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3073                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3074 
3075   // Update dominator for loop exit.
3076   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3077 
3078   // Create and register the new vector loop.
3079   Loop *Lp = LI->AllocateLoop();
3080   Loop *ParentLoop = OrigLoop->getParentLoop();
3081 
3082   // Insert the new loop into the loop nest and register the new basic blocks
3083   // before calling any utilities such as SCEV that require valid LoopInfo.
3084   if (ParentLoop) {
3085     ParentLoop->addChildLoop(Lp);
3086   } else {
3087     LI->addTopLevelLoop(Lp);
3088   }
3089   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3090   return Lp;
3091 }
3092 
3093 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3094                                                       Value *VectorTripCount) {
3095   assert(VectorTripCount && L && "Expected valid arguments");
3096   // We are going to resume the execution of the scalar loop.
3097   // Go over all of the induction variables that we found and fix the
3098   // PHIs that are left in the scalar version of the loop.
3099   // The starting values of PHI nodes depend on the counter of the last
3100   // iteration in the vectorized loop.
3101   // If we come from a bypass edge then we need to start from the original
3102   // start value.
3103   for (auto &InductionEntry : Legal->getInductionVars()) {
3104     PHINode *OrigPhi = InductionEntry.first;
3105     InductionDescriptor II = InductionEntry.second;
3106 
    // Create phi nodes to merge from the backedge-taken check block.
3108     PHINode *BCResumeVal =
3109         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3110                         LoopScalarPreHeader->getTerminator());
3111     // Copy original phi DL over to the new one.
3112     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3113     Value *&EndValue = IVEndValues[OrigPhi];
3114     if (OrigPhi == OldInduction) {
3115       // We know what the end value is.
3116       EndValue = VectorTripCount;
3117     } else {
3118       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3119       Type *StepType = II.getStep()->getType();
3120       Instruction::CastOps CastOp =
3121           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3122       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3123       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3124       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3125       EndValue->setName("ind.end");
3126     }
3127 
3128     // The new PHI merges the original incoming value, in case of a bypass,
3129     // or the value at the end of the vectorized loop.
3130     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3131 
3132     // Fix the scalar body counter (PHI node).
3133     // The old induction's phi node in the scalar body needs the truncated
3134     // value.
3135     for (BasicBlock *BB : LoopBypassBlocks)
3136       BCResumeVal->addIncoming(II.getStartValue(), BB);
3137     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3138   }
3139 }
3140 
3141 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3142                                                       MDNode *OrigLoopID) {
3143   assert(L && "Expected valid loop.");
3144 
3145   // The trip counts should be cached by now.
3146   Value *Count = getOrCreateTripCount(L);
3147   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3148 
3149   // We need the OrigLoop (scalar loop part) latch terminator to help
3150   // produce correct debug info for the middle block BB instructions.
3151   // The legality check stage guarantees that the loop will have a single
3152   // latch.
3153   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3154          "Scalar loop latch terminator isn't a branch");
3155   BranchInst *ScalarLatchBr =
3156       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3157 
3158   // Add a check in the middle block to see if we have completed
3159   // all of the iterations in the first vector loop.
3160   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3161   // If tail is to be folded, we know we don't need to run the remainder.
3162   Value *CmpN = Builder.getTrue();
3163   if (!Cost->foldTailByMasking()) {
3164     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3165                            VectorTripCount, "cmp.n",
3166                            LoopMiddleBlock->getTerminator());
3167 
3168     // Here we use the same DebugLoc as the scalar loop latch branch instead
3169     // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3172     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3173   }
3174 
3175   BranchInst *BrInst =
3176       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3177   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3178   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3179 
3180   // Get ready to start creating new instructions into the vectorized body.
3181   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3182          "Inconsistent vector loop preheader");
3183   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3184 
3185   Optional<MDNode *> VectorizedLoopID =
3186       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3187                                       LLVMLoopVectorizeFollowupVectorized});
3188   if (VectorizedLoopID.hasValue()) {
3189     L->setLoopID(VectorizedLoopID.getValue());
3190 
3191     // Do not setAlreadyVectorized if loop attributes have been defined
3192     // explicitly.
3193     return LoopVectorPreHeader;
3194   }
3195 
3196   // Keep all loop hints from the original loop on the vector loop (we'll
3197   // replace the vectorizer-specific hints below).
3198   if (MDNode *LID = OrigLoop->getLoopID())
3199     L->setLoopID(LID);
3200 
3201   LoopVectorizeHints Hints(L, true, *ORE);
3202   Hints.setAlreadyVectorized();
3203 
3204 #ifdef EXPENSIVE_CHECKS
3205   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3206   LI->verify(*DT);
3207 #endif
3208 
3209   return LoopVectorPreHeader;
3210 }
3211 
3212 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3213   /*
3214    In this function we generate a new loop. The new loop will contain
3215    the vectorized instructions while the old loop will continue to run the
3216    scalar remainder.
3217 
3218        [ ] <-- loop iteration number check.
3219     /   |
3220    /    v
3221   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3222   |  /  |
3223   | /   v
3224   ||   [ ]     <-- vector pre header.
3225   |/    |
3226   |     v
3227   |    [  ] \
3228   |    [  ]_|   <-- vector loop.
3229   |     |
3230   |     v
3231   |   -[ ]   <--- middle-block.
3232   |  /  |
3233   | /   v
3234   -|- >[ ]     <--- new preheader.
3235    |    |
3236    |    v
3237    |   [ ] \
3238    |   [ ]_|   <-- old scalar loop to handle remainder.
3239     \   |
3240      \  v
3241       >[ ]     <-- exit block.
3242    ...
3243    */
3244 
3245   // Get the metadata of the original loop before it gets modified.
3246   MDNode *OrigLoopID = OrigLoop->getLoopID();
3247 
3248   // Create an empty vector loop, and prepare basic blocks for the runtime
3249   // checks.
3250   Loop *Lp = createVectorLoopSkeleton("");
3251 
3252   // Now, compare the new count to zero. If it is zero skip the vector loop and
3253   // jump to the scalar loop. This check also covers the case where the
3254   // backedge-taken count is uint##_max: adding one to it will overflow leading
3255   // to an incorrect trip count of zero. In this (rare) case we will also jump
3256   // to the scalar loop.
3257   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3258 
3259   // Generate the code to check any assumptions that we've made for SCEV
3260   // expressions.
3261   emitSCEVChecks(Lp, LoopScalarPreHeader);
3262 
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
3266   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3267 
3268   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
3270   // induction variables. In the code below we also support a case where we
3271   // don't have a single induction variable.
3272   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3275   //   - is an integer
3276   //   - counts from zero, stepping by one
3277   //   - is the size of the widest induction variable type
3278   // then we create a new one.
3279   OldInduction = Legal->getPrimaryInduction();
3280   Type *IdxTy = Legal->getWidestInductionType();
3281   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3282   // The loop step is equal to the vectorization factor (num of SIMD elements)
3283   // times the unroll factor (num of SIMD instructions).
3284   assert(!VF.Scalable && "scalable vectors not yet supported.");
3285   Constant *Step = ConstantInt::get(IdxTy, VF.Min * UF);
3286   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3287   Induction =
3288       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3289                               getDebugLocFromInstOrOperands(OldInduction));
3290 
3291   // Emit phis for the new starting index of the scalar loop.
3292   createInductionResumeValues(Lp, CountRoundDown);
3293 
3294   return completeLoopSkeleton(Lp, OrigLoopID);
3295 }
3296 
3297 // Fix up external users of the induction variable. At this point, we are
3298 // in LCSSA form, with all external PHIs that use the IV having one input value,
3299 // coming from the remainder loop. We need those PHIs to also have a correct
3300 // value for the IV when arriving directly from the middle block.
3301 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3302                                        const InductionDescriptor &II,
3303                                        Value *CountRoundDown, Value *EndValue,
3304                                        BasicBlock *MiddleBlock) {
3305   // There are two kinds of external IV usages - those that use the value
3306   // computed in the last iteration (the PHI) and those that use the penultimate
3307   // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.
3309 
3310   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3311 
3312   DenseMap<Value *, Value *> MissingVals;
3313 
3314   // An external user of the last iteration's value should see the value that
3315   // the remainder loop uses to initialize its own IV.
3316   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3317   for (User *U : PostInc->users()) {
3318     Instruction *UI = cast<Instruction>(U);
3319     if (!OrigLoop->contains(UI)) {
3320       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3321       MissingVals[UI] = EndValue;
3322     }
3323   }
3324 
  // An external user of the penultimate value needs to see EndValue - Step.
3326   // The simplest way to get this is to recompute it from the constituent SCEVs,
3327   // that is Start + (Step * (CRD - 1)).
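  // For example (hypothetical IV starting at 0 with step 1 and
  // CountRoundDown == 8), the escaping penultimate value is
  // 0 + 1 * (8 - 1) = 7.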
3328   for (User *U : OrigPhi->users()) {
3329     auto *UI = cast<Instruction>(U);
3330     if (!OrigLoop->contains(UI)) {
3331       const DataLayout &DL =
3332           OrigLoop->getHeader()->getModule()->getDataLayout();
3333       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3334 
3335       IRBuilder<> B(MiddleBlock->getTerminator());
3336       Value *CountMinusOne = B.CreateSub(
3337           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3338       Value *CMO =
3339           !II.getStep()->getType()->isIntegerTy()
3340               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3341                              II.getStep()->getType())
3342               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3343       CMO->setName("cast.cmo");
3344       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3345       Escape->setName("ind.escape");
3346       MissingVals[UI] = Escape;
3347     }
3348   }
3349 
3350   for (auto &I : MissingVals) {
3351     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3353     // that is %IV2 = phi [...], [ %IV1, %latch ]
3354     // In this case, if IV1 has an external use, we need to avoid adding both
3355     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3356     // don't already have an incoming value for the middle block.
3357     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3358       PHI->addIncoming(I.second, MiddleBlock);
3359   }
3360 }
3361 
3362 namespace {
3363 
3364 struct CSEDenseMapInfo {
3365   static bool canHandle(const Instruction *I) {
3366     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3367            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3368   }
3369 
3370   static inline Instruction *getEmptyKey() {
3371     return DenseMapInfo<Instruction *>::getEmptyKey();
3372   }
3373 
3374   static inline Instruction *getTombstoneKey() {
3375     return DenseMapInfo<Instruction *>::getTombstoneKey();
3376   }
3377 
3378   static unsigned getHashValue(const Instruction *I) {
3379     assert(canHandle(I) && "Unknown instruction!");
3380     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3381                                                            I->value_op_end()));
3382   }
3383 
3384   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3385     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3386         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3387       return LHS == RHS;
3388     return LHS->isIdenticalTo(RHS);
3389   }
3390 };
3391 
3392 } // end anonymous namespace
3393 
/// Perform CSE of induction variable instructions.
3395 static void cse(BasicBlock *BB) {
3396   // Perform simple cse.
3397   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3398   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3399     Instruction *In = &*I++;
3400 
3401     if (!CSEDenseMapInfo::canHandle(In))
3402       continue;
3403 
3404     // Check if we can replace this instruction with any of the
3405     // visited instructions.
3406     if (Instruction *V = CSEMap.lookup(In)) {
3407       In->replaceAllUsesWith(V);
3408       In->eraseFromParent();
3409       continue;
3410     }
3411 
3412     CSEMap[In] = In;
3413   }
3414 }
3415 
3416 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3417                                                        ElementCount VF,
3418                                                        bool &NeedToScalarize) {
3419   assert(!VF.Scalable && "scalable vectors not yet supported.");
3420   Function *F = CI->getCalledFunction();
3421   Type *ScalarRetTy = CI->getType();
3422   SmallVector<Type *, 4> Tys, ScalarTys;
3423   for (auto &ArgOp : CI->arg_operands())
3424     ScalarTys.push_back(ArgOp->getType());
3425 
3426   // Estimate cost of scalarized vector call. The source operands are assumed
3427   // to be vectors, so we need to extract individual elements from there,
3428   // execute VF scalar calls, and then gather the result into the vector return
3429   // value.
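  // For illustration (hypothetical numbers: VF.Min == 4, a scalar call cost
  // of 10 and a scalarization overhead of 6), the scalarized estimate
  // computed below is 4 * 10 + 6 = 46.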
3430   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3431                                                  TTI::TCK_RecipThroughput);
3432   if (VF.isScalar())
3433     return ScalarCallCost;
3434 
3435   // Compute corresponding vector type for return value and arguments.
3436   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3437   for (Type *ScalarTy : ScalarTys)
3438     Tys.push_back(ToVectorTy(ScalarTy, VF));
3439 
3440   // Compute costs of unpacking argument values for the scalar calls and
3441   // packing the return values to a vector.
3442   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3443 
3444   unsigned Cost = ScalarCallCost * VF.Min + ScalarizationCost;
3445 
3446   // If we can't emit a vector call for this function, then the currently found
3447   // cost is the cost we need to return.
3448   NeedToScalarize = true;
3449   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3450   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3451 
3452   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3453     return Cost;
3454 
3455   // If the corresponding vector cost is cheaper, return its cost.
3456   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3457                                                  TTI::TCK_RecipThroughput);
3458   if (VectorCallCost < Cost) {
3459     NeedToScalarize = false;
3460     return VectorCallCost;
3461   }
3462   return Cost;
3463 }
3464 
3465 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3466                                                             ElementCount VF) {
3467   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3468   assert(ID && "Expected intrinsic call!");
3469 
3470   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3471   return TTI.getIntrinsicInstrCost(CostAttrs,
3472                                    TargetTransformInfo::TCK_RecipThroughput);
3473 }
3474 
3475 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3476   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3477   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3478   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3479 }
3480 
3481 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3482   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3483   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3484   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3485 }
3486 
3487 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and re-extend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
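  // As a sketch (assuming an i32 add whose result only needs 8 bits and
  // VF == 4): the operands are truncated to <4 x i8>, the add is re-created
  // on <4 x i8>, and the result is zero-extended back to <4 x i32> so that
  // existing users keep seeing the original type.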
3491   SmallPtrSet<Value *, 4> Erased;
3492   for (const auto &KV : Cost->getMinimalBitwidths()) {
3493     // If the value wasn't vectorized, we must maintain the original scalar
3494     // type. The absence of the value from VectorLoopValueMap indicates that it
3495     // wasn't vectorized.
3496     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3497       continue;
3498     for (unsigned Part = 0; Part < UF; ++Part) {
3499       Value *I = getOrCreateVectorValue(KV.first, Part);
3500       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3501         continue;
3502       Type *OriginalTy = I->getType();
3503       Type *ScalarTruncatedTy =
3504           IntegerType::get(OriginalTy->getContext(), KV.second);
3505       auto *TruncatedTy = FixedVectorType::get(
3506           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
3507       if (TruncatedTy == OriginalTy)
3508         continue;
3509 
3510       IRBuilder<> B(cast<Instruction>(I));
3511       auto ShrinkOperand = [&](Value *V) -> Value * {
3512         if (auto *ZI = dyn_cast<ZExtInst>(V))
3513           if (ZI->getSrcTy() == TruncatedTy)
3514             return ZI->getOperand(0);
3515         return B.CreateZExtOrTrunc(V, TruncatedTy);
3516       };
3517 
3518       // The actual instruction modification depends on the instruction type,
3519       // unfortunately.
3520       Value *NewI = nullptr;
3521       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3522         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3523                              ShrinkOperand(BO->getOperand(1)));
3524 
3525         // Any wrapping introduced by shrinking this operation shouldn't be
3526         // considered undefined behavior. So, we can't unconditionally copy
3527         // arithmetic wrapping flags to NewI.
3528         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3529       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3530         NewI =
3531             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3532                          ShrinkOperand(CI->getOperand(1)));
3533       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3534         NewI = B.CreateSelect(SI->getCondition(),
3535                               ShrinkOperand(SI->getTrueValue()),
3536                               ShrinkOperand(SI->getFalseValue()));
3537       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3538         switch (CI->getOpcode()) {
3539         default:
3540           llvm_unreachable("Unhandled cast!");
3541         case Instruction::Trunc:
3542           NewI = ShrinkOperand(CI->getOperand(0));
3543           break;
3544         case Instruction::SExt:
3545           NewI = B.CreateSExtOrTrunc(
3546               CI->getOperand(0),
3547               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3548           break;
3549         case Instruction::ZExt:
3550           NewI = B.CreateZExtOrTrunc(
3551               CI->getOperand(0),
3552               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3553           break;
3554         }
3555       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3556         auto Elements0 =
3557             cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
3558         auto *O0 = B.CreateZExtOrTrunc(
3559             SI->getOperand(0),
3560             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3561         auto Elements1 =
3562             cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
3563         auto *O1 = B.CreateZExtOrTrunc(
3564             SI->getOperand(1),
3565             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3566 
3567         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3568       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3569         // Don't do anything with the operands, just extend the result.
3570         continue;
3571       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3572         auto Elements =
3573             cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
3574         auto *O0 = B.CreateZExtOrTrunc(
3575             IE->getOperand(0),
3576             FixedVectorType::get(ScalarTruncatedTy, Elements));
3577         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3578         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3579       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3580         auto Elements =
3581             cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
3582         auto *O0 = B.CreateZExtOrTrunc(
3583             EE->getOperand(0),
3584             FixedVectorType::get(ScalarTruncatedTy, Elements));
3585         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3586       } else {
3587         // If we don't know what to do, be conservative and don't do anything.
3588         continue;
3589       }
3590 
3591       // Lastly, extend the result.
3592       NewI->takeName(cast<Instruction>(I));
3593       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3594       I->replaceAllUsesWith(Res);
3595       cast<Instruction>(I)->eraseFromParent();
3596       Erased.insert(I);
3597       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3598     }
3599   }
3600 
3601   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3602   for (const auto &KV : Cost->getMinimalBitwidths()) {
3603     // If the value wasn't vectorized, we must maintain the original scalar
3604     // type. The absence of the value from VectorLoopValueMap indicates that it
3605     // wasn't vectorized.
3606     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3607       continue;
3608     for (unsigned Part = 0; Part < UF; ++Part) {
3609       Value *I = getOrCreateVectorValue(KV.first, Part);
3610       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3611       if (Inst && Inst->use_empty()) {
3612         Value *NewI = Inst->getOperand(0);
3613         Inst->eraseFromParent();
3614         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3615       }
3616     }
3617   }
3618 }
3619 
3620 void InnerLoopVectorizer::fixVectorizedLoop() {
3621   // Insert truncates and extends for any truncated instructions as hints to
3622   // InstCombine.
3623   if (VF.isVector())
3624     truncateToMinimalBitwidths();
3625 
3626   // Fix widened non-induction PHIs by setting up the PHI operands.
3627   if (OrigPHIsToFix.size()) {
3628     assert(EnableVPlanNativePath &&
3629            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3630     fixNonInductionPHIs();
3631   }
3632 
3633   // At this point every instruction in the original loop is widened to a
3634   // vector form. Now we need to fix the recurrences in the loop. These PHI
3635   // nodes are currently empty because we did not want to introduce cycles.
3636   // This is the second stage of vectorizing recurrences.
3637   fixCrossIterationPHIs();
3638 
3639   // Forget the original basic block.
3640   PSE.getSE()->forgetLoop(OrigLoop);
3641 
3642   // Fix-up external users of the induction variables.
3643   for (auto &Entry : Legal->getInductionVars())
3644     fixupIVUsers(Entry.first, Entry.second,
3645                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3646                  IVEndValues[Entry.first], LoopMiddleBlock);
3647 
3648   fixLCSSAPHIs();
3649   for (Instruction *PI : PredicatedInstructions)
3650     sinkScalarOperands(&*PI);
3651 
3652   // Remove redundant induction instructions.
3653   cse(LoopVectorBody);
3654 
3655   // Set/update profile weights for the vector and remainder loops as original
3656   // loop iterations are now distributed among them. Note that original loop
3657   // represented by LoopScalarBody becomes remainder loop after vectorization.
3658   //
3659   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3660   // end up with a slightly less precise result, but that should be OK since
3661   // the profile is not inherently precise anyway. Note also that any possible
3662   // bypass of the vector code caused by legality checks is ignored,
3663   // optimistically assigning all the weight to the vector loop.
3664   assert(!VF.Scalable &&
3665          "cannot use scalable ElementCount to determine unroll factor");
3666   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
3667                                LI->getLoopFor(LoopVectorBody),
3668                                LI->getLoopFor(LoopScalarBody), VF.Min * UF);
3669 }
3670 
3671 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3672   // In order to support recurrences we need to be able to vectorize Phi nodes.
3673   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3674   // stage #2: We now need to fix the recurrences by adding incoming edges to
3675   // the currently empty PHI nodes. At this point every instruction in the
3676   // original loop is widened to a vector form so we can use them to construct
3677   // the incoming edges.
3678   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3679     // Handle first-order recurrences and reductions that need to be fixed.
3680     if (Legal->isFirstOrderRecurrence(&Phi))
3681       fixFirstOrderRecurrence(&Phi);
3682     else if (Legal->isReductionVariable(&Phi))
3683       fixReduction(&Phi);
3684   }
3685 }
3686 
3687 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3688   // This is the second phase of vectorizing first-order recurrences. An
3689   // overview of the transformation is described below. Suppose we have the
3690   // following loop.
3691   //
3692   //   for (int i = 0; i < n; ++i)
3693   //     b[i] = a[i] - a[i - 1];
3694   //
3695   // There is a first-order recurrence on "a". For this loop, the shorthand
3696   // scalar IR looks like:
3697   //
3698   //   scalar.ph:
3699   //     s_init = a[-1]
3700   //     br scalar.body
3701   //
3702   //   scalar.body:
3703   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3704   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3705   //     s2 = a[i]
3706   //     b[i] = s2 - s1
3707   //     br cond, scalar.body, ...
3708   //
3709   // In this example, s1 is a recurrence because its value depends on the
3710   // previous iteration. In the first phase of vectorization, we created a
3711   // temporary value for s1. We now complete the vectorization and produce the
3712   // shorthand vector IR shown below (for VF = 4, UF = 1).
3713   //
3714   //   vector.ph:
3715   //     v_init = vector(..., ..., ..., a[-1])
3716   //     br vector.body
3717   //
3718   //   vector.body
3719   //     i = phi [0, vector.ph], [i+4, vector.body]
3720   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3721   //     v2 = a[i, i+1, i+2, i+3];
3722   //     v3 = vector(v1(3), v2(0, 1, 2))
3723   //     b[i, i+1, i+2, i+3] = v2 - v3
3724   //     br cond, vector.body, middle.block
3725   //
3726   //   middle.block:
3727   //     x = v2(3)
3728   //     br scalar.ph
3729   //
3730   //   scalar.ph:
3731   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3732   //     br scalar.body
3733   //
3734   // After the vector loop finishes executing, we extract the next value of
3735   // the recurrence (x) to use as the initial value in the scalar loop.
3736 
3737   // Get the original loop preheader and single loop latch.
3738   auto *Preheader = OrigLoop->getLoopPreheader();
3739   auto *Latch = OrigLoop->getLoopLatch();
3740 
3741   // Get the initial and previous values of the scalar recurrence.
3742   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3743   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3744 
3745   // Create a vector from the initial value.
3746   auto *VectorInit = ScalarInit;
3747   if (VF.isVector()) {
3748     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3749     assert(!VF.Scalable && "VF is assumed to be non scalable.");
3750     VectorInit = Builder.CreateInsertElement(
3751         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3752         Builder.getInt32(VF.Min - 1), "vector.recur.init");
3753   }
3754 
3755   // We constructed a temporary phi node in the first phase of vectorization.
3756   // This phi node will eventually be deleted.
3757   Builder.SetInsertPoint(
3758       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3759 
3760   // Create a phi node for the new recurrence. The current value will either be
3761   // the initial value inserted into a vector or loop-varying vector value.
3762   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3763   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3764 
3765   // Get the vectorized previous value of the last part UF - 1. It appears last
3766   // among all unrolled iterations, due to the order of their construction.
3767   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3768 
3769   // Find and set the insertion point after the previous value if it is an
3770   // instruction.
3771   BasicBlock::iterator InsertPt;
3772   // Note that the previous value may have been constant-folded so it is not
3773   // guaranteed to be an instruction in the vector loop.
3774   // FIXME: Loop invariant values do not form recurrences. We should deal with
3775   //        them earlier.
3776   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3777     InsertPt = LoopVectorBody->getFirstInsertionPt();
3778   else {
3779     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3780     if (isa<PHINode>(PreviousLastPart))
3781       // If the previous value is a phi node, we should insert after all the phi
3782       // nodes in the block containing the PHI to avoid breaking basic block
3783       // verification. Note that the basic block may be different to
3784       // LoopVectorBody, in case we predicate the loop.
3785       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3786     else
3787       InsertPt = ++PreviousInst->getIterator();
3788   }
3789   Builder.SetInsertPoint(&*InsertPt);
3790 
3791   // We will construct a vector for the recurrence by combining the values for
3792   // the current and previous iterations. This is the required shuffle mask.
3793   assert(!VF.Scalable);
3794   SmallVector<int, 8> ShuffleMask(VF.Min);
3795   ShuffleMask[0] = VF.Min - 1;
3796   for (unsigned I = 1; I < VF.Min; ++I)
3797     ShuffleMask[I] = I + VF.Min - 1;
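  // For example, with VF = 4 the mask is <3, 4, 5, 6>: lane 0 takes the last
  // element of the first shuffle operand (the recurrence phi, v1 above) and
  // lanes 1-3 take the first three elements of the second operand (the
  // current value v2), producing v3 = vector(v1(3), v2(0, 1, 2)).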
3798 
3799   // The vector from which to take the initial value for the current iteration
3800   // (actual or unrolled). Initially, this is the vector phi node.
3801   Value *Incoming = VecPhi;
3802 
3803   // Shuffle the current and previous vector and update the vector parts.
3804   for (unsigned Part = 0; Part < UF; ++Part) {
3805     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3806     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3807     auto *Shuffle =
3808         VF.isVector()
3809             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
3810             : Incoming;
3811     PhiPart->replaceAllUsesWith(Shuffle);
3812     cast<Instruction>(PhiPart)->eraseFromParent();
3813     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3814     Incoming = PreviousPart;
3815   }
3816 
3817   // Fix the latch value of the new recurrence in the vector loop.
3818   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3819 
3820   // Extract the last vector element in the middle block. This will be the
3821   // initial value for the recurrence when jumping to the scalar loop.
3822   auto *ExtractForScalar = Incoming;
3823   if (VF.isVector()) {
3824     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3825     ExtractForScalar = Builder.CreateExtractElement(
3826         ExtractForScalar, Builder.getInt32(VF.Min - 1), "vector.recur.extract");
3827   }
3828   // Extract the second last element in the middle block if the
3829   // Phi is used outside the loop. We need to extract the phi itself
3830   // and not the last element (the phi update in the current iteration). This
3831   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3832   // when the scalar loop is not run at all.
3833   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3834   if (VF.isVector())
3835     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3836         Incoming, Builder.getInt32(VF.Min - 2), "vector.recur.extract.for.phi");
3837   // When the loop is unrolled without being vectorized, initialize
3838   // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3839   // `Incoming`. This is analogous to the vectorized case above: extracting the
3840   // second-to-last element when VF > 1.
3841   else if (UF > 1)
3842     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
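  // For example, with VF = 4 the two extracts above take lanes 3 and 2 of the
  // final vector value, i.e. the updated recurrence and the value the phi
  // itself held in the last vector iteration, respectively.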
3843 
3844   // Fix the initial value of the original recurrence in the scalar loop.
3845   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3846   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3847   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3848     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3849     Start->addIncoming(Incoming, BB);
3850   }
3851 
3852   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3853   Phi->setName("scalar.recur");
3854 
3855   // Finally, fix users of the recurrence outside the loop. The users will need
3856   // either the last value of the scalar recurrence or the last value of the
3857   // vector recurrence we extracted in the middle block. Since the loop is in
3858   // LCSSA form, we just need to find all the phi nodes for the original scalar
3859   // recurrence in the exit block, and then add an edge for the middle block.
3860   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3861     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3862       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3863     }
3864   }
3865 }
3866 
3867 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3868   Constant *Zero = Builder.getInt32(0);
3869 
3870   // Get its reduction variable descriptor.
3871   assert(Legal->isReductionVariable(Phi) &&
3872          "Unable to find the reduction variable");
3873   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3874 
3875   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3876   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3877   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3878   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3879     RdxDesc.getMinMaxRecurrenceKind();
3880   setDebugLocFromInst(Builder, ReductionStartValue);
3881   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3882 
3883   // We need to generate a reduction vector from the incoming scalar.
3884   // To do so, we need to generate the 'identity' vector and override
3885   // one of the elements with the incoming scalar reduction. We need
3886   // to do it in the vector-loop preheader.
3887   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3888 
3889   // This is the vector-clone of the value that leaves the loop.
3890   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3891 
3892   // Find the reduction identity value: zero for addition, or and xor;
3893   // one for multiplication; -1 (all ones) for and.
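  // For example (illustrative shorthand), for an integer add reduction with
  // scalar start value %init and VF = 4, the identity vector is <0, 0, 0, 0>
  // and the start vector is <%init, 0, 0, 0>. For min/max reductions the
  // start value itself acts as the identity and is splatted instead.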
3894   Value *Identity;
3895   Value *VectorStart;
3896   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3897       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3898     // MinMax reductions have the start value as their identity.
3899     if (VF == 1 || IsInLoopReductionPhi) {
3900       VectorStart = Identity = ReductionStartValue;
3901     } else {
3902       VectorStart = Identity =
3903         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3904     }
3905   } else {
3906     // Handle other reduction kinds:
3907     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3908         RK, VecTy->getScalarType());
3909     if (VF == 1 || IsInLoopReductionPhi) {
3910       Identity = Iden;
3911       // This vector is the Identity vector where the first element is the
3912       // incoming scalar reduction.
3913       VectorStart = ReductionStartValue;
3914     } else {
3915       Identity = ConstantVector::getSplat(VF, Iden);
3916 
3917       // This vector is the Identity vector where the first element is the
3918       // incoming scalar reduction.
3919       VectorStart =
3920         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3921     }
3922   }
3923 
3924   // Wrap flags are in general invalid after vectorization, clear them.
3925   clearReductionWrapFlags(RdxDesc);
3926 
3927   // Fix the vector-loop phi.
3928 
3929   // Reductions do not have to start at zero. They can start with
3930   // any loop invariant values.
3931   BasicBlock *Latch = OrigLoop->getLoopLatch();
3932   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3933 
3934   for (unsigned Part = 0; Part < UF; ++Part) {
3935     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3936     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3937     // Make sure to add the reduction start value only to the
3938     // first unroll part.
3939     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3940     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3941     cast<PHINode>(VecRdxPhi)
3942       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3943   }
3944 
3945   // Before each round, move the insertion point right between
3946   // the PHIs and the values we are going to write.
3947   // This allows us to write both PHINodes and the extractelement
3948   // instructions.
3949   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3950 
3951   setDebugLocFromInst(Builder, LoopExitInst);
3952 
3953   // If tail is folded by masking, the vector value to leave the loop should be
3954   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3955   // instead of the former.
3956   if (Cost->foldTailByMasking()) {
3957     for (unsigned Part = 0; Part < UF; ++Part) {
3958       Value *VecLoopExitInst =
3959           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3960       Value *Sel = nullptr;
3961       for (User *U : VecLoopExitInst->users()) {
3962         if (isa<SelectInst>(U)) {
3963           assert(!Sel && "Reduction exit feeding two selects");
3964           Sel = U;
3965         } else
3966           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3967       }
3968       assert(Sel && "Reduction exit feeds no select");
3969       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3970 
3971       // If the target can create a predicated operator for the reduction at no
3972       // extra cost in the loop (for example a predicated vadd), it can be
3973       // cheaper for the select to remain in the loop than be sunk out of it,
3974       // and so use the select value for the phi instead of the old
3975       // LoopExitValue.
3976       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3977       if (PreferPredicatedReductionSelect ||
3978           TTI->preferPredicatedReductionSelect(
3979               RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()),
3980               Phi->getType(), TargetTransformInfo::ReductionFlags())) {
3981         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
3982         VecRdxPhi->setIncomingValueForBlock(
3983             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
3984       }
3985     }
3986   }
3987 
3988   // If the vector reduction can be performed in a smaller type, we truncate
3989   // then extend the loop exit value to enable InstCombine to evaluate the
3990   // entire expression in the smaller type.
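  // For example (illustrative shorthand), if an i32 reduction can be done in
  // i8 with VF = 4, each loop-exit part %rdx is rewritten as
  //   %trunc = trunc <4 x i32> %rdx to <4 x i8>
  //   %extnd = sext/zext <4 x i8> %trunc to <4 x i32>
  // with users of %rdx (other than the trunc) redirected to %extnd, and the
  // parts truncated back to <4 x i8> in the middle block before reduction.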
3991   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
3992     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
3993     assert(!VF.Scalable && "scalable vectors not yet supported.");
3994     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3995     Builder.SetInsertPoint(
3996         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3997     VectorParts RdxParts(UF);
3998     for (unsigned Part = 0; Part < UF; ++Part) {
3999       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4000       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4001       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4002                                         : Builder.CreateZExt(Trunc, VecTy);
4003       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4004            UI != RdxParts[Part]->user_end();)
4005         if (*UI != Trunc) {
4006           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4007           RdxParts[Part] = Extnd;
4008         } else {
4009           ++UI;
4010         }
4011     }
4012     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4013     for (unsigned Part = 0; Part < UF; ++Part) {
4014       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4015       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4016     }
4017   }
4018 
4019   // Reduce all of the unrolled parts into a single vector.
4020   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4021   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
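  // For example (illustrative), with VF = 4, UF = 2 and an i32 add reduction
  // the loop below emits a single
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  // leaving one vector value to be reduced to a scalar further down.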
4022 
4023   // The middle block terminator has already been assigned a DebugLoc here (the
4024   // OrigLoop's single latch terminator). We want the whole middle block to
4025   // appear to execute on this line because: (a) it is all compiler generated,
4026   // (b) these instructions are always executed after evaluating the latch
4027   // conditional branch, and (c) other passes may add new predecessors which
4028   // terminate on this line. This is the easiest way to ensure we don't
4029   // accidentally cause an extra step back into the loop while debugging.
4030   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4031   for (unsigned Part = 1; Part < UF; ++Part) {
4032     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4033     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4034       // Floating point operations had to be 'fast' to enable the reduction.
4035       ReducedPartRdx = addFastMathFlag(
4036           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4037                               ReducedPartRdx, "bin.rdx"),
4038           RdxDesc.getFastMathFlags());
4039     else
4040       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4041                                       RdxPart);
4042   }
4043 
4044   // Create the reduction after the loop. Note that inloop reductions create the
4045   // target reduction in the loop using a Reduction recipe.
4046   if (VF.isVector() && !IsInLoopReductionPhi) {
4047     bool NoNaN = Legal->hasFunNoNaNAttr();
4048     ReducedPartRdx =
4049         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4050     // If the reduction can be performed in a smaller type, we need to extend
4051     // the reduction to the wider type before we branch to the original loop.
4052     if (Phi->getType() != RdxDesc.getRecurrenceType())
4053       ReducedPartRdx =
4054         RdxDesc.isSigned()
4055         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4056         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4057   }
4058 
4059   // Create a phi node that merges control-flow from the backedge-taken check
4060   // block and the middle block.
4061   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4062                                         LoopScalarPreHeader->getTerminator());
4063   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4064     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4065   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4066 
4067   // Now, we need to fix the users of the reduction variable
4068   // inside and outside of the scalar remainder loop.
4069   // We know that the loop is in LCSSA form. We need to update the
4070   // PHI nodes in the exit blocks.
4071   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4072     // All PHINodes need to have a single entry edge, or two if
4073     // we already fixed them.
4074     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4075 
4076     // We found a reduction value exit-PHI. Update it with the
4077     // incoming bypass edge.
4078     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4079       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4080   } // end of the LCSSA phi scan.
4081 
4082   // Fix the scalar loop reduction variable with the incoming reduction sum
4083   // from the vector body and from the backedge value.
4084   int IncomingEdgeBlockIdx =
4085     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4086   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4087   // Pick the other block.
4088   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4089   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4090   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4091 }
4092 
4093 void InnerLoopVectorizer::clearReductionWrapFlags(
4094     RecurrenceDescriptor &RdxDesc) {
4095   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4096   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4097       RK != RecurrenceDescriptor::RK_IntegerMult)
4098     return;
4099 
4100   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4101   assert(LoopExitInstr && "null loop exit instruction");
4102   SmallVector<Instruction *, 8> Worklist;
4103   SmallPtrSet<Instruction *, 8> Visited;
4104   Worklist.push_back(LoopExitInstr);
4105   Visited.insert(LoopExitInstr);
4106 
4107   while (!Worklist.empty()) {
4108     Instruction *Cur = Worklist.pop_back_val();
4109     if (isa<OverflowingBinaryOperator>(Cur))
4110       for (unsigned Part = 0; Part < UF; ++Part) {
4111         Value *V = getOrCreateVectorValue(Cur, Part);
4112         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4113       }
4114 
4115     for (User *U : Cur->users()) {
4116       Instruction *UI = cast<Instruction>(U);
4117       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4118           Visited.insert(UI).second)
4119         Worklist.push_back(UI);
4120     }
4121   }
4122 }
4123 
4124 void InnerLoopVectorizer::fixLCSSAPHIs() {
4125   assert(!VF.Scalable && "the code below assumes fixed width vectors");
4126   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4127     if (LCSSAPhi.getNumIncomingValues() == 1) {
4128       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4129       // Non-instruction incoming values will have only one value.
4130       unsigned LastLane = 0;
4131       if (isa<Instruction>(IncomingValue))
4132         LastLane = Cost->isUniformAfterVectorization(
4133                        cast<Instruction>(IncomingValue), VF)
4134                        ? 0
4135                        : VF.Min - 1;
4136       // Can be a loop invariant incoming value or the last scalar value to be
4137       // extracted from the vectorized loop.
4138       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4139       Value *lastIncomingValue =
4140           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4141       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4142     }
4143   }
4144 }
4145 
4146 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4147   // The basic block and loop containing the predicated instruction.
4148   auto *PredBB = PredInst->getParent();
4149   auto *VectorLoop = LI->getLoopFor(PredBB);
4150 
4151   // Initialize a worklist with the operands of the predicated instruction.
4152   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4153 
4154   // Holds instructions that we need to analyze again. An instruction may be
4155   // reanalyzed if we don't yet know if we can sink it or not.
4156   SmallVector<Instruction *, 8> InstsToReanalyze;
4157 
4158   // Returns true if a given use occurs in the predicated block. Phi nodes use
4159   // their operands in their corresponding predecessor blocks.
4160   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4161     auto *I = cast<Instruction>(U.getUser());
4162     BasicBlock *BB = I->getParent();
4163     if (auto *Phi = dyn_cast<PHINode>(I))
4164       BB = Phi->getIncomingBlock(
4165           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4166     return BB == PredBB;
4167   };
4168 
4169   // Iteratively sink the scalarized operands of the predicated instruction
4170   // into the block we created for it. When an instruction is sunk, its
4171   // operands are then added to the worklist. The algorithm ends after one pass
4172   // through the worklist doesn't sink a single instruction.
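  // For example (names are hypothetical), if the predicated block contains
  // the scalarized
  //   %d = sdiv i32 %x, %y
  // and %x is an add elsewhere in the vector loop whose only use is %d, the
  // add is moved into the predicated block and its own operands are queued
  // for the same analysis on the next pass.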
4173   bool Changed;
4174   do {
4175     // Add the instructions that need to be reanalyzed to the worklist, and
4176     // reset the changed indicator.
4177     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4178     InstsToReanalyze.clear();
4179     Changed = false;
4180 
4181     while (!Worklist.empty()) {
4182       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4183 
4184       // We can't sink an instruction if it is a phi node, is already in the
4185       // predicated block, is not in the loop, or may have side effects.
4186       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4187           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4188         continue;
4189 
4190       // It's legal to sink the instruction if all its uses occur in the
4191       // predicated block. Otherwise, there's nothing to do yet, and we may
4192       // need to reanalyze the instruction.
4193       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4194         InstsToReanalyze.push_back(I);
4195         continue;
4196       }
4197 
4198       // Move the instruction to the beginning of the predicated block, and add
4199       // its operands to the worklist.
4200       I->moveBefore(&*PredBB->getFirstInsertionPt());
4201       Worklist.insert(I->op_begin(), I->op_end());
4202 
4203       // The sinking may have enabled other instructions to be sunk, so we will
4204       // need to iterate.
4205       Changed = true;
4206     }
4207   } while (Changed);
4208 }
4209 
4210 void InnerLoopVectorizer::fixNonInductionPHIs() {
4211   for (PHINode *OrigPhi : OrigPHIsToFix) {
4212     PHINode *NewPhi =
4213         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4214     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4215 
4216     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4217         predecessors(OrigPhi->getParent()));
4218     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4219         predecessors(NewPhi->getParent()));
4220     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4221            "Scalar and Vector BB should have the same number of predecessors");
4222 
4223     // The insertion point in Builder may be invalidated by the time we get
4224     // here. Force the Builder insertion point to something valid so that we do
4225     // not run into issues during insertion point restore in
4226     // getOrCreateVectorValue calls below.
4227     Builder.SetInsertPoint(NewPhi);
4228 
4229     // The predecessor order is preserved and we can rely on mapping between
4230     // scalar and vector block predecessors.
4231     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4232       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4233 
4234       // When looking up the new scalar/vector values to fix up, use incoming
4235       // values from original phi.
4236       Value *ScIncV =
4237           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4238 
4239       // Scalar incoming value may need a broadcast
4240       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4241       NewPhi->addIncoming(NewIncV, NewPredBB);
4242     }
4243   }
4244 }
4245 
4246 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4247                                    unsigned UF, ElementCount VF,
4248                                    bool IsPtrLoopInvariant,
4249                                    SmallBitVector &IsIndexLoopInvariant,
4250                                    VPTransformState &State) {
4251   // Construct a vector GEP by widening the operands of the scalar GEP as
4252   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4253   // results in a vector of pointers when at least one operand of the GEP
4254   // is vector-typed. Thus, to keep the representation compact, we only use
4255   // vector-typed operands for loop-varying values.
4256 
4257   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4258     // If we are vectorizing, but the GEP has only loop-invariant operands,
4259     // the GEP we build (by only using vector-typed operands for
4260     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4261     // produce a vector of pointers, we need to either arbitrarily pick an
4262     // operand to broadcast, or broadcast a clone of the original GEP.
4263     // Here, we broadcast a clone of the original.
4264     //
4265     // TODO: If at some point we decide to scalarize instructions having
4266     //       loop-invariant operands, this special case will no longer be
4267     //       required. We would add the scalarization decision to
4268     //       collectLoopScalars() and teach getVectorValue() to broadcast
4269     //       the lane-zero scalar value.
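    // For example (illustrative), a GEP whose base pointer and indices are
    // all loop-invariant yields a single scalar pointer per unroll part from
    // the clone, which is then splatted into a VF-wide vector of pointers.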
4270     auto *Clone = Builder.Insert(GEP->clone());
4271     for (unsigned Part = 0; Part < UF; ++Part) {
4272       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4273       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4274       addMetadata(EntryPart, GEP);
4275     }
4276   } else {
4277     // If the GEP has at least one loop-varying operand, we are sure to
4278     // produce a vector of pointers. But if we are only unrolling, we want
4279     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4280     // produce with the code below will be scalar (if VF == 1) or vector
4281     // (otherwise). Note that for the unroll-only case, we still maintain
4282     // values in the vector mapping with initVector, as we do for other
4283     // instructions.
4284     for (unsigned Part = 0; Part < UF; ++Part) {
4285       // The pointer operand of the new GEP. If it's loop-invariant, we
4286       // won't broadcast it.
4287       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4288                                      : State.get(Operands.getOperand(0), Part);
4289 
4290       // Collect all the indices for the new GEP. If any index is
4291       // loop-invariant, we won't broadcast it.
4292       SmallVector<Value *, 4> Indices;
4293       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4294         VPValue *Operand = Operands.getOperand(I);
4295         if (IsIndexLoopInvariant[I - 1])
4296           Indices.push_back(State.get(Operand, {0, 0}));
4297         else
4298           Indices.push_back(State.get(Operand, Part));
4299       }
4300 
4301       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4302       // but it should be a vector, otherwise.
4303       auto *NewGEP =
4304           GEP->isInBounds()
4305               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4306                                           Indices)
4307               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4308       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4309              "NewGEP is not a pointer vector");
4310       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4311       addMetadata(NewGEP, GEP);
4312     }
4313   }
4314 }
4315 
4316 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4317                                               ElementCount VF) {
4318   assert(!VF.Scalable && "scalable vectors not yet supported.");
4319   PHINode *P = cast<PHINode>(PN);
4320   if (EnableVPlanNativePath) {
4321     // Currently we enter here in the VPlan-native path for non-induction
4322     // PHIs where all control flow is uniform. We simply widen these PHIs.
4323     // Create a vector phi with no operands - the vector phi operands will be
4324     // set at the end of vector code generation.
4325     Type *VecTy =
4326         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4327     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4328     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4329     OrigPHIsToFix.push_back(P);
4330 
4331     return;
4332   }
4333 
4334   assert(PN->getParent() == OrigLoop->getHeader() &&
4335          "Non-header phis should have been handled elsewhere");
4336 
4337   // In order to support recurrences we need to be able to vectorize Phi nodes.
4338   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4339   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4340   // this value when we vectorize all of the instructions that use the PHI.
4341   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4342     for (unsigned Part = 0; Part < UF; ++Part) {
4343       // This is phase one of vectorizing PHIs.
4344       bool ScalarPHI =
4345           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4346       Type *VecTy =
4347           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4348       Value *EntryPart = PHINode::Create(
4349           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4350       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4351     }
4352     return;
4353   }
4354 
4355   setDebugLocFromInst(Builder, P);
4356 
4357   // This PHINode must be an induction variable.
4358   // Make sure that we know about it.
4359   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4360 
4361   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4362   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4363 
4364   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4365   // which can be found from the original scalar operations.
4366   switch (II.getKind()) {
4367   case InductionDescriptor::IK_NoInduction:
4368     llvm_unreachable("Unknown induction");
4369   case InductionDescriptor::IK_IntInduction:
4370   case InductionDescriptor::IK_FpInduction:
4371     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4372   case InductionDescriptor::IK_PtrInduction: {
4373     // Handle the pointer induction variable case.
4374     assert(P->getType()->isPointerTy() && "Unexpected type.");
4375 
4376     if (Cost->isScalarAfterVectorization(P, VF)) {
4377       // This is the normalized GEP that starts counting at zero.
4378       Value *PtrInd =
4379           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4380       // Determine the number of scalars we need to generate for each unroll
4381       // iteration. If the instruction is uniform, we only need to generate the
4382       // first lane. Otherwise, we generate all VF values.
4383       unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.Min;
4384       for (unsigned Part = 0; Part < UF; ++Part) {
4385         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4386           Constant *Idx =
4387               ConstantInt::get(PtrInd->getType(), Lane + Part * VF.Min);
4388           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4389           Value *SclrGep =
4390               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4391           SclrGep->setName("next.gep");
4392           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4393         }
4394       }
4395       return;
4396     }
4397     assert(isa<SCEVConstant>(II.getStep()) &&
4398            "Induction step not a SCEV constant!");
4399     Type *PhiType = II.getStep()->getType();
4400 
4401     // Build a pointer phi
4402     Value *ScalarStartValue = II.getStartValue();
4403     Type *ScStValueType = ScalarStartValue->getType();
4404     PHINode *NewPointerPhi =
4405         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4406     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4407 
4408     // A pointer induction, performed by using a gep
4409     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4410     Instruction *InductionLoc = LoopLatch->getTerminator();
4411     const SCEV *ScalarStep = II.getStep();
4412     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4413     Value *ScalarStepValue =
4414         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4415     Value *InductionGEP = GetElementPtrInst::Create(
4416         ScStValueType->getPointerElementType(), NewPointerPhi,
4417         Builder.CreateMul(ScalarStepValue,
4418                           ConstantInt::get(PhiType, VF.Min * UF)),
4419         "ptr.ind", InductionLoc);
4420     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4421 
4422     // Create UF many actual address geps that use the pointer
4423     // phi as base and a vectorized version of the step value
4424     // (<step*0, ..., step*N>) as offset.
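    // For example, with VF = 4 and UF = 2, part 0 uses offsets
    // <0, 1, 2, 3> * step and part 1 uses offsets <4, 5, 6, 7> * step,
    // both relative to the pointer phi.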
4425     for (unsigned Part = 0; Part < UF; ++Part) {
4426       SmallVector<Constant *, 8> Indices;
4427       // Create a vector of consecutive numbers from zero to VF.
4428       for (unsigned i = 0; i < VF.Min; ++i)
4429         Indices.push_back(ConstantInt::get(PhiType, i + Part * VF.Min));
4430       Constant *StartOffset = ConstantVector::get(Indices);
4431 
4432       Value *GEP = Builder.CreateGEP(
4433           ScStValueType->getPointerElementType(), NewPointerPhi,
4434           Builder.CreateMul(StartOffset,
4435                             Builder.CreateVectorSplat(VF.Min, ScalarStepValue),
4436                             "vector.gep"));
4437       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4438     }
4439   }
4440   }
4441 }
4442 
4443 /// A helper function for checking whether an integer division-related
4444 /// instruction may divide by zero (in which case it must be predicated if
4445 /// executed conditionally in the scalar code).
4446 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4447 /// Non-zero divisors that are non compile-time constants will not be
4448 /// converted into multiplication, so we will still end up scalarizing
4449 /// the division, but can do so w/o predication.
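/// For example, 'udiv %x, 7' cannot divide by zero and needs no predication,
/// whereas 'udiv %x, %y' (or a literal zero divisor) must be treated as if it
/// may.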
4450 static bool mayDivideByZero(Instruction &I) {
4451   assert((I.getOpcode() == Instruction::UDiv ||
4452           I.getOpcode() == Instruction::SDiv ||
4453           I.getOpcode() == Instruction::URem ||
4454           I.getOpcode() == Instruction::SRem) &&
4455          "Unexpected instruction");
4456   Value *Divisor = I.getOperand(1);
4457   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4458   return !CInt || CInt->isZero();
4459 }
4460 
4461 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4462                                            VPTransformState &State) {
4463   assert(!VF.Scalable && "scalable vectors not yet supported.");
4464   switch (I.getOpcode()) {
4465   case Instruction::Call:
4466   case Instruction::Br:
4467   case Instruction::PHI:
4468   case Instruction::GetElementPtr:
4469   case Instruction::Select:
4470     llvm_unreachable("This instruction is handled by a different recipe.");
4471   case Instruction::UDiv:
4472   case Instruction::SDiv:
4473   case Instruction::SRem:
4474   case Instruction::URem:
4475   case Instruction::Add:
4476   case Instruction::FAdd:
4477   case Instruction::Sub:
4478   case Instruction::FSub:
4479   case Instruction::FNeg:
4480   case Instruction::Mul:
4481   case Instruction::FMul:
4482   case Instruction::FDiv:
4483   case Instruction::FRem:
4484   case Instruction::Shl:
4485   case Instruction::LShr:
4486   case Instruction::AShr:
4487   case Instruction::And:
4488   case Instruction::Or:
4489   case Instruction::Xor: {
4490     // Just widen unops and binops.
4491     setDebugLocFromInst(Builder, &I);
4492 
4493     for (unsigned Part = 0; Part < UF; ++Part) {
4494       SmallVector<Value *, 2> Ops;
4495       for (VPValue *VPOp : User.operands())
4496         Ops.push_back(State.get(VPOp, Part));
4497 
4498       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4499 
4500       if (auto *VecOp = dyn_cast<Instruction>(V))
4501         VecOp->copyIRFlags(&I);
4502 
4503       // Use this vector value for all users of the original instruction.
4504       VectorLoopValueMap.setVectorValue(&I, Part, V);
4505       addMetadata(V, &I);
4506     }
4507 
4508     break;
4509   }
4510   case Instruction::ICmp:
4511   case Instruction::FCmp: {
4512     // Widen compares. Generate vector compares.
4513     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4514     auto *Cmp = cast<CmpInst>(&I);
4515     setDebugLocFromInst(Builder, Cmp);
4516     for (unsigned Part = 0; Part < UF; ++Part) {
4517       Value *A = State.get(User.getOperand(0), Part);
4518       Value *B = State.get(User.getOperand(1), Part);
4519       Value *C = nullptr;
4520       if (FCmp) {
4521         // Propagate fast math flags.
4522         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4523         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4524         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4525       } else {
4526         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4527       }
4528       VectorLoopValueMap.setVectorValue(&I, Part, C);
4529       addMetadata(C, &I);
4530     }
4531 
4532     break;
4533   }
4534 
4535   case Instruction::ZExt:
4536   case Instruction::SExt:
4537   case Instruction::FPToUI:
4538   case Instruction::FPToSI:
4539   case Instruction::FPExt:
4540   case Instruction::PtrToInt:
4541   case Instruction::IntToPtr:
4542   case Instruction::SIToFP:
4543   case Instruction::UIToFP:
4544   case Instruction::Trunc:
4545   case Instruction::FPTrunc:
4546   case Instruction::BitCast: {
4547     auto *CI = cast<CastInst>(&I);
4548     setDebugLocFromInst(Builder, CI);
4549 
4550     /// Vectorize casts.
4551     assert(!VF.Scalable && "VF is assumed to be non scalable.");
4552     Type *DestTy =
4553         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4554 
4555     for (unsigned Part = 0; Part < UF; ++Part) {
4556       Value *A = State.get(User.getOperand(0), Part);
4557       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4558       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4559       addMetadata(Cast, &I);
4560     }
4561     break;
4562   }
4563   default:
4564     // This instruction is not vectorized by simple widening.
4565     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4566     llvm_unreachable("Unhandled instruction!");
4567   } // end of switch.
4568 }
4569 
4570 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4571                                                VPTransformState &State) {
4572   assert(!isa<DbgInfoIntrinsic>(I) &&
4573          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4574   setDebugLocFromInst(Builder, &I);
4575 
4576   Module *M = I.getParent()->getParent()->getParent();
4577   auto *CI = cast<CallInst>(&I);
4578 
4579   SmallVector<Type *, 4> Tys;
4580   for (Value *ArgOperand : CI->arg_operands())
4581     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.Min));
4582 
4583   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4584 
4585   // The flag indicates whether we use an intrinsic or a plain call for the
4586   // vectorized version of the instruction, i.e., whether an intrinsic call is
4587   // more profitable than a library call.
4588   bool NeedToScalarize = false;
4589   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4590   bool UseVectorIntrinsic =
4591       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4592   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4593          "Instruction should be scalarized elsewhere.");
4594 
4595   for (unsigned Part = 0; Part < UF; ++Part) {
4596     SmallVector<Value *, 4> Args;
4597     for (auto &I : enumerate(ArgOperands.operands())) {
4598       // Some intrinsics have a scalar argument - don't replace it with a
4599       // vector.
4600       Value *Arg;
4601       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4602         Arg = State.get(I.value(), Part);
4603       else
4604         Arg = State.get(I.value(), {0, 0});
4605       Args.push_back(Arg);
4606     }
4607 
4608     Function *VectorF;
4609     if (UseVectorIntrinsic) {
4610       // Use vector version of the intrinsic.
4611       Type *TysForDecl[] = {CI->getType()};
4612       if (VF.isVector()) {
4613         assert(!VF.Scalable && "VF is assumed to be non scalable.");
4614         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4615       }
4616       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4617       assert(VectorF && "Can't retrieve vector intrinsic.");
4618     } else {
4619       // Use vector version of the function call.
4620       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4621 #ifndef NDEBUG
4622       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4623              "Can't create vector function.");
4624 #endif
4625       VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4626     }
4627     SmallVector<OperandBundleDef, 1> OpBundles;
4628     CI->getOperandBundlesAsDefs(OpBundles);
4629     CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4630 
4631     if (isa<FPMathOperator>(V))
4632       V->copyFastMathFlags(CI);
4633 
4634     VectorLoopValueMap.setVectorValue(&I, Part, V);
4635     addMetadata(V, &I);
4636   }
4637 }
4638 
4639 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4640                                                  VPUser &Operands,
4641                                                  bool InvariantCond,
4642                                                  VPTransformState &State) {
4643   setDebugLocFromInst(Builder, &I);
4644 
4645   // The condition can be loop invariant but still defined inside the
4646   // loop. This means that we can't just use the original 'cond' value.
4647   // We have to take the 'vectorized' value and pick the first lane.
4648   // Instcombine will make this a no-op.
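  // For example (illustrative), for a loop-invariant condition we take lane 0
  // of part 0 of its vectorized value and emit one
  //   %sel = select i1 %cond, <VF x ty> %op0, <VF x ty> %op1
  // per unroll part.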
4649   auto *InvarCond =
4650       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4651 
4652   for (unsigned Part = 0; Part < UF; ++Part) {
4653     Value *Cond =
4654         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4655     Value *Op0 = State.get(Operands.getOperand(1), Part);
4656     Value *Op1 = State.get(Operands.getOperand(2), Part);
4657     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4658     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4659     addMetadata(Sel, &I);
4660   }
4661 }
4662 
4663 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4664   // We should not collect Scalars more than once per VF. Right now, this
4665   // function is called from collectUniformsAndScalars(), which already does
4666   // this check. Collecting Scalars for VF=1 does not make any sense.
4667   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4668          "This function should not be visited twice for the same VF");
4669 
4670   SmallSetVector<Instruction *, 8> Worklist;
4671 
4672   // These sets are used to seed the analysis with pointers used by memory
4673   // accesses that will remain scalar.
4674   SmallSetVector<Instruction *, 8> ScalarPtrs;
4675   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4676   auto *Latch = TheLoop->getLoopLatch();
4677 
4678   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4679   // The pointer operands of loads and stores will be scalar as long as the
4680   // memory access is not a gather or scatter operation. The value operand of a
4681   // store will remain scalar if the store is scalarized.
4682   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4683     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4684     assert(WideningDecision != CM_Unknown &&
4685            "Widening decision should be ready at this moment");
4686     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4687       if (Ptr == Store->getValueOperand())
4688         return WideningDecision == CM_Scalarize;
4689     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4690            "Ptr is neither a value nor a pointer operand");
4691     return WideningDecision != CM_GatherScatter;
4692   };
4693 
4694   // A helper that returns true if the given value is a bitcast or
4695   // getelementptr instruction contained in the loop.
4696   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4697     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4698             isa<GetElementPtrInst>(V)) &&
4699            !TheLoop->isLoopInvariant(V);
4700   };
4701 
4702   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4703     if (!isa<PHINode>(Ptr) ||
4704         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4705       return false;
4706     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4707     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4708       return false;
4709     return isScalarUse(MemAccess, Ptr);
4710   };
4711 
4712   // A helper that evaluates a memory access's use of a pointer. If the
4713   // pointer is actually the pointer induction of a loop, it is being
4714   // inserted into Worklist. If the use will be a scalar use, and the
4715   // pointer is only used by memory accesses, we place the pointer in
4716   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4717   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4718     if (isScalarPtrInduction(MemAccess, Ptr)) {
4719       Worklist.insert(cast<Instruction>(Ptr));
4720       Instruction *Update = cast<Instruction>(
4721           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4722       Worklist.insert(Update);
4723       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4724                         << "\n");
4725       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4726                         << "\n");
4727       return;
4728     }
4729     // We only care about bitcast and getelementptr instructions contained in
4730     // the loop.
4731     if (!isLoopVaryingBitCastOrGEP(Ptr))
4732       return;
4733 
4734     // If the pointer has already been identified as scalar (e.g., if it was
4735     // also identified as uniform), there's nothing to do.
4736     auto *I = cast<Instruction>(Ptr);
4737     if (Worklist.count(I))
4738       return;
4739 
4740     // If the use of the pointer will be a scalar use, and all users of the
4741     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4742     // place the pointer in PossibleNonScalarPtrs.
4743     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4744           return isa<LoadInst>(U) || isa<StoreInst>(U);
4745         }))
4746       ScalarPtrs.insert(I);
4747     else
4748       PossibleNonScalarPtrs.insert(I);
4749   };
4750 
4751   // We seed the scalars analysis with two classes of instructions: (1)
4752   // instructions marked uniform-after-vectorization and (2) bitcast,
4753   // getelementptr and (pointer) phi instructions used by memory accesses
4754   // requiring a scalar use.
4755   //
4756   // (1) Add to the worklist all instructions that have been identified as
4757   // uniform-after-vectorization.
4758   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4759 
4760   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4761   // memory accesses requiring a scalar use. The pointer operands of loads and
4762   // stores will be scalar as long as the memory access is not a gather or
4763   // scatter operation. The value operand of a store will remain scalar if the
4764   // store is scalarized.
4765   for (auto *BB : TheLoop->blocks())
4766     for (auto &I : *BB) {
4767       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4768         evaluatePtrUse(Load, Load->getPointerOperand());
4769       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4770         evaluatePtrUse(Store, Store->getPointerOperand());
4771         evaluatePtrUse(Store, Store->getValueOperand());
4772       }
4773     }
4774   for (auto *I : ScalarPtrs)
4775     if (!PossibleNonScalarPtrs.count(I)) {
4776       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4777       Worklist.insert(I);
4778     }
4779 
4780   // Insert the forced scalars.
4781   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4782   // induction variable when the PHI user is scalarized.
4783   auto ForcedScalar = ForcedScalars.find(VF);
4784   if (ForcedScalar != ForcedScalars.end())
4785     for (auto *I : ForcedScalar->second)
4786       Worklist.insert(I);
4787 
4788   // Expand the worklist by looking through any bitcasts and getelementptr
4789   // instructions we've already identified as scalar. This is similar to the
4790   // expansion step in collectLoopUniforms(); however, here we're only
4791   // expanding to include additional bitcasts and getelementptr instructions.
4792   unsigned Idx = 0;
4793   while (Idx != Worklist.size()) {
4794     Instruction *Dst = Worklist[Idx++];
4795     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4796       continue;
4797     auto *Src = cast<Instruction>(Dst->getOperand(0));
4798     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4799           auto *J = cast<Instruction>(U);
4800           return !TheLoop->contains(J) || Worklist.count(J) ||
4801                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4802                   isScalarUse(J, Src));
4803         })) {
4804       Worklist.insert(Src);
4805       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4806     }
4807   }
4808 
4809   // An induction variable will remain scalar if all users of the induction
4810   // variable and induction variable update remain scalar.
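  // For example, if the only users of %i and %i.next are address computations
  // already placed in the worklist above, both remain scalar; if %i also fed a
  // widened arithmetic instruction, a vector induction would be needed and the
  // pair would not be added here.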
4811   for (auto &Induction : Legal->getInductionVars()) {
4812     auto *Ind = Induction.first;
4813     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4814 
4815     // If tail-folding is applied, the primary induction variable will be used
4816     // to feed a vector compare.
4817     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4818       continue;
4819 
4820     // Determine if all users of the induction variable are scalar after
4821     // vectorization.
4822     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4823       auto *I = cast<Instruction>(U);
4824       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4825     });
4826     if (!ScalarInd)
4827       continue;
4828 
4829     // Determine if all users of the induction variable update instruction are
4830     // scalar after vectorization.
4831     auto ScalarIndUpdate =
4832         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4833           auto *I = cast<Instruction>(U);
4834           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4835         });
4836     if (!ScalarIndUpdate)
4837       continue;
4838 
4839     // The induction variable and its update instruction will remain scalar.
4840     Worklist.insert(Ind);
4841     Worklist.insert(IndUpdate);
4842     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4843     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4844                       << "\n");
4845   }
4846 
4847   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4848 }
4849 
4850 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
4851                                                          ElementCount VF) {
4852   assert(!VF.Scalable && "scalable vectors not yet supported.");
4853   if (!blockNeedsPredication(I->getParent()))
4854     return false;
4855   switch(I->getOpcode()) {
4856   default:
4857     break;
4858   case Instruction::Load:
4859   case Instruction::Store: {
4860     if (!Legal->isMaskRequired(I))
4861       return false;
4862     auto *Ptr = getLoadStorePointerOperand(I);
4863     auto *Ty = getMemInstValueType(I);
4864     // We have already decided how to vectorize this instruction, get that
4865     // result.
4866     if (VF.isVector()) {
4867       InstWidening WideningDecision = getWideningDecision(I, VF);
4868       assert(WideningDecision != CM_Unknown &&
4869              "Widening decision should be ready at this moment");
4870       return WideningDecision == CM_Scalarize;
4871     }
4872     const Align Alignment = getLoadStoreAlignment(I);
4873     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4874                                 isLegalMaskedGather(Ty, Alignment))
4875                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4876                                 isLegalMaskedScatter(Ty, Alignment));
4877   }
4878   case Instruction::UDiv:
4879   case Instruction::SDiv:
4880   case Instruction::SRem:
4881   case Instruction::URem:
4882     return mayDivideByZero(*I);
4883   }
4884   return false;
4885 }
4886 
4887 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4888     Instruction *I, ElementCount VF) {
4889   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4890   assert(getWideningDecision(I, VF) == CM_Unknown &&
4891          "Decision should not be set yet.");
4892   auto *Group = getInterleavedAccessGroup(I);
4893   assert(Group && "Must have a group.");
4894 
4895   // If the instruction's allocated size doesn't equal its type size, it
4896   // requires padding and will be scalarized.
4897   auto &DL = I->getModule()->getDataLayout();
4898   auto *ScalarTy = getMemInstValueType(I);
4899   if (hasIrregularType(ScalarTy, DL, VF))
4900     return false;
4901 
4902   // Check if masking is required.
4903   // A Group may need masking for one of two reasons: it resides in a block that
4904   // needs predication, or it was decided to use masking to deal with gaps.
4905   bool PredicatedAccessRequiresMasking =
4906       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4907   bool AccessWithGapsRequiresMasking =
4908       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4909   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4910     return true;
4911 
4912   // If masked interleaving is required, we expect that the user/target had
4913   // enabled it, because otherwise it either wouldn't have been created or
4914   // it should have been invalidated by the CostModel.
4915   assert(useMaskedInterleavedAccesses(TTI) &&
4916          "Masked interleave-groups for predicated accesses are not enabled.");
4917 
4918   auto *Ty = getMemInstValueType(I);
4919   const Align Alignment = getLoadStoreAlignment(I);
4920   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4921                           : TTI.isLegalMaskedStore(Ty, Alignment);
4922 }
4923 
4924 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4925     Instruction *I, ElementCount VF) {
4926   // Get and ensure we have a valid memory instruction.
4927   LoadInst *LI = dyn_cast<LoadInst>(I);
4928   StoreInst *SI = dyn_cast<StoreInst>(I);
4929   assert((LI || SI) && "Invalid memory instruction");
4930 
4931   auto *Ptr = getLoadStorePointerOperand(I);
4932 
4933   // First of all, to be widened the pointer must be consecutive.
4934   if (!Legal->isConsecutivePtr(Ptr))
4935     return false;
4936 
4937   // If the instruction is a store located in a predicated block, it will be
4938   // scalarized.
4939   if (isScalarWithPredication(I))
4940     return false;
4941 
4942   // If the instruction's allocated size doesn't equal its type size, it
4943   // requires padding and will be scalarized.
4944   auto &DL = I->getModule()->getDataLayout();
4945   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4946   if (hasIrregularType(ScalarTy, DL, VF))
4947     return false;
4948 
4949   return true;
4950 }
4951 
4952 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4953   // We should not collect Uniforms more than once per VF. Right now,
4954   // this function is called from collectUniformsAndScalars(), which
4955   // already does this check. Collecting Uniforms for VF=1 does not make any
4956   // sense.
4957 
4958   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4959          "This function should not be visited twice for the same VF");
4960 
4961   // Initialize an entry for this VF; even if we find no uniform values, we
4962   // will not analyze it again, since Uniforms.count(VF) will return 1.
4963   Uniforms[VF].clear();
4964 
4965   // We now know that the loop is vectorizable!
4966   // Collect instructions inside the loop that will remain uniform after
4967   // vectorization.
4968 
4969   // Global values, parameters and instructions outside of the current loop
4970   // are out of scope.
4971   auto isOutOfScope = [&](Value *V) -> bool {
4972     Instruction *I = dyn_cast<Instruction>(V);
4973     return (!I || !TheLoop->contains(I));
4974   };
4975 
4976   SetVector<Instruction *> Worklist;
4977   BasicBlock *Latch = TheLoop->getLoopLatch();
4978 
4979   // Instructions that are scalar with predication must not be considered
4980   // uniform after vectorization, because that would create an erroneous
4981   // replicating region where only a single instance out of VF should be formed.
4982   // TODO: optimize such seldom cases if found important, see PR40816.
4983   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4984     if (isScalarWithPredication(I, VF)) {
4985       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4986                         << *I << "\n");
4987       return;
4988     }
4989     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4990     Worklist.insert(I);
4991   };
4992 
4993   // Start with the conditional branch. If the branch condition is an
4994   // instruction contained in the loop that is only used by the branch, it is
4995   // uniform.
4996   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4997   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4998     addToWorklistIfAllowed(Cmp);
4999 
5000   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5001   // are pointers that are treated like consecutive pointers during
5002   // vectorization. The pointer operands of interleaved accesses are an
5003   // example.
5004   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
5005 
5006   // Holds pointer operands of instructions that are possibly non-uniform.
5007   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
5008 
5009   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5010     InstWidening WideningDecision = getWideningDecision(I, VF);
5011     assert(WideningDecision != CM_Unknown &&
5012            "Widening decision should be ready at this moment");
5013 
5014     return (WideningDecision == CM_Widen ||
5015             WideningDecision == CM_Widen_Reverse ||
5016             WideningDecision == CM_Interleave);
5017   };
5018   // Iterate over the instructions in the loop, and collect all
5019   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5020   // that a consecutive-like pointer operand will be scalarized, we collect it
5021   // in PossibleNonUniformPtrs instead. We use two sets here because a single
5022   // getelementptr instruction can be used by both vectorized and scalarized
5023   // memory instructions. For example, if a loop loads and stores from the same
5024   // location, but the store is conditional, the store will be scalarized, and
5025   // the getelementptr won't remain uniform.
5026   for (auto *BB : TheLoop->blocks())
5027     for (auto &I : *BB) {
5028       // If there's no pointer operand, there's nothing to do.
5029       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5030       if (!Ptr)
5031         continue;
5032 
5033       // True if all users of Ptr are memory accesses that have Ptr as their
5034       // pointer operand.
5035       auto UsersAreMemAccesses =
5036           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5037             return getLoadStorePointerOperand(U) == Ptr;
5038           });
5039 
5040       // Ensure the memory instruction will not be scalarized or used by
5041       // gather/scatter, making its pointer operand non-uniform. If the pointer
5042       // operand is used by any instruction other than a memory access, we
5043       // conservatively assume the pointer operand may be non-uniform.
5044       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5045         PossibleNonUniformPtrs.insert(Ptr);
5046 
5047       // If the memory instruction will be vectorized and its pointer operand
5048       // is consecutive-like or part of an interleaved group, the pointer
5049       // operand should remain uniform.
5050       else
5051         ConsecutiveLikePtrs.insert(Ptr);
5052     }
5053 
5054   // Add to the Worklist all consecutive and consecutive-like pointers that
5055   // aren't also identified as possibly non-uniform.
5056   for (auto *V : ConsecutiveLikePtrs)
5057     if (!PossibleNonUniformPtrs.count(V))
5058       addToWorklistIfAllowed(V);
5059 
5060   // Expand Worklist in topological order: whenever a new instruction
5061   // is added, its users should already be inside Worklist. This ensures
5062   // that a uniform instruction will only be used by uniform instructions.
5063   unsigned idx = 0;
5064   while (idx != Worklist.size()) {
5065     Instruction *I = Worklist[idx++];
5066 
5067     for (auto OV : I->operand_values()) {
5068       // isOutOfScope operands cannot be uniform instructions.
5069       if (isOutOfScope(OV))
5070         continue;
5071       // First-order recurrence phis should typically be considered
5072       // non-uniform.
5073       auto *OP = dyn_cast<PHINode>(OV);
5074       if (OP && Legal->isFirstOrderRecurrence(OP))
5075         continue;
5076       // If all the users of the operand are uniform, then add the
5077       // operand into the uniform worklist.
5078       auto *OI = cast<Instruction>(OV);
5079       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5080             auto *J = cast<Instruction>(U);
5081             return Worklist.count(J) ||
5082                    (OI == getLoadStorePointerOperand(J) &&
5083                     isUniformDecision(J, VF));
5084           }))
5085         addToWorklistIfAllowed(OI);
5086     }
5087   }
5088 
5089   // Returns true if Ptr is the pointer operand of a memory access instruction
5090   // I, and I is known to not require scalarization.
5091   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5092     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5093   };
5094 
5095   // For an instruction to be added into Worklist above, all its users inside
5096   // the loop should also be in Worklist. However, this condition cannot be
5097   // true for phi nodes that form a cyclic dependence. We must process phi
5098   // nodes separately. An induction variable will remain uniform if all users
5099   // of the induction variable and induction variable update remain uniform.
5100   // The code below handles both pointer and non-pointer induction variables.
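  // For example, the induction phi %iv and its update %iv.next use each other,
  // so neither can satisfy the "all users already uniform" condition on its
  // own during the expansion above; they are therefore checked here as a pair
  // against their remaining users.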
5101   for (auto &Induction : Legal->getInductionVars()) {
5102     auto *Ind = Induction.first;
5103     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5104 
5105     // Determine if all users of the induction variable are uniform after
5106     // vectorization.
5107     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5108       auto *I = cast<Instruction>(U);
5109       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5110              isVectorizedMemAccessUse(I, Ind);
5111     });
5112     if (!UniformInd)
5113       continue;
5114 
5115     // Determine if all users of the induction variable update instruction are
5116     // uniform after vectorization.
5117     auto UniformIndUpdate =
5118         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5119           auto *I = cast<Instruction>(U);
5120           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5121                  isVectorizedMemAccessUse(I, IndUpdate);
5122         });
5123     if (!UniformIndUpdate)
5124       continue;
5125 
5126     // The induction variable and its update instruction will remain uniform.
5127     addToWorklistIfAllowed(Ind);
5128     addToWorklistIfAllowed(IndUpdate);
5129   }
5130 
5131   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5132 }
5133 
5134 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5135   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5136 
5137   if (Legal->getRuntimePointerChecking()->Need) {
5138     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5139         "runtime pointer checks needed. Enable vectorization of this "
5140         "loop with '#pragma clang loop vectorize(enable)' when "
5141         "compiling with -Os/-Oz",
5142         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5143     return true;
5144   }
5145 
5146   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5147     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5148         "runtime SCEV checks needed. Enable vectorization of this "
5149         "loop with '#pragma clang loop vectorize(enable)' when "
5150         "compiling with -Os/-Oz",
5151         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5152     return true;
5153   }
5154 
5155   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5156   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5157     reportVectorizationFailure("Runtime stride check for small trip count",
5158         "runtime stride == 1 checks needed. Enable vectorization of "
5159         "this loop without such check by compiling with -Os/-Oz",
5160         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5161     return true;
5162   }
5163 
5164   return false;
5165 }
5166 
5167 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
5168                                                             unsigned UserIC) {
5169   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5170     // TODO: It may be useful to do this anyway, since the check is still
5171     // likely to be dynamically uniform if the target can skip it.
5172     reportVectorizationFailure(
5173         "Not inserting runtime ptr check for divergent target",
5174         "runtime pointer checks needed. Not enabled for divergent target",
5175         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5176     return None;
5177   }
5178 
5179   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5180   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5181   if (TC == 1) {
5182     reportVectorizationFailure("Single iteration (non) loop",
5183         "loop trip count is one, irrelevant for vectorization",
5184         "SingleIterationLoop", ORE, TheLoop);
5185     return None;
5186   }
5187 
5188   switch (ScalarEpilogueStatus) {
5189   case CM_ScalarEpilogueAllowed:
5190     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5191   case CM_ScalarEpilogueNotNeededUsePredicate:
5192     LLVM_DEBUG(
5193         dbgs() << "LV: vector predicate hint/switch found.\n"
5194                << "LV: Not allowing scalar epilogue, creating predicated "
5195                << "vector loop.\n");
5196     break;
5197   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5198     // fallthrough as a special case of OptForSize
5199   case CM_ScalarEpilogueNotAllowedOptSize:
5200     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5201       LLVM_DEBUG(
5202           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5203     else
5204       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5205                         << "count.\n");
5206 
5207     // Bail if runtime checks are required, which are not good when optimising
5208     // for size.
5209     if (runtimeChecksRequired())
5210       return None;
5211     break;
5212   }
5213 
5214   // Now try to fold the tail by masking.
5215 
5216   // Invalidate interleave groups that require an epilogue if we can't mask
5217   // the interleave-group.
5218   if (!useMaskedInterleavedAccesses(TTI)) {
5219     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5220            "No decisions should have been taken at this point");
5221     // Note: There is no need to invalidate any cost modeling decisions here,
5222     // as none were taken so far.
5223     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5224   }
5225 
5226   unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5227   assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5228   unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
5229   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5230     // Accept MaxVF if we do not have a tail.
5231     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5232     return MaxVF;
5233   }
5234 
5235   // If we don't know the precise trip count, or if the trip count that we
5236   // found modulo the vectorization factor is not zero, try to fold the tail
5237   // by masking.
5238   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5239   if (Legal->prepareToFoldTailByMasking()) {
5240     FoldTailByMasking = true;
5241     return MaxVF;
5242   }
5243 
5244   if (TC == 0) {
5245     reportVectorizationFailure(
5246         "Unable to calculate the loop count due to complex control flow",
5247         "unable to calculate the loop count due to complex control flow",
5248         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5249     return None;
5250   }
5251 
5252   reportVectorizationFailure(
5253       "Cannot optimize for size and vectorize at the same time.",
5254       "cannot optimize for size and vectorize at the same time. "
5255       "Enable vectorization of this loop with '#pragma clang loop "
5256       "vectorize(enable)' when compiling with -Os/-Oz",
5257       "NoTailLoopWithOptForSize", ORE, TheLoop);
5258   return None;
5259 }
5260 
5261 unsigned
5262 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5263   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5264   unsigned SmallestType, WidestType;
5265   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5266   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5267 
5268   // Get the maximum safe dependence distance in bits computed by LAA.
5269   // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
5270   // the memory access that is most restrictive (involved in the smallest
5271   // dependence distance).
5272   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5273 
5274   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5275 
5276   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5277   // Note that both WidestRegister and WidestType may not be powers of 2.
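  // For example, with a 256-bit widest register and a 32-bit widest type, the
  // computation below yields PowerOf2Floor(256 / 32) = 8; if LAA reports a
  // more restrictive 128-bit safe width, WidestRegister is clamped above and
  // the result becomes 4.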
5278   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5279 
5280   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5281                     << " / " << WidestType << " bits.\n");
5282   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5283                     << WidestRegister << " bits.\n");
5284 
5285   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5286                                  " into one vector!");
5287   if (MaxVectorSize == 0) {
5288     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5289     MaxVectorSize = 1;
5290     return MaxVectorSize;
5291   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5292              isPowerOf2_32(ConstTripCount)) {
5293     // We need to clamp the VF to be the ConstTripCount. There is no point in
5294     // choosing a higher viable VF as done in the loop below.
5295     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5296                       << ConstTripCount << "\n");
5297     MaxVectorSize = ConstTripCount;
5298     return MaxVectorSize;
5299   }
5300 
5301   unsigned MaxVF = MaxVectorSize;
5302   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5303       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5304     // Collect all viable vectorization factors larger than the default MaxVF
5305     // (i.e. MaxVectorSize).
5306     SmallVector<ElementCount, 8> VFs;
5307     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5308     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5309       VFs.push_back(ElementCount::getFixed(VS));
5310 
5311     // For each VF calculate its register usage.
5312     auto RUs = calculateRegisterUsage(VFs);
5313 
5314     // Select the largest VF which doesn't require more registers than existing
5315     // ones.
5316     for (int i = RUs.size() - 1; i >= 0; --i) {
5317       bool Selected = true;
5318       for (auto& pair : RUs[i].MaxLocalUsers) {
5319         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5320         if (pair.second > TargetNumRegisters)
5321           Selected = false;
5322       }
5323       if (Selected) {
5324         MaxVF = VFs[i].Min;
5325         break;
5326       }
5327     }
5328     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5329       if (MaxVF < MinVF) {
5330         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5331                           << ") with target's minimum: " << MinVF << '\n');
5332         MaxVF = MinVF;
5333       }
5334     }
5335   }
5336   return MaxVF;
5337 }
5338 
5339 VectorizationFactor
5340 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5341   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5342   const float ScalarCost = Cost;
5343   unsigned Width = 1;
5344   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5345 
5346   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5347   if (ForceVectorization && MaxVF > 1) {
5348     // Ignore scalar width, because the user explicitly wants vectorization.
5349     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5350     // evaluation.
5351     Cost = std::numeric_limits<float>::max();
5352   }
5353 
5354   for (unsigned i = 2; i <= MaxVF; i *= 2) {
5355     // Notice that the vector loop needs to be executed fewer times, so
5356     // we need to divide the cost of the vector loop by the width of
5357     // the vector elements.
5358     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5359     float VectorCost = C.first / (float)i;
5360     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5361                       << " costs: " << (int)VectorCost << ".\n");
5362     if (!C.second && !ForceVectorization) {
5363       LLVM_DEBUG(
5364           dbgs() << "LV: Not considering vector loop of width " << i
5365                  << " because it will not generate any vector instructions.\n");
5366       continue;
5367     }
5368     if (VectorCost < Cost) {
5369       Cost = VectorCost;
5370       Width = i;
5371     }
5372   }
5373 
5374   if (!EnableCondStoresVectorization && NumPredStores) {
5375     reportVectorizationFailure("There are conditional stores.",
5376         "store that is conditionally executed prevents vectorization",
5377         "ConditionalStore", ORE, TheLoop);
5378     Width = 1;
5379     Cost = ScalarCost;
5380   }
5381 
5382   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5383              << "LV: Vectorization seems to be not beneficial, "
5384              << "but was forced by a user.\n");
5385   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5386   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5387                                 (unsigned)(Width * Cost)};
5388   return Factor;
5389 }
5390 
5391 std::pair<unsigned, unsigned>
5392 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5393   unsigned MinWidth = -1U;
5394   unsigned MaxWidth = 8;
5395   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5396 
5397   // For each block.
5398   for (BasicBlock *BB : TheLoop->blocks()) {
5399     // For each instruction in the loop.
5400     for (Instruction &I : BB->instructionsWithoutDebug()) {
5401       Type *T = I.getType();
5402 
5403       // Skip ignored values.
5404       if (ValuesToIgnore.count(&I))
5405         continue;
5406 
5407       // Only examine Loads, Stores and PHINodes.
5408       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5409         continue;
5410 
5411       // Examine PHI nodes that are reduction variables. Update the type to
5412       // account for the recurrence type.
5413       if (auto *PN = dyn_cast<PHINode>(&I)) {
5414         if (!Legal->isReductionVariable(PN))
5415           continue;
5416         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5417         T = RdxDesc.getRecurrenceType();
5418       }
5419 
5420       // Examine the stored values.
5421       if (auto *ST = dyn_cast<StoreInst>(&I))
5422         T = ST->getValueOperand()->getType();
5423 
5424       // Ignore loaded pointer types and stored pointer types that are not
5425       // vectorizable.
5426       //
5427       // FIXME: The check here attempts to predict whether a load or store will
5428       //        be vectorized. We only know this for certain after a VF has
5429       //        been selected. Here, we assume that if an access can be
5430       //        vectorized, it will be. We should also look at extending this
5431       //        optimization to non-pointer types.
5432       //
5433       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5434           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5435         continue;
5436 
5437       MinWidth = std::min(MinWidth,
5438                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5439       MaxWidth = std::max(MaxWidth,
5440                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5441     }
5442   }
5443 
5444   return {MinWidth, MaxWidth};
5445 }
5446 
5447 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5448                                                            unsigned LoopCost) {
5449   // -- The interleave heuristics --
5450   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5451   // There are many micro-architectural considerations that we can't predict
5452   // at this level. For example, frontend pressure (on decode or fetch) due to
5453   // code size, or the number and capabilities of the execution ports.
5454   //
5455   // We use the following heuristics to select the interleave count:
5456   // 1. If the code has reductions, then we interleave to break the cross
5457   // iteration dependency.
5458   // 2. If the loop is really small, then we interleave to reduce the loop
5459   // overhead.
5460   // 3. We don't interleave if we think that we will spill registers to memory
5461   // due to the increased register pressure.
5462 
5463   if (!isScalarEpilogueAllowed())
5464     return 1;
5465 
5466   // The max safe dependence distance already limits the width; do not interleave.
5467   if (Legal->getMaxSafeDepDistBytes() != -1U)
5468     return 1;
5469 
5470   // Do not interleave loops with a relatively small known or estimated trip
5471   // count.
5472   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5473   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5474     return 1;
5475 
5476   RegisterUsage R = calculateRegisterUsage({VF})[0];
5477   // We divide by these constants so assume that we have at least one
5478   // instruction that uses at least one register.
5479   for (auto& pair : R.MaxLocalUsers) {
5480     pair.second = std::max(pair.second, 1U);
5481   }
5482 
5483   // We calculate the interleave count using the following formula.
5484   // Subtract the number of loop invariants from the number of available
5485   // registers. These registers are used by all of the interleaved instances.
5486   // Next, divide the remaining registers by the number of registers that is
5487   // required by the loop, in order to estimate how many parallel instances
5488   // fit without causing spills. All of this is rounded down if necessary to be
5489   // a power of two. We want power of two interleave count to simplify any
5490   // addressing operations or alignment considerations.
5491   // We also want power of two interleave counts to ensure that the induction
5492   // variable of the vector loop wraps to zero, when tail is folded by masking;
5493   // this currently happens when OptForSize, in which case IC is set to 1 above.
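  // For example, with 32 available registers in a class, 2 of them holding
  // loop-invariant values and a maximum local usage of 10, the estimate below
  // is PowerOf2Floor((32 - 2) / 10) = 2 interleaved instances.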
5494   unsigned IC = UINT_MAX;
5495 
5496   for (auto& pair : R.MaxLocalUsers) {
5497     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5498     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5499                       << " registers of "
5500                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5501     if (VF == 1) {
5502       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5503         TargetNumRegisters = ForceTargetNumScalarRegs;
5504     } else {
5505       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5506         TargetNumRegisters = ForceTargetNumVectorRegs;
5507     }
5508     unsigned MaxLocalUsers = pair.second;
5509     unsigned LoopInvariantRegs = 0;
5510     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5511       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5512 
5513     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5514     // Don't count the induction variable as interleaved.
5515     if (EnableIndVarRegisterHeur) {
5516       TmpIC =
5517           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5518                         std::max(1U, (MaxLocalUsers - 1)));
5519     }
5520 
5521     IC = std::min(IC, TmpIC);
5522   }
5523 
5524   // Clamp the interleave ranges to reasonable counts.
5525   assert(!VF.Scalable && "scalable vectors not yet supported.");
5526   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Min);
5527 
5528   // Check if the user has overridden the max.
5529   if (VF == 1) {
5530     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5531       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5532   } else {
5533     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5534       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5535   }
5536 
5537   // If the trip count is a known or estimated compile-time constant, limit
5538   // the interleave count so that it does not exceed the trip count divided by VF.
5539   if (BestKnownTC) {
5540     MaxInterleaveCount = std::min(*BestKnownTC / VF.Min, MaxInterleaveCount);
5541   }
5542 
5543   // If we did not calculate the cost for VF (because the user selected the VF)
5544   // then we calculate the cost of VF here.
5545   if (LoopCost == 0)
5546     LoopCost = expectedCost(VF).first;
5547 
5548   assert(LoopCost && "Non-zero loop cost expected");
5549 
5550   // Clamp the calculated IC to be between 1 and the max interleave count
5551   // that the target and trip count allow.
5552   if (IC > MaxInterleaveCount)
5553     IC = MaxInterleaveCount;
5554   else if (IC < 1)
5555     IC = 1;
5556 
5557   // Interleave if we vectorized this loop and there is a reduction that could
5558   // benefit from interleaving.
5559   if (VF.isVector() && !Legal->getReductionVars().empty()) {
5560     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5561     return IC;
5562   }
5563 
5564   // Note that if we've already vectorized the loop we will have done the
5565   // runtime check and so interleaving won't require further checks.
5566   bool InterleavingRequiresRuntimePointerCheck =
5567       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5568 
5569   // We want to interleave small loops in order to reduce the loop overhead and
5570   // potentially expose ILP opportunities.
5571   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5572   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5573     // We assume that the cost overhead is 1 and we use the cost model
5574     // to estimate the cost of the loop and interleave until the cost of the
5575     // loop overhead is about 5% of the cost of the loop.
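    // For example, if SmallLoopCost is 20 and the estimated LoopCost is 5,
    // SmallIC is at most PowerOf2Floor(20 / 5) = 4.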
5576     unsigned SmallIC =
5577         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5578 
5579     // Interleave until store/load ports (estimated by max interleave count) are
5580     // saturated.
5581     unsigned NumStores = Legal->getNumStores();
5582     unsigned NumLoads = Legal->getNumLoads();
5583     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5584     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5585 
5586     // If we have a scalar reduction (vector reductions are already dealt with
5587     // by this point), we can increase the critical path length if the loop
5588     // we're interleaving is inside another loop. We limit, by default, to 2, so
5589     // that the critical path only gets increased by one reduction operation.
5590     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5591       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5592       SmallIC = std::min(SmallIC, F);
5593       StoresIC = std::min(StoresIC, F);
5594       LoadsIC = std::min(LoadsIC, F);
5595     }
5596 
5597     if (EnableLoadStoreRuntimeInterleave &&
5598         std::max(StoresIC, LoadsIC) > SmallIC) {
5599       LLVM_DEBUG(
5600           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5601       return std::max(StoresIC, LoadsIC);
5602     }
5603 
5604     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5605     return SmallIC;
5606   }
5607 
5608   // Interleave if this is a large loop (small loops are already dealt with by
5609   // this point) that could benefit from interleaving.
5610   bool HasReductions = !Legal->getReductionVars().empty();
5611   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5612     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5613     return IC;
5614   }
5615 
5616   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5617   return 1;
5618 }
5619 
5620 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5621 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5622   // This function calculates the register usage by measuring the highest number
5623   // of values that are alive at a single location. Obviously, this is a very
5624   // rough estimation. We scan the loop in topological order and
5625   // assign a number to each instruction. We use RPO to ensure that defs are
5626   // met before their users. We assume that each instruction that has in-loop
5627   // users starts an interval. We record every time that an in-loop value is
5628   // used, so we have a list of the first and last occurrences of each
5629   // instruction. Next, we transpose this data structure into a multi map that
5630   // holds the list of intervals that *end* at a specific location. This multi
5631   // map allows us to perform a linear search. We scan the instructions linearly
5632   // and record each time that a new interval starts, by placing it in a set.
5633   // If we find this value in the multi-map then we remove it from the set.
5634   // The max register usage is the maximum size of the set.
5635   // We also search for instructions that are defined outside the loop, but are
5636   // used inside the loop. We need this number separately from the max-interval
5637   // usage number because when we unroll, loop-invariant values do not take
5638   // more registers.
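  //
  // For example, in a single-use chain %a = ...; %b = f(%a); %c = g(%b), at
  // most one interval is open at any point, so the estimated usage for the
  // chain is one register per register class; loop-invariant inputs are
  // counted separately.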
5639   LoopBlocksDFS DFS(TheLoop);
5640   DFS.perform(LI);
5641 
5642   RegisterUsage RU;
5643 
5644   // Each 'key' in the map opens a new interval. The values
5645   // of the map are the index of the 'last seen' usage of the
5646   // instruction that is the key.
5647   using IntervalMap = DenseMap<Instruction *, unsigned>;
5648 
5649   // Maps instruction to its index.
5650   SmallVector<Instruction *, 64> IdxToInstr;
5651   // Marks the end of each interval.
5652   IntervalMap EndPoint;
5653   // Saves the set of instructions that are used in the loop.
5654   SmallPtrSet<Instruction *, 8> Ends;
5655   // Saves the list of values that are used in the loop but are
5656   // defined outside the loop, such as arguments and constants.
5657   SmallPtrSet<Value *, 8> LoopInvariants;
5658 
5659   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5660     for (Instruction &I : BB->instructionsWithoutDebug()) {
5661       IdxToInstr.push_back(&I);
5662 
5663       // Save the end location of each USE.
5664       for (Value *U : I.operands()) {
5665         auto *Instr = dyn_cast<Instruction>(U);
5666 
5667         // Ignore non-instruction values such as arguments, constants, etc.
5668         if (!Instr)
5669           continue;
5670 
5671         // If this instruction is outside the loop then record it and continue.
5672         if (!TheLoop->contains(Instr)) {
5673           LoopInvariants.insert(Instr);
5674           continue;
5675         }
5676 
5677         // Overwrite previous end points.
5678         EndPoint[Instr] = IdxToInstr.size();
5679         Ends.insert(Instr);
5680       }
5681     }
5682   }
5683 
5684   // Saves the list of intervals that end with the index in 'key'.
5685   using InstrList = SmallVector<Instruction *, 2>;
5686   DenseMap<unsigned, InstrList> TransposeEnds;
5687 
5688   // Transpose the EndPoints to a list of values that end at each index.
5689   for (auto &Interval : EndPoint)
5690     TransposeEnds[Interval.second].push_back(Interval.first);
5691 
5692   SmallPtrSet<Instruction *, 8> OpenIntervals;
5693 
5694   // Get the size of the widest register.
5695   unsigned MaxSafeDepDist = -1U;
5696   if (Legal->getMaxSafeDepDistBytes() != -1U)
5697     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5698   unsigned WidestRegister =
5699       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5700   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5701 
5702   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5703   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5704 
5705   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5706 
5707   // A lambda that gets the register usage for the given type and VF.
5708   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) {
5709     if (Ty->isTokenTy())
5710       return 0U;
5711     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5712     assert(!VF.Scalable && "scalable vectors not yet supported.");
5713     return std::max<unsigned>(1, VF.Min * TypeSize / WidestRegister);
5714   };
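  // For example, with a 128-bit WidestRegister, a 32-bit scalar type at VF = 4
  // maps to max(1, 4 * 32 / 128) = 1 register, while a 64-bit type at VF = 4
  // maps to 2 registers.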
5715 
5716   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5717     Instruction *I = IdxToInstr[i];
5718 
5719     // Remove all of the instructions that end at this location.
5720     InstrList &List = TransposeEnds[i];
5721     for (Instruction *ToRemove : List)
5722       OpenIntervals.erase(ToRemove);
5723 
5724     // Ignore instructions that are never used within the loop.
5725     if (!Ends.count(I))
5726       continue;
5727 
5728     // Skip ignored values.
5729     if (ValuesToIgnore.count(I))
5730       continue;
5731 
5732     // For each VF find the maximum usage of registers.
5733     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5734       // Count the number of live intervals.
5735       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5736 
5737       if (VFs[j].isScalar()) {
5738         for (auto Inst : OpenIntervals) {
5739           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5740           if (RegUsage.find(ClassID) == RegUsage.end())
5741             RegUsage[ClassID] = 1;
5742           else
5743             RegUsage[ClassID] += 1;
5744         }
5745       } else {
5746         collectUniformsAndScalars(VFs[j]);
5747         for (auto Inst : OpenIntervals) {
5748           // Skip ignored values for VF > 1.
5749           if (VecValuesToIgnore.count(Inst))
5750             continue;
5751           if (isScalarAfterVectorization(Inst, VFs[j])) {
5752             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5753             if (RegUsage.find(ClassID) == RegUsage.end())
5754               RegUsage[ClassID] = 1;
5755             else
5756               RegUsage[ClassID] += 1;
5757           } else {
5758             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5759             if (RegUsage.find(ClassID) == RegUsage.end())
5760               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5761             else
5762               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5763           }
5764         }
5765       }
5766 
5767       for (auto& pair : RegUsage) {
5768         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5769           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5770         else
5771           MaxUsages[j][pair.first] = pair.second;
5772       }
5773     }
5774 
5775     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5776                       << OpenIntervals.size() << '\n');
5777 
5778     // Add the current instruction to the list of open intervals.
5779     OpenIntervals.insert(I);
5780   }
5781 
5782   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5783     SmallMapVector<unsigned, unsigned, 4> Invariant;
5784 
5785     for (auto Inst : LoopInvariants) {
5786       unsigned Usage =
5787           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5788       unsigned ClassID =
5789           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
5790       if (Invariant.find(ClassID) == Invariant.end())
5791         Invariant[ClassID] = Usage;
5792       else
5793         Invariant[ClassID] += Usage;
5794     }
5795 
5796     LLVM_DEBUG({
5797       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5798       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5799              << " item\n";
5800       for (const auto &pair : MaxUsages[i]) {
5801         dbgs() << "LV(REG): RegisterClass: "
5802                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5803                << " registers\n";
5804       }
5805       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5806              << " item\n";
5807       for (const auto &pair : Invariant) {
5808         dbgs() << "LV(REG): RegisterClass: "
5809                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5810                << " registers\n";
5811       }
5812     });
5813 
5814     RU.LoopInvariantRegs = Invariant;
5815     RU.MaxLocalUsers = MaxUsages[i];
5816     RUs[i] = RU;
5817   }
5818 
5819   return RUs;
5820 }
5821 
5822 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5823   // TODO: Cost model for emulated masked load/store is completely
5824   // broken. This hack guides the cost model to use an artificially
5825   // high enough value to practically disable vectorization with such
5826   // operations, except where previously deployed legality hack allowed
5827   // using very low cost values. This is to avoid regressions coming simply
5828   // from moving "masked load/store" check from legality to cost model.
5829   // Masked Load/Gather emulation was previously never allowed.
5830   // Only a limited amount of Masked Store/Scatter emulation was allowed.
5831   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5832   return isa<LoadInst>(I) ||
5833          (isa<StoreInst>(I) &&
5834           NumPredStores > NumberOfStoresToPredicate);
5835 }
5836 
5837 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5838   // If we aren't vectorizing the loop, or if we've already collected the
5839   // instructions to scalarize, there's nothing to do. Collection may already
5840   // have occurred if we have a user-selected VF and are now computing the
5841   // expected cost for interleaving.
5842   if (VF.isScalar() || VF.isZero() ||
5843       InstsToScalarize.find(VF) != InstsToScalarize.end())
5844     return;
5845 
5846   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5847   // not profitable to scalarize any instructions, the presence of VF in the
5848   // map will indicate that we've analyzed it already.
5849   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5850 
5851   // Find all the instructions that are scalar with predication in the loop and
5852   // determine if it would be better to not if-convert the blocks they are in.
5853   // If so, we also record the instructions to scalarize.
5854   for (BasicBlock *BB : TheLoop->blocks()) {
5855     if (!blockNeedsPredication(BB))
5856       continue;
5857     for (Instruction &I : *BB)
5858       if (isScalarWithPredication(&I)) {
5859         ScalarCostsTy ScalarCosts;
5860         // Do not apply discount logic if hacked cost is needed
5861         // for emulated masked memrefs.
5862         if (!useEmulatedMaskMemRefHack(&I) &&
5863             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5864           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5865         // Remember that BB will remain after vectorization.
5866         PredicatedBBsAfterVectorization.insert(BB);
5867       }
5868   }
5869 }
5870 
5871 int LoopVectorizationCostModel::computePredInstDiscount(
5872     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5873     ElementCount VF) {
5874   assert(!isUniformAfterVectorization(PredInst, VF) &&
5875          "Instruction marked uniform-after-vectorization will be predicated");
5876 
5877   // Initialize the discount to zero, meaning that the scalar version and the
5878   // vector version cost the same.
5879   int Discount = 0;
5880 
5881   // Holds instructions to analyze. The instructions we visit are mapped in
5882   // ScalarCosts. Those instructions are the ones that would be scalarized if
5883   // we find that the scalar version costs less.
5884   SmallVector<Instruction *, 8> Worklist;
5885 
5886   // Returns true if the given instruction can be scalarized.
5887   auto canBeScalarized = [&](Instruction *I) -> bool {
5888     // We only attempt to scalarize instructions forming a single-use chain
5889     // from the original predicated block that would otherwise be vectorized.
5890     // Although not strictly necessary, we give up on instructions we know will
5891     // already be scalar to avoid traversing chains that are unlikely to be
5892     // beneficial.
5893     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5894         isScalarAfterVectorization(I, VF))
5895       return false;
5896 
5897     // If the instruction is scalar with predication, it will be analyzed
5898     // separately. We ignore it within the context of PredInst.
5899     if (isScalarWithPredication(I))
5900       return false;
5901 
5902     // If any of the instruction's operands are uniform after vectorization,
5903     // the instruction cannot be scalarized. This prevents, for example, a
5904     // masked load from being scalarized.
5905     //
5906     // We assume we will only emit a value for lane zero of an instruction
5907     // marked uniform after vectorization, rather than VF identical values.
5908     // Thus, if we scalarize an instruction that uses a uniform, we would
5909     // create uses of values corresponding to the lanes we aren't emitting code
5910     // for. This behavior can be changed by allowing getScalarValue to clone
5911     // the lane zero values for uniforms rather than asserting.
5912     for (Use &U : I->operands())
5913       if (auto *J = dyn_cast<Instruction>(U.get()))
5914         if (isUniformAfterVectorization(J, VF))
5915           return false;
5916 
5917     // Otherwise, we can scalarize the instruction.
5918     return true;
5919   };
5920 
5921   // Compute the expected cost discount from scalarizing the entire expression
5922   // feeding the predicated instruction. We currently only consider expressions
5923   // that are single-use instruction chains.
5924   Worklist.push_back(PredInst);
5925   while (!Worklist.empty()) {
5926     Instruction *I = Worklist.pop_back_val();
5927 
5928     // If we've already analyzed the instruction, there's nothing to do.
5929     if (ScalarCosts.find(I) != ScalarCosts.end())
5930       continue;
5931 
5932     // Compute the cost of the vector instruction. Note that this cost already
5933     // includes the scalarization overhead of the predicated instruction.
5934     unsigned VectorCost = getInstructionCost(I, VF).first;
5935 
5936     // Compute the cost of the scalarized instruction. This cost is the cost of
5937     // the instruction as if it wasn't if-converted and instead remained in the
5938     // predicated block. We will scale this cost by block probability after
5939     // computing the scalarization overhead.
5940     assert(!VF.Scalable && "scalable vectors not yet supported.");
5941     unsigned ScalarCost =
5942         VF.Min * getInstructionCost(I, ElementCount::getFixed(1)).first;
5943 
5944     // Compute the scalarization overhead of needed insertelement instructions
5945     // and phi nodes.
5946     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5947       ScalarCost += TTI.getScalarizationOverhead(
5948           cast<VectorType>(ToVectorTy(I->getType(), VF)),
5949           APInt::getAllOnesValue(VF.Min), true, false);
5950       assert(!VF.Scalable && "scalable vectors not yet supported.");
5951       ScalarCost +=
5952           VF.Min *
5953           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
5954     }
5955 
5956     // Compute the scalarization overhead of needed extractelement
5957     // instructions. For each of the instruction's operands, if the operand can
5958     // be scalarized, add it to the worklist; otherwise, account for the
5959     // overhead.
5960     for (Use &U : I->operands())
5961       if (auto *J = dyn_cast<Instruction>(U.get())) {
5962         assert(VectorType::isValidElementType(J->getType()) &&
5963                "Instruction has non-scalar type");
5964         if (canBeScalarized(J))
5965           Worklist.push_back(J);
5966         else if (needsExtract(J, VF)) {
5967           assert(!VF.Scalable && "scalable vectors not yet supported.");
5968           ScalarCost += TTI.getScalarizationOverhead(
5969               cast<VectorType>(ToVectorTy(J->getType(), VF)),
5970               APInt::getAllOnesValue(VF.Min), false, true);
5971         }
5972       }
5973 
5974     // Scale the total scalar cost by block probability.
5975     ScalarCost /= getReciprocalPredBlockProb();
5976 
5977     // Compute the discount. A non-negative discount means the vector version
5978     // of the instruction costs more, and scalarizing would be beneficial.
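    // For example, if the vector cost of I is 8 and its (probability-scaled)
    // scalar cost is 5, the running discount increases by 3, indicating that
    // the scalar form of I is cheaper.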
5979     Discount += VectorCost - ScalarCost;
5980     ScalarCosts[I] = ScalarCost;
5981   }
5982 
5983   return Discount;
5984 }
5985 
5986 LoopVectorizationCostModel::VectorizationCostTy
5987 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5988   assert(!VF.Scalable && "scalable vectors not yet supported.");
5989   VectorizationCostTy Cost;
5990 
5991   // For each block.
5992   for (BasicBlock *BB : TheLoop->blocks()) {
5993     VectorizationCostTy BlockCost;
5994 
5995     // For each instruction in the old loop.
5996     for (Instruction &I : BB->instructionsWithoutDebug()) {
5997       // Skip ignored values.
5998       if (ValuesToIgnore.count(&I) ||
5999           (VF.isVector() && VecValuesToIgnore.count(&I)))
6000         continue;
6001 
6002       VectorizationCostTy C = getInstructionCost(&I, VF);
6003 
6004       // Check if we should override the cost.
6005       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6006         C.first = ForceTargetInstructionCost;
6007 
6008       BlockCost.first += C.first;
6009       BlockCost.second |= C.second;
6010       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6011                         << " for VF " << VF << " For instruction: " << I
6012                         << '\n');
6013     }
6014 
6015     // If we are vectorizing a predicated block, it will have been
6016     // if-converted. This means that the block's instructions (aside from
6017     // stores and instructions that may divide by zero) will now be
6018     // unconditionally executed. For the scalar case, we may not always execute
6019     // the predicated block. Thus, scale the block's cost by the probability of
6020     // executing it.
6021     if (VF.isScalar() && blockNeedsPredication(BB))
6022       BlockCost.first /= getReciprocalPredBlockProb();
6023 
6024     Cost.first += BlockCost.first;
6025     Cost.second |= BlockCost.second;
6026   }
6027 
6028   return Cost;
6029 }
6030 
6031 /// Gets Address Access SCEV after verifying that the access pattern
6032 /// is loop invariant except for the induction variable dependence.
6033 ///
6034 /// This SCEV can be sent to the Target in order to estimate the address
6035 /// calculation cost.
6036 static const SCEV *getAddressAccessSCEV(
6037               Value *Ptr,
6038               LoopVectorizationLegality *Legal,
6039               PredicatedScalarEvolution &PSE,
6040               const Loop *TheLoop) {
6041 
6042   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6043   if (!Gep)
6044     return nullptr;
6045 
6046   // We are looking for a gep with all loop invariant indices except for one
6047   // which should be an induction variable.
6048   auto SE = PSE.getSE();
6049   unsigned NumOperands = Gep->getNumOperands();
6050   for (unsigned i = 1; i < NumOperands; ++i) {
6051     Value *Opd = Gep->getOperand(i);
6052     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6053         !Legal->isInductionVariable(Opd))
6054       return nullptr;
6055   }
6056 
6057   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6058   return PSE.getSCEV(Ptr);
6059 }
6060 
6061 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6062   return Legal->hasStride(I->getOperand(0)) ||
6063          Legal->hasStride(I->getOperand(1));
6064 }
6065 
6066 unsigned
6067 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6068                                                         ElementCount VF) {
6069   assert(VF.isVector() &&
6070          "Scalarization cost of instruction implies vectorization.");
6071   assert(!VF.Scalable && "scalable vectors not yet supported.");
6072   Type *ValTy = getMemInstValueType(I);
6073   auto SE = PSE.getSE();
6074 
6075   unsigned AS = getLoadStoreAddressSpace(I);
6076   Value *Ptr = getLoadStorePointerOperand(I);
6077   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6078 
6079   // Figure out whether the access is strided and get the stride value
6080   // if it's known at compile time.
6081   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6082 
6083   // Get the cost of the scalar memory instruction and address computation.
6084   unsigned Cost = VF.Min * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6085 
6086   // Don't pass *I here, since it is scalar but will actually be part of a
6087   // vectorized loop where its user is a vectorized instruction.
6088   const Align Alignment = getLoadStoreAlignment(I);
6089   Cost += VF.Min *
6090           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6091                               AS, TTI::TCK_RecipThroughput);
6092 
6093   // Get the overhead of the extractelement and insertelement instructions
6094   // we might create due to scalarization.
6095   Cost += getScalarizationOverhead(I, VF);
6096 
6097   // If we have a predicated store, it may not be executed for each vector
6098   // lane. Scale the cost by the probability of executing the predicated
6099   // block.
6100   if (isPredicatedInst(I)) {
6101     Cost /= getReciprocalPredBlockProb();
6102 
6103     if (useEmulatedMaskMemRefHack(I))
6104       // Artificially set the cost to a value high enough to practically
6105       // disable vectorization of such operations.
6106       Cost = 3000000;
6107   }
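       // At this point the cost is roughly VF.Min * (address computation +
       // scalar memory op) + insert/extract overhead, scaled down by the block
       // probability for predicated accesses (or pessimized to a large
       // constant under the emulated-mask memref hack above).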
6108 
6109   return Cost;
6110 }
6111 
6112 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6113                                                              ElementCount VF) {
6114   Type *ValTy = getMemInstValueType(I);
6115   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6116   Value *Ptr = getLoadStorePointerOperand(I);
6117   unsigned AS = getLoadStoreAddressSpace(I);
6118   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6119   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6120 
6121   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6122          "Stride should be 1 or -1 for consecutive memory access");
6123   const Align Alignment = getLoadStoreAlignment(I);
6124   unsigned Cost = 0;
6125   if (Legal->isMaskRequired(I))
6126     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6127                                       CostKind);
6128   else
6129     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6130                                 CostKind, I);
6131 
6132   bool Reverse = ConsecutiveStride < 0;
6133   if (Reverse)
6134     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6135   return Cost;
6136 }
6137 
6138 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6139                                                          ElementCount VF) {
6140   Type *ValTy = getMemInstValueType(I);
6141   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6142   const Align Alignment = getLoadStoreAlignment(I);
6143   unsigned AS = getLoadStoreAddressSpace(I);
6144   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6145   if (isa<LoadInst>(I)) {
6146     return TTI.getAddressComputationCost(ValTy) +
6147            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6148                                CostKind) +
6149            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6150   }
6151   StoreInst *SI = cast<StoreInst>(I);
6152 
6153   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
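       // A uniform store is costed as an address computation plus a scalar
       // store; if the stored value is not loop-invariant, the cost of
       // extracting the last vector lane is added as well.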
6154   return TTI.getAddressComputationCost(ValTy) +
6155          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6156                              CostKind) +
6157          (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
6158                                               Instruction::ExtractElement,
6159                                               VectorTy, VF.Min - 1));
6160 }
6161 
6162 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6163                                                           ElementCount VF) {
6164   Type *ValTy = getMemInstValueType(I);
6165   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6166   const Align Alignment = getLoadStoreAlignment(I);
6167   const Value *Ptr = getLoadStorePointerOperand(I);
6168 
6169   return TTI.getAddressComputationCost(VectorTy) +
6170          TTI.getGatherScatterOpCost(
6171              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6172              TargetTransformInfo::TCK_RecipThroughput, I);
6173 }
6174 
6175 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6176                                                             ElementCount VF) {
6177   Type *ValTy = getMemInstValueType(I);
6178   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6179   unsigned AS = getLoadStoreAddressSpace(I);
6180 
6181   auto Group = getInterleavedAccessGroup(I);
6182   assert(Group && "Failed to get an interleaved access group.");
6183 
6184   unsigned InterleaveFactor = Group->getFactor();
6185   assert(!VF.Scalable && "scalable vectors not yet supported.");
6186   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
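       // For example, a factor-2 group at VF = 4 is modelled as a single wide
       // operation on an <8 x ValTy> vector; the target's interleaved-memory
       // cost hook is expected to account for the (de)interleaving shuffles.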
6187 
6188   // Holds the indices of existing members in an interleaved load group.
6189   // An interleaved store group doesn't need this as it doesn't allow gaps.
6190   SmallVector<unsigned, 4> Indices;
6191   if (isa<LoadInst>(I)) {
6192     for (unsigned i = 0; i < InterleaveFactor; i++)
6193       if (Group->getMember(i))
6194         Indices.push_back(i);
6195   }
6196 
6197   // Calculate the cost of the whole interleaved group.
6198   bool UseMaskForGaps =
6199       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6200   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6201       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6202       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6203 
6204   if (Group->isReverse()) {
6205     // TODO: Add support for reversed masked interleaved access.
6206     assert(!Legal->isMaskRequired(I) &&
6207            "Reverse masked interleaved access not supported.");
6208     Cost += Group->getNumMembers() *
6209             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6210   }
6211   return Cost;
6212 }
6213 
6214 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6215                                                               ElementCount VF) {
6216   // Calculate scalar cost only. Vectorization cost should be ready at this
6217   // moment.
6218   if (VF.isScalar()) {
6219     Type *ValTy = getMemInstValueType(I);
6220     const Align Alignment = getLoadStoreAlignment(I);
6221     unsigned AS = getLoadStoreAddressSpace(I);
6222 
6223     return TTI.getAddressComputationCost(ValTy) +
6224            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6225                                TTI::TCK_RecipThroughput, I);
6226   }
6227   return getWideningCost(I, VF);
6228 }
6229 
6230 LoopVectorizationCostModel::VectorizationCostTy
6231 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6232                                                ElementCount VF) {
6233   assert(!VF.Scalable &&
6234          "the cost model is not yet implemented for scalable vectorization");
6235   // If we know that this instruction will remain uniform, check the cost of
6236   // the scalar version.
6237   if (isUniformAfterVectorization(I, VF))
6238     VF = ElementCount::getFixed(1);
6239 
6240   if (VF.isVector() && isProfitableToScalarize(I, VF))
6241     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6242 
6243   // Forced scalars do not have any scalarization overhead.
6244   auto ForcedScalar = ForcedScalars.find(VF);
6245   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6246     auto InstSet = ForcedScalar->second;
6247     if (InstSet.count(I))
6248       return VectorizationCostTy(
6249           (getInstructionCost(I, ElementCount::getFixed(1)).first * VF.Min),
6250           false);
6251   }
6252 
6253   Type *VectorTy;
6254   unsigned C = getInstructionCost(I, VF, VectorTy);
6255 
6256   bool TypeNotScalarized = VF.isVector() && VectorTy->isVectorTy() &&
6257                            TTI.getNumberOfParts(VectorTy) < VF.Min;
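       // The returned flag records whether the type is really vectorized: it is
       // true only for a vector VF whose vector type the target legalizes into
       // fewer than VF.Min parts.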
6258   return VectorizationCostTy(C, TypeNotScalarized);
6259 }
6260 
6261 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6262                                                               ElementCount VF) {
6263 
6264   assert(!VF.Scalable &&
6265          "cannot compute scalarization overhead for scalable vectorization");
6266   if (VF.isScalar())
6267     return 0;
6268 
6269   unsigned Cost = 0;
6270   Type *RetTy = ToVectorTy(I->getType(), VF);
6271   if (!RetTy->isVoidTy() &&
6272       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6273     Cost += TTI.getScalarizationOverhead(
6274         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.Min), true, false);
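       // The overhead so far covers inserting the VF.Min scalar results back
       // into a vector; the remainder of this function accounts for extracting
       // the operands that still need to be pulled out of vectors.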
6275 
6276   // Some targets keep addresses scalar.
6277   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6278     return Cost;
6279 
6280   // Some targets support efficient element stores.
6281   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6282     return Cost;
6283 
6284   // Collect operands to consider.
6285   CallInst *CI = dyn_cast<CallInst>(I);
6286   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6287 
6288   // Skip operands that do not require extraction/scalarization and do not incur
6289   // any overhead.
6290   return Cost +
6291          TTI.getOperandsScalarizationOverhead(filterExtractingOperands(Ops, VF),
6292                                               VF.Min);
6293 }
6294 
6295 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6296   assert(!VF.Scalable && "scalable vectors not yet supported.");
6297   if (VF.isScalar())
6298     return;
6299   NumPredStores = 0;
6300   for (BasicBlock *BB : TheLoop->blocks()) {
6301     // For each instruction in the old loop.
6302     for (Instruction &I : *BB) {
6303       Value *Ptr = getLoadStorePointerOperand(&I);
6304       if (!Ptr)
6305         continue;
6306 
6307       // TODO: We should generate better code and update the cost model for
6308       // predicated uniform stores. Today they are treated as any other
6309       // predicated store (see added test cases in
6310       // invariant-store-vectorization.ll).
6311       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6312         NumPredStores++;
6313 
6314       if (Legal->isUniform(Ptr) &&
6315           // Conditional loads and stores should be scalarized and predicated.
6316           // isScalarWithPredication cannot be used here since masked
6317           // gather/scatters are not considered scalar with predication.
6318           !Legal->blockNeedsPredication(I.getParent())) {
6319         // TODO: Avoid replicating loads and stores instead of
6320         // relying on instcombine to remove them.
6321         // Load: Scalar load + broadcast
6322         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6323         unsigned Cost = getUniformMemOpCost(&I, VF);
6324         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6325         continue;
6326       }
6327 
6328       // We assume that widening is the best solution when possible.
6329       if (memoryInstructionCanBeWidened(&I, VF)) {
6330         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6331         int ConsecutiveStride =
6332                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6333         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6334                "Expected consecutive stride.");
6335         InstWidening Decision =
6336             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6337         setWideningDecision(&I, VF, Decision, Cost);
6338         continue;
6339       }
6340 
6341       // Choose between Interleaving, Gather/Scatter or Scalarization.
6342       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6343       unsigned NumAccesses = 1;
6344       if (isAccessInterleaved(&I)) {
6345         auto Group = getInterleavedAccessGroup(&I);
6346         assert(Group && "Failed to get an interleaved access group.");
6347 
6348         // Make one decision for the whole group.
6349         if (getWideningDecision(&I, VF) != CM_Unknown)
6350           continue;
6351 
6352         NumAccesses = Group->getNumMembers();
6353         if (interleavedAccessCanBeWidened(&I, VF))
6354           InterleaveCost = getInterleaveGroupCost(&I, VF);
6355       }
6356 
6357       unsigned GatherScatterCost =
6358           isLegalGatherOrScatter(&I)
6359               ? getGatherScatterCost(&I, VF) * NumAccesses
6360               : std::numeric_limits<unsigned>::max();
6361 
6362       unsigned ScalarizationCost =
6363           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6364 
6365       // Choose better solution for the current VF,
6366       // write down this decision and use it during vectorization.
6367       unsigned Cost;
6368       InstWidening Decision;
6369       if (InterleaveCost <= GatherScatterCost &&
6370           InterleaveCost < ScalarizationCost) {
6371         Decision = CM_Interleave;
6372         Cost = InterleaveCost;
6373       } else if (GatherScatterCost < ScalarizationCost) {
6374         Decision = CM_GatherScatter;
6375         Cost = GatherScatterCost;
6376       } else {
6377         Decision = CM_Scalarize;
6378         Cost = ScalarizationCost;
6379       }
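           // Note the tie-breaking: an interleave group is preferred over an
           // equally expensive gather/scatter, while scalarization wins ties
           // against both of the other options.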
6380       // If the instruction belongs to an interleave group, the whole group
6381       // receives the same decision. The whole group receives the cost, but
6382       // the cost will actually be assigned to one instruction.
6383       if (auto Group = getInterleavedAccessGroup(&I))
6384         setWideningDecision(Group, VF, Decision, Cost);
6385       else
6386         setWideningDecision(&I, VF, Decision, Cost);
6387     }
6388   }
6389 
6390   // Make sure that any load of an address and any other address computation
6391   // remains scalar unless there is gather/scatter support. This avoids
6392   // inevitable extracts into address registers, and also has the benefit of
6393   // activating LSR more, since that pass can't optimize vectorized
6394   // addresses.
6395   if (TTI.prefersVectorizedAddressing())
6396     return;
6397 
6398   // Start with all scalar pointer uses.
6399   SmallPtrSet<Instruction *, 8> AddrDefs;
6400   for (BasicBlock *BB : TheLoop->blocks())
6401     for (Instruction &I : *BB) {
6402       Instruction *PtrDef =
6403         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6404       if (PtrDef && TheLoop->contains(PtrDef) &&
6405           getWideningDecision(&I, VF) != CM_GatherScatter)
6406         AddrDefs.insert(PtrDef);
6407     }
6408 
6409   // Add all instructions used to generate the addresses.
6410   SmallVector<Instruction *, 4> Worklist;
6411   for (auto *I : AddrDefs)
6412     Worklist.push_back(I);
6413   while (!Worklist.empty()) {
6414     Instruction *I = Worklist.pop_back_val();
6415     for (auto &Op : I->operands())
6416       if (auto *InstOp = dyn_cast<Instruction>(Op))
6417         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6418             AddrDefs.insert(InstOp).second)
6419           Worklist.push_back(InstOp);
6420   }
6421 
6422   for (auto *I : AddrDefs) {
6423     if (isa<LoadInst>(I)) {
6424       // Setting the desired widening decision should ideally be handled by
6425       // the cost functions, but since this involves finding out whether the
6426       // loaded register is involved in an address computation, it is
6427       // instead changed here when we know this is the case.
6428       InstWidening Decision = getWideningDecision(I, VF);
6429       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6430         // Scalarize a widened load of address.
6431         setWideningDecision(
6432             I, VF, CM_Scalarize,
6433             (VF.Min * getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6434       else if (auto Group = getInterleavedAccessGroup(I)) {
6435         // Scalarize an interleave group of address loads.
6436         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6437           if (Instruction *Member = Group->getMember(I))
6438             setWideningDecision(
6439                 Member, VF, CM_Scalarize,
6440                 (VF.Min *
6441                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6442         }
6443       }
6444     } else
6445       // Make sure I gets scalarized and receives a cost estimate without
6446       // scalarization overhead.
6447       ForcedScalars[VF].insert(I);
6448   }
6449 }
6450 
6451 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6452                                                         ElementCount VF,
6453                                                         Type *&VectorTy) {
6454   Type *RetTy = I->getType();
6455   if (canTruncateToMinimalBitwidth(I, VF))
6456     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6457   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6458   auto SE = PSE.getSE();
6459   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6460 
6461   // TODO: We need to estimate the cost of intrinsic calls.
6462   switch (I->getOpcode()) {
6463   case Instruction::GetElementPtr:
6464     // We mark this instruction as zero-cost because the cost of GEPs in
6465     // vectorized code depends on whether the corresponding memory instruction
6466     // is scalarized or not. Therefore, we handle GEPs with the memory
6467     // instruction cost.
6468     return 0;
6469   case Instruction::Br: {
6470     // In cases of scalarized and predicated instructions, there will be VF
6471     // predicated blocks in the vectorized loop. Each branch around these
6472     // blocks also requires an extract of its vector compare i1 element.
6473     bool ScalarPredicatedBB = false;
6474     BranchInst *BI = cast<BranchInst>(I);
6475     if (VF.isVector() && BI->isConditional() &&
6476         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6477          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6478       ScalarPredicatedBB = true;
6479 
6480     if (ScalarPredicatedBB) {
6481       // Return cost for branches around scalarized and predicated blocks.
6482       assert(!VF.Scalable && "scalable vectors not yet supported.");
6483       auto *Vec_i1Ty =
6484           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6485       return (TTI.getScalarizationOverhead(
6486                   Vec_i1Ty, APInt::getAllOnesValue(VF.Min), false, true) +
6487               (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.Min));
6488     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6489       // The back-edge branch will remain, as will all scalar branches.
6490       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6491     else
6492       // This branch will be eliminated by if-conversion.
6493       return 0;
6494     // Note: We currently assume zero cost for an unconditional branch inside
6495     // a predicated block since it will become a fall-through, although we
6496     // may decide in the future to call TTI for all branches.
6497   }
6498   case Instruction::PHI: {
6499     auto *Phi = cast<PHINode>(I);
6500 
6501     // First-order recurrences are replaced by vector shuffles inside the loop.
6502     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6503     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6504       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6505                                 cast<VectorType>(VectorTy), VF.Min - 1,
6506                                 FixedVectorType::get(RetTy, 1));
6507 
6508     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6509     // converted into select instructions. We require N - 1 selects per phi
6510     // node, where N is the number of incoming values.
6511     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6512       return (Phi->getNumIncomingValues() - 1) *
6513              TTI.getCmpSelInstrCost(
6514                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6515                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6516                  CostKind);
6517 
6518     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6519   }
6520   case Instruction::UDiv:
6521   case Instruction::SDiv:
6522   case Instruction::URem:
6523   case Instruction::SRem:
6524     // If we have a predicated instruction, it may not be executed for each
6525     // vector lane. Get the scalarization cost and scale this amount by the
6526     // probability of executing the predicated block. If the instruction is not
6527     // predicated, we fall through to the next case.
6528     if (VF.isVector() && isScalarWithPredication(I)) {
6529       unsigned Cost = 0;
6530 
6531       // These instructions have a non-void type, so account for the phi nodes
6532       // that we will create. This cost is likely to be zero. The phi node
6533       // cost, if any, should be scaled by the block probability because it
6534       // models a copy at the end of each predicated block.
6535       Cost += VF.Min * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6536 
6537       // The cost of the non-predicated instruction.
6538       Cost +=
6539           VF.Min * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6540 
6541       // The cost of insertelement and extractelement instructions needed for
6542       // scalarization.
6543       Cost += getScalarizationOverhead(I, VF);
6544 
6545       // Scale the cost by the probability of executing the predicated blocks.
6546       // This assumes the predicated block for each vector lane is equally
6547       // likely.
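           // For example, with VF = 4 and the current 50% block-probability
           // estimate, the result is (4 * phi + 4 * div + insert/extract
           // overhead) / 2.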
6548       return Cost / getReciprocalPredBlockProb();
6549     }
6550     LLVM_FALLTHROUGH;
6551   case Instruction::Add:
6552   case Instruction::FAdd:
6553   case Instruction::Sub:
6554   case Instruction::FSub:
6555   case Instruction::Mul:
6556   case Instruction::FMul:
6557   case Instruction::FDiv:
6558   case Instruction::FRem:
6559   case Instruction::Shl:
6560   case Instruction::LShr:
6561   case Instruction::AShr:
6562   case Instruction::And:
6563   case Instruction::Or:
6564   case Instruction::Xor: {
6565     // Since we replace the stride by 1, the multiplication should go away.
6566     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6567       return 0;
6568     // Certain instructions can be cheaper to vectorize if they have a constant
6569     // second vector operand. One example of this is shifts on x86.
6570     Value *Op2 = I->getOperand(1);
6571     TargetTransformInfo::OperandValueProperties Op2VP;
6572     TargetTransformInfo::OperandValueKind Op2VK =
6573         TTI.getOperandInfo(Op2, Op2VP);
6574     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6575       Op2VK = TargetTransformInfo::OK_UniformValue;
6576 
6577     SmallVector<const Value *, 4> Operands(I->operand_values());
6578     unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1;
6579     return N * TTI.getArithmeticInstrCost(
6580                    I->getOpcode(), VectorTy, CostKind,
6581                    TargetTransformInfo::OK_AnyValue,
6582                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6583   }
6584   case Instruction::FNeg: {
6585     assert(!VF.Scalable && "VF is assumed to be non scalable.");
6586     unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1;
6587     return N * TTI.getArithmeticInstrCost(
6588                    I->getOpcode(), VectorTy, CostKind,
6589                    TargetTransformInfo::OK_AnyValue,
6590                    TargetTransformInfo::OK_AnyValue,
6591                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6592                    I->getOperand(0), I);
6593   }
6594   case Instruction::Select: {
6595     SelectInst *SI = cast<SelectInst>(I);
6596     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6597     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6598     Type *CondTy = SI->getCondition()->getType();
6599     if (!ScalarCond) {
6600       assert(!VF.Scalable && "VF is assumed to be non scalable.");
6601       CondTy = VectorType::get(CondTy, VF);
6602     }
6603     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6604                                   CostKind, I);
6605   }
6606   case Instruction::ICmp:
6607   case Instruction::FCmp: {
6608     Type *ValTy = I->getOperand(0)->getType();
6609     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6610     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6611       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6612     VectorTy = ToVectorTy(ValTy, VF);
6613     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6614                                   I);
6615   }
6616   case Instruction::Store:
6617   case Instruction::Load: {
6618     ElementCount Width = VF;
6619     if (Width.isVector()) {
6620       InstWidening Decision = getWideningDecision(I, Width);
6621       assert(Decision != CM_Unknown &&
6622              "CM decision should be taken at this point");
6623       if (Decision == CM_Scalarize)
6624         Width = ElementCount::getFixed(1);
6625     }
6626     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6627     return getMemoryInstructionCost(I, VF);
6628   }
6629   case Instruction::ZExt:
6630   case Instruction::SExt:
6631   case Instruction::FPToUI:
6632   case Instruction::FPToSI:
6633   case Instruction::FPExt:
6634   case Instruction::PtrToInt:
6635   case Instruction::IntToPtr:
6636   case Instruction::SIToFP:
6637   case Instruction::UIToFP:
6638   case Instruction::Trunc:
6639   case Instruction::FPTrunc:
6640   case Instruction::BitCast: {
6641     // Computes the CastContextHint from a Load/Store instruction.
6642     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6643       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6644              "Expected a load or a store!");
6645 
6646       if (VF.isScalar() || !TheLoop->contains(I))
6647         return TTI::CastContextHint::Normal;
6648 
6649       switch (getWideningDecision(I, VF)) {
6650       case LoopVectorizationCostModel::CM_GatherScatter:
6651         return TTI::CastContextHint::GatherScatter;
6652       case LoopVectorizationCostModel::CM_Interleave:
6653         return TTI::CastContextHint::Interleave;
6654       case LoopVectorizationCostModel::CM_Scalarize:
6655       case LoopVectorizationCostModel::CM_Widen:
6656         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6657                                         : TTI::CastContextHint::Normal;
6658       case LoopVectorizationCostModel::CM_Widen_Reverse:
6659         return TTI::CastContextHint::Reversed;
6660       case LoopVectorizationCostModel::CM_Unknown:
6661         llvm_unreachable("Instr did not go through cost modelling?");
6662       }
6663 
6664       llvm_unreachable("Unhandled case!");
6665     };
6666 
6667     unsigned Opcode = I->getOpcode();
6668     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6669     // For Trunc, the context is the only user, which must be a StoreInst.
6670     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6671       if (I->hasOneUse())
6672         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6673           CCH = ComputeCCH(Store);
6674     }
6675     // For Z/Sext, the context is the operand, which must be a LoadInst.
6676     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6677              Opcode == Instruction::FPExt) {
6678       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6679         CCH = ComputeCCH(Load);
6680     }
6681 
6682     // We optimize the truncation of induction variables having constant
6683     // integer steps. The cost of these truncations is the same as the scalar
6684     // operation.
6685     if (isOptimizableIVTruncate(I, VF)) {
6686       auto *Trunc = cast<TruncInst>(I);
6687       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6688                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6689     }
6690 
6691     Type *SrcScalarTy = I->getOperand(0)->getType();
6692     Type *SrcVecTy =
6693         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6694     if (canTruncateToMinimalBitwidth(I, VF)) {
6695       // This cast is going to be shrunk. This may remove the cast or it might
6696       // turn it into a slightly different cast. For example, if MinBW == 16,
6697       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6698       //
6699       // Calculate the modified src and dest types.
6700       Type *MinVecTy = VectorTy;
6701       if (Opcode == Instruction::Trunc) {
6702         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6703         VectorTy =
6704             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6705       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6706         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6707         VectorTy =
6708             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6709       }
6710     }
6711 
6712     assert(!VF.Scalable && "VF is assumed to be non scalable");
6713     unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1;
6714     return N *
6715            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6716   }
6717   case Instruction::Call: {
6718     bool NeedToScalarize;
6719     CallInst *CI = cast<CallInst>(I);
6720     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6721     if (getVectorIntrinsicIDForCall(CI, TLI))
6722       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6723     return CallCost;
6724   }
6725   default:
6726     // The cost of executing VF copies of the scalar instruction. This opcode
6727     // is unknown. Assume that it is the same as 'mul'.
6728     return VF.Min *
6729                TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
6730                                           CostKind) +
6731            getScalarizationOverhead(I, VF);
6732   } // end of switch.
6733 }
6734 
6735 char LoopVectorize::ID = 0;
6736 
6737 static const char lv_name[] = "Loop Vectorization";
6738 
6739 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6740 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6741 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6742 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6743 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6744 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6745 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6746 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6747 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6748 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6749 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6750 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6751 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6752 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6753 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6754 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6755 
6756 namespace llvm {
6757 
6758 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6759 
6760 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6761                               bool VectorizeOnlyWhenForced) {
6762   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6763 }
6764 
6765 } // end namespace llvm
6766 
6767 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6768   // Check if the pointer operand of a load or store instruction is
6769   // consecutive.
6770   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6771     return Legal->isConsecutivePtr(Ptr);
6772   return false;
6773 }
6774 
6775 void LoopVectorizationCostModel::collectValuesToIgnore() {
6776   // Ignore ephemeral values.
6777   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6778 
6779   // Ignore type-promoting instructions we identified during reduction
6780   // detection.
6781   for (auto &Reduction : Legal->getReductionVars()) {
6782     RecurrenceDescriptor &RedDes = Reduction.second;
6783     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6784     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6785   }
6786   // Ignore type-casting instructions we identified during induction
6787   // detection.
6788   for (auto &Induction : Legal->getInductionVars()) {
6789     InductionDescriptor &IndDes = Induction.second;
6790     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6791     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6792   }
6793 }
6794 
6795 void LoopVectorizationCostModel::collectInLoopReductions() {
6796   // For the moment, without predicated reduction instructions, we do not
6797   // support inloop reductions whilst folding the tail, and hence in those cases
6798   // all reductions are currently out of the loop.
6799   if (!PreferInLoopReductions || foldTailByMasking())
6800     return;
6801 
6802   for (auto &Reduction : Legal->getReductionVars()) {
6803     PHINode *Phi = Reduction.first;
6804     RecurrenceDescriptor &RdxDesc = Reduction.second;
6805 
6806     // We don't collect reductions that are type promoted (yet).
6807     if (RdxDesc.getRecurrenceType() != Phi->getType())
6808       continue;
6809 
6810     // Check that we can correctly put the reductions into the loop, by
6811     // finding the chain of operations that leads from the phi to the loop
6812     // exit value.
6813     SmallVector<Instruction *, 4> ReductionOperations =
6814         RdxDesc.getReductionOpChain(Phi, TheLoop);
6815     bool InLoop = !ReductionOperations.empty();
6816     if (InLoop)
6817       InLoopReductionChains[Phi] = ReductionOperations;
6818     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6819                       << " reduction for phi: " << *Phi << "\n");
6820   }
6821 }
6822 
6823 // TODO: we could return a pair of values that specify the max VF and
6824 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6825 // `buildVPlans(VF, VF)`. We cannot do that because VPlan at the moment
6826 // doesn't have a cost model that can choose which plan to execute if
6827 // more than one is generated.
6828 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6829                                  LoopVectorizationCostModel &CM) {
6830   unsigned WidestType;
6831   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
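       // E.g. 256-bit wide vector registers and a widest scalar type of i32
       // give a VPlan VF of 256 / 32 = 8.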
6832   return WidestVectorRegBits / WidestType;
6833 }
6834 
6835 VectorizationFactor
6836 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6837   assert(!UserVF.Scalable && "scalable vectors not yet supported");
6838   ElementCount VF = UserVF;
6839   // Outer loop handling: outer loops may require CFG and instruction level
6840   // transformations before even evaluating whether vectorization is profitable.
6841   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6842   // the vectorization pipeline.
6843   if (!OrigLoop->empty()) {
6844     // If the user doesn't provide a vectorization factor, determine a
6845     // reasonable one.
6846     if (UserVF.isZero()) {
6847       VF = ElementCount::getFixed(
6848           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
6849       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6850 
6851       // Make sure we have a VF > 1 for stress testing.
6852       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6853         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6854                           << "overriding computed VF.\n");
6855         VF = ElementCount::getFixed(4);
6856       }
6857     }
6858     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6859     assert(isPowerOf2_32(VF.Min) && "VF needs to be a power of two");
6860     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6861                       << "VF " << VF << " to build VPlans.\n");
6862     buildVPlans(VF.Min, VF.Min);
6863 
6864     // For VPlan build stress testing, we bail out after VPlan construction.
6865     if (VPlanBuildStressTest)
6866       return VectorizationFactor::Disabled();
6867 
6868     return {VF, 0 /*Cost*/};
6869   }
6870 
6871   LLVM_DEBUG(
6872       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6873                 "VPlan-native path.\n");
6874   return VectorizationFactor::Disabled();
6875 }
6876 
6877 Optional<VectorizationFactor>
6878 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6879   assert(!UserVF.Scalable && "scalable vectorization not yet handled");
6880   assert(OrigLoop->empty() && "Inner loop expected.");
6881   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF.Min, UserIC);
6882   if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6883     return None;
6884 
6885   // Invalidate interleave groups if all blocks of loop will be predicated.
6886   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6887       !useMaskedInterleavedAccesses(*TTI)) {
6888     LLVM_DEBUG(
6889         dbgs()
6890         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6891            "which requires masked-interleaved support.\n");
6892     if (CM.InterleaveInfo.invalidateGroups())
6893       // Invalidating interleave groups also requires invalidating all decisions
6894       // based on them, which includes widening decisions and uniform and scalar
6895       // values.
6896       CM.invalidateCostModelingDecisions();
6897   }
6898 
6899   if (!UserVF.isZero()) {
6900     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6901     assert(isPowerOf2_32(UserVF.Min) && "VF needs to be a power of two");
6902     // Collect the instructions (and their associated costs) that will be more
6903     // profitable to scalarize.
6904     CM.selectUserVectorizationFactor(UserVF);
6905     CM.collectInLoopReductions();
6906     buildVPlansWithVPRecipes(UserVF.Min, UserVF.Min);
6907     LLVM_DEBUG(printPlans(dbgs()));
6908     return {{UserVF, 0}};
6909   }
6910 
6911   unsigned MaxVF = MaybeMaxVF.getValue();
6912   assert(MaxVF != 0 && "MaxVF is zero.");
6913 
6914   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6915     // Collect Uniform and Scalar instructions after vectorization with VF.
6916     CM.collectUniformsAndScalars(ElementCount::getFixed(VF));
6917 
6918     // Collect the instructions (and their associated costs) that will be more
6919     // profitable to scalarize.
6920     if (VF > 1)
6921       CM.collectInstsToScalarize(ElementCount::getFixed(VF));
6922   }
6923 
6924   CM.collectInLoopReductions();
6925 
6926   buildVPlansWithVPRecipes(1, MaxVF);
6927   LLVM_DEBUG(printPlans(dbgs()));
6928   if (MaxVF == 1)
6929     return VectorizationFactor::Disabled();
6930 
6931   // Select the optimal vectorization factor.
6932   return CM.selectVectorizationFactor(MaxVF);
6933 }
6934 
6935 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
6936   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6937                     << '\n');
6938   BestVF = VF;
6939   BestUF = UF;
6940 
6941   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6942     return !Plan->hasVF(VF);
6943   });
6944   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6945 }
6946 
6947 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6948                                            DominatorTree *DT) {
6949   // Perform the actual loop transformation.
6950 
6951   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6952   VPCallbackILV CallbackILV(ILV);
6953 
6954   assert(BestVF.hasValue() && "Vectorization Factor is missing");
6955 
6956   VPTransformState State{*BestVF, BestUF,      LI,
6957                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
6958                          &ILV,    CallbackILV};
6959   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6960   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6961   State.CanonicalIV = ILV.Induction;
6962 
6963   //===------------------------------------------------===//
6964   //
6965   // Notice: any optimization or new instruction that goes
6966   // into the code below should also be implemented in
6967   // the cost model.
6968   //
6969   //===------------------------------------------------===//
6970 
6971   // 2. Copy and widen instructions from the old loop into the new loop.
6972   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6973   VPlans.front()->execute(&State);
6974 
6975   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6976   //    predication, updating analyses.
6977   ILV.fixVectorizedLoop();
6978 }
6979 
6980 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6981     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6982   BasicBlock *Latch = OrigLoop->getLoopLatch();
6983 
6984   // We create new control-flow for the vectorized loop, so the original
6985   // condition will be dead after vectorization if it's only used by the
6986   // branch.
6987   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6988   if (Cmp && Cmp->hasOneUse())
6989     DeadInstructions.insert(Cmp);
6990 
6991   // We create new "steps" for induction variable updates to which the original
6992   // induction variables map. An original update instruction will be dead if
6993   // all its users except the induction variable are dead.
6994   for (auto &Induction : Legal->getInductionVars()) {
6995     PHINode *Ind = Induction.first;
6996     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6997     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6998           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
6999         }))
7000       DeadInstructions.insert(IndUpdate);
7001 
7002     // We also record as "Dead" the type-casting instructions we had identified
7003     // during induction analysis. We don't need any handling for them in the
7004     // vectorized loop because we have proven that, under a proper runtime
7005     // test guarding the vectorized loop, the value of the phi, and the casted
7006     // value of the phi, are the same. The last instruction in this casting chain
7007     // will get its scalar/vector/widened def from the scalar/vector/widened def
7008     // of the respective phi node. Any other casts in the induction def-use chain
7009     // have no other uses outside the phi update chain, and will be ignored.
7010     InductionDescriptor &IndDes = Induction.second;
7011     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7012     DeadInstructions.insert(Casts.begin(), Casts.end());
7013   }
7014 }
7015 
7016 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7017 
7018 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7019 
7020 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7021                                         Instruction::BinaryOps BinOp) {
7022   // When unrolling and the VF is 1, we only need to add a simple scalar.
7023   Type *Ty = Val->getType();
7024   assert(!Ty->isVectorTy() && "Val must be a scalar");
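       // Each unrolled part therefore computes Val + StartIdx * Step (or, for
       // floating-point inductions, Val BinOp StartIdx * Step using fast-math
       // flags); there are no vector lanes to materialize at VF = 1.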
7025 
7026   if (Ty->isFloatingPointTy()) {
7027     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7028 
7029     // Floating point operations had to be 'fast' to enable the unrolling.
7030     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7031     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7032   }
7033   Constant *C = ConstantInt::get(Ty, StartIdx);
7034   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7035 }
7036 
7037 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7038   SmallVector<Metadata *, 4> MDs;
7039   // Reserve first location for self reference to the LoopID metadata node.
7040   MDs.push_back(nullptr);
7041   bool IsUnrollMetadata = false;
7042   MDNode *LoopID = L->getLoopID();
7043   if (LoopID) {
7044     // First find existing loop unrolling disable metadata.
7045     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7046       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7047       if (MD) {
7048         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7049         IsUnrollMetadata =
7050             S && S->getString().startswith("llvm.loop.unroll.disable");
7051       }
7052       MDs.push_back(LoopID->getOperand(i));
7053     }
7054   }
7055 
7056   if (!IsUnrollMetadata) {
7057     // Add runtime unroll disable metadata.
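         // The resulting loop ID looks roughly like:
         //   !0 = !{!0, <existing operands>, !1}
         //   !1 = !{!"llvm.loop.unroll.runtime.disable"}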
7058     LLVMContext &Context = L->getHeader()->getContext();
7059     SmallVector<Metadata *, 1> DisableOperands;
7060     DisableOperands.push_back(
7061         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7062     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7063     MDs.push_back(DisableNode);
7064     MDNode *NewLoopID = MDNode::get(Context, MDs);
7065     // Set operand 0 to refer to the loop id itself.
7066     NewLoopID->replaceOperandWith(0, NewLoopID);
7067     L->setLoopID(NewLoopID);
7068   }
7069 }
7070 
7071 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7072     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7073   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
7074   bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start));
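       // For example, for Range = {2, 16} and a predicate that holds for VF = 2
       // and VF = 4 but not for VF = 8, the loop below clamps Range.End to 8
       // and the function returns true (the predicate's value at Range.Start).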
7075 
7076   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
7077     if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) {
7078       Range.End = TmpVF;
7079       break;
7080     }
7081 
7082   return PredicateAtRangeStart;
7083 }
7084 
7085 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7086 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7087 /// of VF's starting at a given VF and extending it as much as possible. Each
7088 /// vectorization decision can potentially shorten this sub-range during
7089 /// buildVPlan().
7090 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
7091   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7092     VFRange SubRange = {VF, MaxVF + 1};
7093     VPlans.push_back(buildVPlan(SubRange));
7094     VF = SubRange.End;
7095   }
7096 }
7097 
7098 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7099                                          VPlanPtr &Plan) {
7100   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7101 
7102   // Look for cached value.
7103   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7104   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7105   if (ECEntryIt != EdgeMaskCache.end())
7106     return ECEntryIt->second;
7107 
7108   VPValue *SrcMask = createBlockInMask(Src, Plan);
7109 
7110   // The terminator has to be a branch inst!
7111   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7112   assert(BI && "Unexpected terminator found");
7113 
7114   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7115     return EdgeMaskCache[Edge] = SrcMask;
7116 
7117   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
7118   assert(EdgeMask && "No Edge Mask found for condition");
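       // The edge mask is the branch condition (negated when Dst is not the
       // first successor), ANDed with the source block's mask unless that mask
       // is all-one (represented by nullptr).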
7119 
7120   if (BI->getSuccessor(0) != Dst)
7121     EdgeMask = Builder.createNot(EdgeMask);
7122 
7123   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7124     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7125 
7126   return EdgeMaskCache[Edge] = EdgeMask;
7127 }
7128 
7129 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7130   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7131 
7132   // Look for cached value.
7133   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7134   if (BCEntryIt != BlockMaskCache.end())
7135     return BCEntryIt->second;
7136 
7137   // All-one mask is modelled as no-mask following the convention for masked
7138   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7139   VPValue *BlockMask = nullptr;
7140 
7141   if (OrigLoop->getHeader() == BB) {
7142     if (!CM.blockNeedsPredication(BB))
7143       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7144 
7145     // Introduce the early-exit compare IV <= BTC to form header block mask.
7146     // This is used instead of IV < TC because TC may wrap, unlike BTC.
7147     // Start by constructing the desired canonical IV.
7148     VPValue *IV = nullptr;
7149     if (Legal->getPrimaryInduction())
7150       IV = Plan->getVPValue(Legal->getPrimaryInduction());
7151     else {
7152       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7153       Builder.getInsertBlock()->appendRecipe(IVRecipe);
7154       IV = IVRecipe->getVPValue();
7155     }
7156     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7157     bool TailFolded = !CM.isScalarEpilogueAllowed();
7158 
7159     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7160       // While ActiveLaneMask is a binary op that consumes the loop tripcount
7161       // as a second argument, we only pass the IV here and extract the
7162       // tripcount from the transform state where codegen of the VP
7163       // instructions happens.
7164       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7165     } else {
7166       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7167     }
7168     return BlockMaskCache[BB] = BlockMask;
7169   }
7170 
7171   // This is the block mask. We OR all incoming edges.
7172   for (auto *Predecessor : predecessors(BB)) {
7173     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7174     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7175       return BlockMaskCache[BB] = EdgeMask;
7176 
7177     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7178       BlockMask = EdgeMask;
7179       continue;
7180     }
7181 
7182     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7183   }
7184 
7185   return BlockMaskCache[BB] = BlockMask;
7186 }
7187 
7188 VPWidenMemoryInstructionRecipe *
7189 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7190                                   VPlanPtr &Plan) {
7191   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7192          "Must be called with either a load or store");
7193 
7194   auto willWiden = [&](ElementCount VF) -> bool {
7195     assert(!VF.Scalable && "unexpected scalable ElementCount");
7196     if (VF.isScalar())
7197       return false;
7198     LoopVectorizationCostModel::InstWidening Decision =
7199         CM.getWideningDecision(I, VF);
7200     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7201            "CM decision should be taken at this point.");
7202     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7203       return true;
7204     if (CM.isScalarAfterVectorization(I, VF) ||
7205         CM.isProfitableToScalarize(I, VF))
7206       return false;
7207     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7208   };
7209 
7210   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7211     return nullptr;
7212 
7213   VPValue *Mask = nullptr;
7214   if (Legal->isMaskRequired(I))
7215     Mask = createBlockInMask(I->getParent(), Plan);
7216 
7217   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7218   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7219     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7220 
7221   StoreInst *Store = cast<StoreInst>(I);
7222   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7223   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7224 }
7225 
7226 VPWidenIntOrFpInductionRecipe *
7227 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7228   // Check if this is an integer or fp induction. If so, build the recipe that
7229   // produces its scalar and vector values.
7230   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7231   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7232       II.getKind() == InductionDescriptor::IK_FpInduction)
7233     return new VPWidenIntOrFpInductionRecipe(Phi);
7234 
7235   return nullptr;
7236 }
7237 
7238 VPWidenIntOrFpInductionRecipe *
7239 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7240                                                 VFRange &Range) const {
7241   // Optimize the special case where the source is a constant integer
7242   // induction variable. Notice that we can only optimize the 'trunc' case
7243   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7244   // (c) other casts depend on pointer size.
7245 
7246   // Determine whether \p K is a truncation based on an induction variable that
7247   // can be optimized.
7248   auto isOptimizableIVTruncate =
7249       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7250     return [=](ElementCount VF) -> bool {
7251       return CM.isOptimizableIVTruncate(K, VF);
7252     };
7253   };
7254 
7255   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7256           isOptimizableIVTruncate(I), Range))
7257     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7258                                              I);
7259   return nullptr;
7260 }
7261 
7262 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7263   // We know that all PHIs in non-header blocks are converted into selects, so
7264   // we don't have to worry about the insertion order and we can just use the
7265   // builder. At this point we generate the predication tree. There may be
7266   // duplications since this is a simple recursive scan, but future
7267   // optimizations will clean it up.
7268 
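  // Collect the blend's operands as pairs of incoming value and edge mask:
  // {In0, Mask0, In1, Mask1, ...}. The mask is omitted only when the phi has a
  // single incoming value, in which case the edge mask is all-one.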
7269   SmallVector<VPValue *, 2> Operands;
7270   unsigned NumIncoming = Phi->getNumIncomingValues();
7271   for (unsigned In = 0; In < NumIncoming; In++) {
7272     VPValue *EdgeMask =
7273       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7274     assert((EdgeMask || NumIncoming == 1) &&
7275            "Multiple predecessors with one having a full mask");
7276     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7277     if (EdgeMask)
7278       Operands.push_back(EdgeMask);
7279   }
7280   return new VPBlendRecipe(Phi, Operands);
7281 }
7282 
7283 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7284                                                    VPlan &Plan) const {
7285 
7286   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7287       [this, CI](ElementCount VF) {
7288         return CM.isScalarWithPredication(CI, VF);
7289       },
7290       Range);
7291 
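  // A call that must be scalarized with predication cannot be widened; leave
  // it to the replication path.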
7292   if (IsPredicated)
7293     return nullptr;
7294 
7295   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7296   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7297              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7298     return nullptr;
7299 
7300   auto willWiden = [&](ElementCount VF) -> bool {
7301     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7302     // The following case may be scalarized depending on the VF.
7303     // The flag indicates whether to use an intrinsic or a plain call for
7304     // the vectorized version of the instruction: is the intrinsic call
7305     // cheaper than the library call?
7306     bool NeedToScalarize = false;
7307     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7308     bool UseVectorIntrinsic =
7309         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7310     return UseVectorIntrinsic || !NeedToScalarize;
7311   };
7312 
7313   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7314     return nullptr;
7315 
7316   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7317 }
7318 
7319 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7320   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7321          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7322   // Instruction should be widened, unless it is scalar after vectorization,
7323   // scalarization is profitable or it is predicated.
7324   auto WillScalarize = [this, I](ElementCount VF) -> bool {
7325     return CM.isScalarAfterVectorization(I, VF) ||
7326            CM.isProfitableToScalarize(I, VF) ||
7327            CM.isScalarWithPredication(I, VF);
7328   };
7329   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7330                                                              Range);
7331 }
7332 
7333 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7334   auto IsVectorizableOpcode = [](unsigned Opcode) {
7335     switch (Opcode) {
7336     case Instruction::Add:
7337     case Instruction::And:
7338     case Instruction::AShr:
7339     case Instruction::BitCast:
7340     case Instruction::FAdd:
7341     case Instruction::FCmp:
7342     case Instruction::FDiv:
7343     case Instruction::FMul:
7344     case Instruction::FNeg:
7345     case Instruction::FPExt:
7346     case Instruction::FPToSI:
7347     case Instruction::FPToUI:
7348     case Instruction::FPTrunc:
7349     case Instruction::FRem:
7350     case Instruction::FSub:
7351     case Instruction::ICmp:
7352     case Instruction::IntToPtr:
7353     case Instruction::LShr:
7354     case Instruction::Mul:
7355     case Instruction::Or:
7356     case Instruction::PtrToInt:
7357     case Instruction::SDiv:
7358     case Instruction::Select:
7359     case Instruction::SExt:
7360     case Instruction::Shl:
7361     case Instruction::SIToFP:
7362     case Instruction::SRem:
7363     case Instruction::Sub:
7364     case Instruction::Trunc:
7365     case Instruction::UDiv:
7366     case Instruction::UIToFP:
7367     case Instruction::URem:
7368     case Instruction::Xor:
7369     case Instruction::ZExt:
7370       return true;
7371     }
7372     return false;
7373   };
7374 
7375   if (!IsVectorizableOpcode(I->getOpcode()))
7376     return nullptr;
7377 
7378   // Success: widen this instruction.
7379   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7380 }
7381 
7382 VPBasicBlock *VPRecipeBuilder::handleReplication(
7383     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7384     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7385     VPlanPtr &Plan) {
7386   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7387       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7388       Range);
7389 
7390   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7391       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
7392       Range);
7393 
7394   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7395                                        IsUniform, IsPredicated);
7396   setRecipe(I, Recipe);
7397 
7398   // Find if I uses a predicated instruction. If so, it will use its scalar
7399   // value. Avoid hoisting the insert-element which packs the scalar value into
7400   // a vector value, as that happens iff all users use the vector value.
7401   for (auto &Op : I->operands())
7402     if (auto *PredInst = dyn_cast<Instruction>(Op))
7403       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7404         PredInst2Recipe[PredInst]->setAlsoPack(false);
7405 
7406   // Finalize the recipe for Instr, handling the non-predicated case first.
7407   if (!IsPredicated) {
7408     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7409     VPBB->appendRecipe(Recipe);
7410     return VPBB;
7411   }
7412   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7413   assert(VPBB->getSuccessors().empty() &&
7414          "VPBB has successors when handling predicated replication.");
7415   // Record predicated instructions for above packing optimizations.
7416   PredInst2Recipe[I] = Recipe;
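  // Wrap the predicated recipe in a replicate region (an if-then triangle) and
  // continue emitting recipes into a fresh block succeeding the region.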
7417   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7418   VPBlockUtils::insertBlockAfter(Region, VPBB);
7419   auto *RegSucc = new VPBasicBlock();
7420   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7421   return RegSucc;
7422 }
7423 
7424 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7425                                                       VPRecipeBase *PredRecipe,
7426                                                       VPlanPtr &Plan) {
7427   // Instructions marked for predication are replicated and placed under an
7428   // if-then construct to prevent side-effects.
7429 
7430   // Generate recipes to compute the block mask for this region.
7431   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7432 
7433   // Build the triangular if-then region.
7434   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7435   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7436   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7437   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7438   auto *PHIRecipe =
7439       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7440   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7441   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7442   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7443 
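  // The region forms an if-then triangle: Entry branches on BlockInMask either
  // to Pred (replicating the predicated instruction) or directly to Exit, and
  // Pred falls through to Exit, where a phi merges the predicated value when
  // one is produced.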
7444   // Note: first set Entry as region entry and then connect successors starting
7445   // from it in order, to propagate the "parent" of each VPBasicBlock.
7446   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7447   VPBlockUtils::connectBlocks(Pred, Exit);
7448 
7449   return Region;
7450 }
7451 
7452 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7453                                                       VFRange &Range,
7454                                                       VPlanPtr &Plan) {
7455   // First, check for specific widening recipes that deal with calls, memory
7456   // operations, inductions and Phi nodes.
7457   if (auto *CI = dyn_cast<CallInst>(Instr))
7458     return tryToWidenCall(CI, Range, *Plan);
7459 
7460   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7461     return tryToWidenMemory(Instr, Range, Plan);
7462 
7463   VPRecipeBase *Recipe;
7464   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7465     if (Phi->getParent() != OrigLoop->getHeader())
7466       return tryToBlend(Phi, Plan);
7467     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7468       return Recipe;
7469     return new VPWidenPHIRecipe(Phi);
7470   }
7471 
7472   if (isa<TruncInst>(Instr) &&
7473       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7474     return Recipe;
7475 
7476   if (!shouldWiden(Instr, Range))
7477     return nullptr;
7478 
7479   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7480     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7481                                 OrigLoop);
7482 
7483   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7484     bool InvariantCond =
7485         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7486     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7487                                    InvariantCond);
7488   }
7489 
7490   return tryToWiden(Instr, *Plan);
7491 }
7492 
7493 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7494                                                         unsigned MaxVF) {
7495   assert(OrigLoop->empty() && "Inner loop expected.");
7496 
7497   // Collect conditions feeding internal conditional branches; they need to be
7498   // represented in VPlan for it to model masking.
7499   SmallPtrSet<Value *, 1> NeedDef;
7500 
7501   auto *Latch = OrigLoop->getLoopLatch();
7502   for (BasicBlock *BB : OrigLoop->blocks()) {
7503     if (BB == Latch)
7504       continue;
7505     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7506     if (Branch && Branch->isConditional())
7507       NeedDef.insert(Branch->getCondition());
7508   }
7509 
7510   // If the tail is to be folded by masking, the primary induction variable,
7511   // if it exists, needs to be represented in VPlan to model early-exit masking.
7512   // Also, both the Phi and the live-out instruction of each reduction are
7513   // required in order to introduce a select between them in VPlan.
7514   if (CM.foldTailByMasking()) {
7515     if (Legal->getPrimaryInduction())
7516       NeedDef.insert(Legal->getPrimaryInduction());
7517     for (auto &Reduction : Legal->getReductionVars()) {
7518       NeedDef.insert(Reduction.first);
7519       NeedDef.insert(Reduction.second.getLoopExitInstr());
7520     }
7521   }
7522 
7523   // Collect instructions from the original loop that will become trivially dead
7524   // in the vectorized loop. We don't need to vectorize these instructions. For
7525   // example, original induction update instructions can become dead because we
7526   // separately emit induction "steps" when generating code for the new loop.
7527   // Similarly, we create a new latch condition when setting up the structure
7528   // of the new loop, so the old one can become dead.
7529   SmallPtrSet<Instruction *, 4> DeadInstructions;
7530   collectTriviallyDeadInstructions(DeadInstructions);
7531 
7532   // Add assume instructions we need to drop to DeadInstructions, to prevent
7533   // them from being added to the VPlan.
7534   // TODO: We only need to drop assumes in blocks that get flattened. If the
7535   // control flow is preserved, we should keep them.
7536   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7537   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7538 
7539   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7540   // Dead instructions do not need sinking. Remove them from SinkAfter.
7541   for (Instruction *I : DeadInstructions)
7542     SinkAfter.erase(I);
7543 
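  // Build one VPlan per sub-range of VFs sharing the same widening decisions;
  // each call may clamp SubRange.End to the first VF where a decision changes,
  // and the next plan starts from there.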
7544   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7545     VFRange SubRange = {VF, MaxVF + 1};
7546     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7547                                              DeadInstructions, SinkAfter));
7548     VF = SubRange.End;
7549   }
7550 }
7551 
7552 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7553     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7554     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7555     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7556 
7557   // Hold a mapping from predicated instructions to their recipes, in order to
7558   // fix their AlsoPack behavior if a user is determined to replicate and use a
7559   // scalar instead of vector value.
7560   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7561 
7562   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7563 
7564   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7565 
7566   // ---------------------------------------------------------------------------
7567   // Pre-construction: record ingredients whose recipes we'll need to further
7568   // process after constructing the initial VPlan.
7569   // ---------------------------------------------------------------------------
7570 
7571   // Mark instructions we'll need to sink later and their targets as
7572   // ingredients whose recipe we'll need to record.
7573   for (auto &Entry : SinkAfter) {
7574     RecipeBuilder.recordRecipeOf(Entry.first);
7575     RecipeBuilder.recordRecipeOf(Entry.second);
7576   }
7577   for (auto &Reduction : CM.getInLoopReductionChains()) {
7578     PHINode *Phi = Reduction.first;
7579     RecurrenceDescriptor::RecurrenceKind Kind =
7580         Legal->getReductionVars()[Phi].getRecurrenceKind();
7581     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7582 
7583     RecipeBuilder.recordRecipeOf(Phi);
7584     for (auto &R : ReductionOperations) {
7585       RecipeBuilder.recordRecipeOf(R);
7586       // For min/max reductions, where we have a pair of icmp/select, we also
7587       // need to record the ICmp recipe, so it can be removed later.
7588       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7589           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7590         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7591       }
7592     }
7593   }
7594 
7595   // For each interleave group which is relevant for this (possibly trimmed)
7596   // Range, add it to the set of groups to be later applied to the VPlan and add
7597   // placeholders for its members' Recipes which we'll be replacing with a
7598   // single VPInterleaveRecipe.
7599   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7600     auto applyIG = [IG, this](ElementCount VF) -> bool {
7601       return (VF.isVector() && // Query is illegal for VF == 1
7602               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7603                   LoopVectorizationCostModel::CM_Interleave);
7604     };
7605     if (!getDecisionAndClampRange(applyIG, Range))
7606       continue;
7607     InterleaveGroups.insert(IG);
7608     for (unsigned i = 0; i < IG->getFactor(); i++)
7609       if (Instruction *Member = IG->getMember(i))
7610         RecipeBuilder.recordRecipeOf(Member);
7611   }
7612 
7613   // ---------------------------------------------------------------------------
7614   // Build initial VPlan: Scan the body of the loop in a topological order to
7615   // visit each basic block after having visited its predecessor basic blocks.
7616   // ---------------------------------------------------------------------------
7617 
7618   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7619   auto Plan = std::make_unique<VPlan>();
7620   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7621   Plan->setEntry(VPBB);
7622 
7623   // Represent values that will have defs inside VPlan.
7624   for (Value *V : NeedDef)
7625     Plan->addVPValue(V);
7626 
7627   // Scan the body of the loop in a topological order to visit each basic block
7628   // after having visited its predecessor basic blocks.
7629   LoopBlocksDFS DFS(OrigLoop);
7630   DFS.perform(LI);
7631 
7632   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7633     // Relevant instructions from basic block BB will be grouped into VPRecipe
7634     // ingredients and fill a new VPBasicBlock.
7635     unsigned VPBBsForBB = 0;
7636     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7637     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7638     VPBB = FirstVPBBForBB;
7639     Builder.setInsertPoint(VPBB);
7640 
7641     // Introduce each ingredient into VPlan.
7642     // TODO: Model and preserve debug intrinsics in VPlan.
7643     for (Instruction &I : BB->instructionsWithoutDebug()) {
7644       Instruction *Instr = &I;
7645 
7646       // First filter out irrelevant instructions, to ensure no recipes are
7647       // built for them.
7648       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7649         continue;
7650 
7651       if (auto Recipe =
7652               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7653         RecipeBuilder.setRecipe(Instr, Recipe);
7654         VPBB->appendRecipe(Recipe);
7655         continue;
7656       }
7657 
7658       // Otherwise, if all widening options failed, the instruction is to be
7659       // replicated. This may create a successor for VPBB.
7660       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7661           Instr, Range, VPBB, PredInst2Recipe, Plan);
7662       if (NextVPBB != VPBB) {
7663         VPBB = NextVPBB;
7664         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7665                                     : "");
7666       }
7667     }
7668   }
7669 
7670   // Discard the empty dummy pre-entry VPBasicBlock. Note that other
7671   // VPBasicBlocks may also be empty, such as the last one, VPBB, reflecting
7672   // original basic blocks with no recipes.
7673   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7674   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7675   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7676   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7677   delete PreEntry;
7678 
7679   // ---------------------------------------------------------------------------
7680   // Transform initial VPlan: Apply previously taken decisions, in order, to
7681   // bring the VPlan to its final state.
7682   // ---------------------------------------------------------------------------
7683 
7684   // Apply Sink-After legal constraints.
7685   for (auto &Entry : SinkAfter) {
7686     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7687     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7688     Sink->moveAfter(Target);
7689   }
7690 
7691   // Interleave memory: for each Interleave Group we marked earlier as relevant
7692   // for this VPlan, replace the Recipes widening its memory instructions with a
7693   // single VPInterleaveRecipe at its insertion point.
7694   for (auto IG : InterleaveGroups) {
7695     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7696         RecipeBuilder.getRecipe(IG->getInsertPos()));
7697     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7698         ->insertBefore(Recipe);
7699 
7700     for (unsigned i = 0; i < IG->getFactor(); ++i)
7701       if (Instruction *Member = IG->getMember(i)) {
7702         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7703       }
7704   }
7705 
7706   // Adjust the recipes for any inloop reductions.
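  // Only plans that cover vector VFs (Range.Start > 1) need this adjustment.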
7707   if (Range.Start > 1)
7708     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
7709 
7710   // Finally, if tail is folded by masking, introduce selects between the phi
7711   // and the live-out instruction of each reduction, at the end of the latch.
7712   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7713     Builder.setInsertPoint(VPBB);
7714     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7715     for (auto &Reduction : Legal->getReductionVars()) {
7716       assert(!CM.isInLoopReduction(Reduction.first) &&
7717              "Didn't expect inloop tail folded reduction yet!");
7718       VPValue *Phi = Plan->getVPValue(Reduction.first);
7719       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7720       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7721     }
7722   }
7723 
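  // Record the VFs covered by this plan (powers of two from Range.Start up to,
  // but not including, Range.End) and name the plan accordingly.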
7724   std::string PlanName;
7725   raw_string_ostream RSO(PlanName);
7726   ElementCount VF = ElementCount::getFixed(Range.Start);
7727   Plan->addVF(VF);
7728   RSO << "Initial VPlan for VF={" << VF;
7729   for (VF.Min *= 2; VF.Min < Range.End; VF.Min *= 2) {
7730     Plan->addVF(VF);
7731     RSO << "," << VF;
7732   }
7733   RSO << "},UF>=1";
7734   RSO.flush();
7735   Plan->setName(PlanName);
7736 
7737   return Plan;
7738 }
7739 
7740 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7741   // Outer loop handling: They may require CFG and instruction level
7742   // transformations before even evaluating whether vectorization is profitable.
7743   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7744   // the vectorization pipeline.
7745   assert(!OrigLoop->empty());
7746   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7747 
7748   // Create new empty VPlan
7749   auto Plan = std::make_unique<VPlan>();
7750 
7751   // Build hierarchical CFG
7752   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7753   HCFGBuilder.buildHierarchicalCFG();
7754 
7755   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7756     Plan->addVF(ElementCount::getFixed(VF));
7757 
7758   if (EnableVPlanPredication) {
7759     VPlanPredicator VPP(*Plan);
7760     VPP.predicate();
7761 
7762     // Avoid running transformation to recipes until masked code generation in
7763     // VPlan-native path is in place.
7764     return Plan;
7765   }
7766 
7767   SmallPtrSet<Instruction *, 1> DeadInstructions;
7768   VPlanTransforms::VPInstructionsToVPRecipes(
7769       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7770   return Plan;
7771 }
7772 
7773 // Adjust the recipes for any inloop reductions. The chain of instructions
7774 // leading from the loop exit instr to the phi need to be converted to
7775 // reductions, with one operand being vector and the other being the scalar
7776 // reduction chain.
7777 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7778     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7779   for (auto &Reduction : CM.getInLoopReductionChains()) {
7780     PHINode *Phi = Reduction.first;
7781     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7782     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7783 
7784     // ReductionOperations are ordered top-down from the phi's use to the
7785     // LoopExitValue. We keep track of the previous item (the Chain) to tell
7786     // which of the two operands will remain scalar and which will be reduced.
7787     // For minmax the chain will be the select instructions.
7788     Instruction *Chain = Phi;
7789     for (Instruction *R : ReductionOperations) {
7790       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
7791       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
7792 
7793       VPValue *ChainOp = Plan->getVPValue(Chain);
7794       unsigned FirstOpId;
7795       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7796           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7797         assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC &&
7798                "Expected to replace a VPWidenSelectSC");
7799         FirstOpId = 1;
7800       } else {
7801         assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7802                "Expected to replace a VPWidenSC");
7803         FirstOpId = 0;
7804       }
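      // The operand of R that is not the chain is the one to be reduced as a
      // vector.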
7805       unsigned VecOpId =
7806           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
7807       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
7808 
7809       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
7810           &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI);
7811       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
7812       WidenRecipe->eraseFromParent();
7813 
7814       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7815           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7816         VPRecipeBase *CompareRecipe =
7817             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
7818         assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7819                "Expected to replace a VPWidenSC");
7820         CompareRecipe->eraseFromParent();
7821       }
7822       Chain = R;
7823     }
7824   }
7825 }
7826 
7827 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
7828     Value *V, unsigned Part) {
7829   return ILV.getOrCreateVectorValue(V, Part);
7830 }
7831 
7832 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7833     Value *V, const VPIteration &Instance) {
7834   return ILV.getOrCreateScalarValue(V, Instance);
7835 }
7836 
7837 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7838                                VPSlotTracker &SlotTracker) const {
7839   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7840   IG->getInsertPos()->printAsOperand(O, false);
7841   O << ", ";
7842   getAddr()->printAsOperand(O, SlotTracker);
7843   VPValue *Mask = getMask();
7844   if (Mask) {
7845     O << ", ";
7846     Mask->printAsOperand(O, SlotTracker);
7847   }
7848   for (unsigned i = 0; i < IG->getFactor(); ++i)
7849     if (Instruction *I = IG->getMember(i))
7850       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7851 }
7852 
7853 void VPWidenCallRecipe::execute(VPTransformState &State) {
7854   State.ILV->widenCallInstruction(Ingredient, User, State);
7855 }
7856 
7857 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7858   State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
7859 }
7860 
7861 void VPWidenRecipe::execute(VPTransformState &State) {
7862   State.ILV->widenInstruction(Ingredient, User, State);
7863 }
7864 
7865 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7866   State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
7867                       IsIndexLoopInvariant, State);
7868 }
7869 
7870 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7871   assert(!State.Instance && "Int or FP induction being replicated.");
7872   State.ILV->widenIntOrFpInduction(IV, Trunc);
7873 }
7874 
7875 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7876   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7877 }
7878 
7879 void VPBlendRecipe::execute(VPTransformState &State) {
7880   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7881   // We know that all PHIs in non-header blocks are converted into
7882   // selects, so we don't have to worry about the insertion order and we
7883   // can just use the builder.
7884   // At this point we generate the predication tree. There may be
7885   // duplications since this is a simple recursive scan, but future
7886   // optimizations will clean it up.
7887 
7888   unsigned NumIncoming = getNumIncomingValues();
7889 
7890   // Generate a sequence of selects of the form:
7891   // SELECT(Mask3, In3,
7892   //        SELECT(Mask2, In2,
7893   //               SELECT(Mask1, In1,
7894   //                      In0)))
7895   // Note that Mask0 is never used: lanes for which no path reaches this phi,
7896   // and which are essentially undef, are taken from In0.
7897   InnerLoopVectorizer::VectorParts Entry(State.UF);
7898   for (unsigned In = 0; In < NumIncoming; ++In) {
7899     for (unsigned Part = 0; Part < State.UF; ++Part) {
7900       // We might have single edge PHIs (blocks) - use an identity
7901       // 'select' for the first PHI operand.
7902       Value *In0 = State.get(getIncomingValue(In), Part);
7903       if (In == 0)
7904         Entry[Part] = In0; // Initialize with the first incoming value.
7905       else {
7906         // Select between the current value and the previous incoming edge
7907         // based on the incoming mask.
7908         Value *Cond = State.get(getMask(In), Part);
7909         Entry[Part] =
7910             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7911       }
7912     }
7913   }
7914   for (unsigned Part = 0; Part < State.UF; ++Part)
7915     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7916 }
7917 
7918 void VPInterleaveRecipe::execute(VPTransformState &State) {
7919   assert(!State.Instance && "Interleave group being replicated.");
7920   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7921 }
7922 
7923 void VPReductionRecipe::execute(VPTransformState &State) {
7924   assert(!State.Instance && "Reduction being replicated.");
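  // For each unrolled part, reduce the vector operand to a scalar and combine
  // the result with the chain value produced so far.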
7925   for (unsigned Part = 0; Part < State.UF; ++Part) {
7926     unsigned Kind = RdxDesc->getRecurrenceKind();
7927     Value *NewVecOp = State.get(VecOp, Part);
7928     Value *NewRed =
7929         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
7930     Value *PrevInChain = State.get(ChainOp, Part);
7931     Value *NextInChain;
7932     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7933         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7934       NextInChain =
7935           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
7936                          NewRed, PrevInChain);
7937     } else {
7938       NextInChain = State.Builder.CreateBinOp(
7939           (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain);
7940     }
7941     State.ValueMap.setVectorValue(I, Part, NextInChain);
7942   }
7943 }
7944 
7945 void VPReplicateRecipe::execute(VPTransformState &State) {
7946   if (State.Instance) { // Generate a single instance.
7947     State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
7948                                     IsPredicated, State);
7949     // Insert scalar instance packing it into a vector.
7950     if (AlsoPack && State.VF.isVector()) {
7951       // If we're constructing lane 0, initialize to start from undef.
7952       if (State.Instance->Lane == 0) {
7953         assert(!State.VF.Scalable && "VF is assumed to be non scalable.");
7954         Value *Undef =
7955             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7956         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7957       }
7958       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7959     }
7960     return;
7961   }
7962 
7963   // Generate scalar instances for all VF lanes of all UF parts, unless the
7964   // instruction is uniform, in which case generate only the first lane for
7965   // each of the UF parts.
7966   unsigned EndLane = IsUniform ? 1 : State.VF.Min;
7967   for (unsigned Part = 0; Part < State.UF; ++Part)
7968     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7969       State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
7970                                       IsPredicated, State);
7971 }
7972 
7973 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7974   assert(State.Instance && "Branch on Mask works only on single instance.");
7975 
7976   unsigned Part = State.Instance->Part;
7977   unsigned Lane = State.Instance->Lane;
7978 
7979   Value *ConditionBit = nullptr;
7980   VPValue *BlockInMask = getMask();
7981   if (BlockInMask) {
7982     ConditionBit = State.get(BlockInMask, Part);
7983     if (ConditionBit->getType()->isVectorTy())
7984       ConditionBit = State.Builder.CreateExtractElement(
7985           ConditionBit, State.Builder.getInt32(Lane));
7986   } else // Block in mask is all-one.
7987     ConditionBit = State.Builder.getTrue();
7988 
7989   // Replace the temporary unreachable terminator with a new conditional branch,
7990   // whose two destinations will be set later when they are created.
7991   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7992   assert(isa<UnreachableInst>(CurrentTerminator) &&
7993          "Expected to replace unreachable terminator with conditional branch.");
7994   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7995   CondBr->setSuccessor(0, nullptr);
7996   ReplaceInstWithInst(CurrentTerminator, CondBr);
7997 }
7998 
7999 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8000   assert(State.Instance && "Predicated instruction PHI works per instance.");
8001   Instruction *ScalarPredInst = cast<Instruction>(
8002       State.ValueMap.getScalarValue(PredInst, *State.Instance));
8003   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8004   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8005   assert(PredicatingBB && "Predicated block has no single predecessor.");
8006 
8007   // By current pack/unpack logic we need to generate only a single phi node: if
8008   // a vector value for the predicated instruction exists at this point it means
8009   // the instruction has vector users only, and a phi for the vector value is
8010   // needed. In this case the recipe of the predicated instruction is marked to
8011   // also do that packing, thereby "hoisting" the insert-element sequence.
8012   // Otherwise, a phi node for the scalar value is needed.
8013   unsigned Part = State.Instance->Part;
8014   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8015     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8016     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8017     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8018     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8019     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8020     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8021   } else {
8022     Type *PredInstType = PredInst->getType();
8023     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8024     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8025     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8026     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8027   }
8028 }
8029 
8030 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8031   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
8032   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
8033                                         getMask());
8034 }
8035 
8036 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8037 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8038 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8039 // for predication.
8040 static ScalarEpilogueLowering getScalarEpilogueLowering(
8041     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8042     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8043     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8044     LoopVectorizationLegality &LVL) {
8045   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8046   // don't look at hints or options, and don't request a scalar epilogue.
8047   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8048   // LoopAccessInfo (due to code dependency and not being able to reliably get
8049   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8050   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8051   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8052   // back to the old way and vectorize with versioning when forced. See D81345.)
8053   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8054                                                       PGSOQueryType::IRPass) &&
8055                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8056     return CM_ScalarEpilogueNotAllowedOptSize;
8057 
8058   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
8059                               !PreferPredicateOverEpilog;
8060 
8061   // 2) Next, if disabling predication is requested on the command line, honour
8062   // this and request a scalar epilogue.
8063   if (PredicateOptDisabled)
8064     return CM_ScalarEpilogueAllowed;
8065 
8066   // 3) and 4) Check whether predication is requested on the command line or
8067   // with a loop hint, or whether the TTI hook indicates it is profitable; if
8068   // so, request predication.
8069   if (PreferPredicateOverEpilog ||
8070       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8071       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8072                                         LVL.getLAI()) &&
8073        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8074     return CM_ScalarEpilogueNotNeededUsePredicate;
8075 
8076   return CM_ScalarEpilogueAllowed;
8077 }
8078 
8079 // Process the loop in the VPlan-native vectorization path. This path builds
8080 // VPlan upfront in the vectorization pipeline, which allows applying
8081 // VPlan-to-VPlan transformations from the very beginning without modifying the
8082 // input LLVM IR.
8083 static bool processLoopInVPlanNativePath(
8084     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8085     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8086     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8087     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8088     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8089 
8090   if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
8091     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8092     return false;
8093   }
8094   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8095   Function *F = L->getHeader()->getParent();
8096   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8097 
8098   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8099       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8100 
8101   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8102                                 &Hints, IAI);
8103   // Use the planner for outer loop vectorization.
8104   // TODO: CM is not used at this point inside the planner. Turn CM into an
8105   // optional argument if we don't need it in the future.
8106   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8107 
8108   // Get user vectorization factor.
8109   const unsigned UserVF = Hints.getWidth();
8110 
8111   // Plan how to best vectorize, return the best VF and its cost.
8112   const VectorizationFactor VF =
8113       LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
8114 
8115   // If we are stress testing VPlan builds, do not attempt to generate vector
8116   // code. Masked vector code generation support will follow soon.
8117   // Also, do not attempt to vectorize if no vector code will be produced.
8118   if (VPlanBuildStressTest || EnableVPlanPredication ||
8119       VectorizationFactor::Disabled() == VF)
8120     return false;
8121 
8122   LVP.setBestPlan(VF.Width, 1);
8123 
8124   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8125                          &CM, BFI, PSI);
8126   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8127                     << L->getHeader()->getParent()->getName() << "\"\n");
8128   LVP.executePlan(LB, DT);
8129 
8130   // Mark the loop as already vectorized to avoid vectorizing again.
8131   Hints.setAlreadyVectorized();
8132 
8133   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8134   return true;
8135 }
8136 
8137 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8138     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8139                                !EnableLoopInterleaving),
8140       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8141                               !EnableLoopVectorization) {}
8142 
8143 bool LoopVectorizePass::processLoop(Loop *L) {
8144   assert((EnableVPlanNativePath || L->empty()) &&
8145          "VPlan-native path is not enabled. Only process inner loops.");
8146 
8147 #ifndef NDEBUG
8148   const std::string DebugLocStr = getDebugLocString(L);
8149 #endif /* NDEBUG */
8150 
8151   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8152                     << L->getHeader()->getParent()->getName() << "\" from "
8153                     << DebugLocStr << "\n");
8154 
8155   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8156 
8157   LLVM_DEBUG(
8158       dbgs() << "LV: Loop hints:"
8159              << " force="
8160              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8161                      ? "disabled"
8162                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8163                             ? "enabled"
8164                             : "?"))
8165              << " width=" << Hints.getWidth()
8166              << " unroll=" << Hints.getInterleave() << "\n");
8167 
8168   // Function containing loop
8169   Function *F = L->getHeader()->getParent();
8170 
8171   // Looking at the diagnostic output is the only way to determine if a loop
8172   // was vectorized (other than looking at the IR or machine code), so it
8173   // is important to generate an optimization remark for each loop. Most of
8174   // these messages are generated as OptimizationRemarkAnalysis. Remarks
8175   // generated as OptimizationRemark and OptimizationRemarkMissed are less
8176   // verbose; they report vectorized loops and unvectorized loops that may
8177   // benefit from vectorization, respectively.
8178 
8179   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8180     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8181     return false;
8182   }
8183 
8184   PredicatedScalarEvolution PSE(*SE, *L);
8185 
8186   // Check if it is legal to vectorize the loop.
8187   LoopVectorizationRequirements Requirements(*ORE);
8188   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8189                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8190   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8191     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8192     Hints.emitRemarkWithHints();
8193     return false;
8194   }
8195 
8196   // Check the function attributes and profiles to find out if this function
8197   // should be optimized for size.
8198   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8199       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8200 
8201   // Entrance to the VPlan-native vectorization path. Outer loops are processed
8202   // here. They may require CFG and instruction level transformations before
8203   // even evaluating whether vectorization is profitable. Since we cannot modify
8204   // the incoming IR, we need to build VPlan upfront in the vectorization
8205   // pipeline.
8206   if (!L->empty())
8207     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
8208                                         ORE, BFI, PSI, Hints);
8209 
8210   assert(L->empty() && "Inner loop expected.");
8211 
8212   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8213   // count by optimizing for size, to minimize overheads.
8214   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
8215   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
8216     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8217                       << "This loop is worth vectorizing only if no scalar "
8218                       << "iteration overheads are incurred.");
8219     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8220       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8221     else {
8222       LLVM_DEBUG(dbgs() << "\n");
8223       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
8224     }
8225   }
8226 
8227   // Check the function attributes to see if implicit floats are allowed.
8228   // FIXME: This check doesn't seem possibly correct -- what if the loop is
8229   // an integer loop and the vector instructions selected are purely integer
8230   // vector instructions?
8231   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
8232     reportVectorizationFailure(
8233         "Can't vectorize when the NoImplicitFloat attribute is used",
8234         "loop not vectorized due to NoImplicitFloat attribute",
8235         "NoImplicitFloat", ORE, L);
8236     Hints.emitRemarkWithHints();
8237     return false;
8238   }
8239 
8240   // Check if the target supports potentially unsafe FP vectorization.
8241   // FIXME: Add a check for the type of safety issue (denormal, signaling)
8242   // for the target we're vectorizing for, to make sure none of the
8243   // additional fp-math flags can help.
8244   if (Hints.isPotentiallyUnsafe() &&
8245       TTI->isFPVectorizationPotentiallyUnsafe()) {
8246     reportVectorizationFailure(
8247         "Potentially unsafe FP op prevents vectorization",
8248         "loop not vectorized due to unsafe FP support.",
8249         "UnsafeFP", ORE, L);
8250     Hints.emitRemarkWithHints();
8251     return false;
8252   }
8253 
8254   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
8255   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
8256 
8257   // If an override option has been passed in for interleaved accesses, use it.
8258   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
8259     UseInterleaved = EnableInterleavedMemAccesses;
8260 
8261   // Analyze interleaved memory accesses.
8262   if (UseInterleaved) {
8263     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
8264   }
8265 
8266   // Use the cost model.
8267   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
8268                                 F, &Hints, IAI);
8269   CM.collectValuesToIgnore();
8270 
8271   // Use the planner for vectorization.
8272   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
8273 
8274   // Get user vectorization factor and interleave count.
8275   unsigned UserVF = Hints.getWidth();
8276   unsigned UserIC = Hints.getInterleave();
8277 
8278   // Plan how to best vectorize, return the best VF and its cost.
8279   Optional<VectorizationFactor> MaybeVF =
8280       LVP.plan(ElementCount::getFixed(UserVF), UserIC);
8281 
8282   VectorizationFactor VF = VectorizationFactor::Disabled();
8283   unsigned IC = 1;
8284 
8285   if (MaybeVF) {
8286     VF = *MaybeVF;
8287     // Select the interleave count.
8288     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
8289   }
8290 
8291   // Identify the diagnostic messages that should be produced.
8292   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8293   bool VectorizeLoop = true, InterleaveLoop = true;
8294   if (Requirements.doesNotMeet(F, L, Hints)) {
8295     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
8296                          "requirements.\n");
8297     Hints.emitRemarkWithHints();
8298     return false;
8299   }
8300 
8301   if (VF.Width == 1) {
8302     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8303     VecDiagMsg = std::make_pair(
8304         "VectorizationNotBeneficial",
8305         "the cost-model indicates that vectorization is not beneficial");
8306     VectorizeLoop = false;
8307   }
8308 
8309   if (!MaybeVF && UserIC > 1) {
8310     // Tell the user interleaving was avoided up-front, despite being explicitly
8311     // requested.
8312     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
8313                          "interleaving should be avoided up front\n");
8314     IntDiagMsg = std::make_pair(
8315         "InterleavingAvoided",
8316         "Ignoring UserIC, because interleaving was avoided up front");
8317     InterleaveLoop = false;
8318   } else if (IC == 1 && UserIC <= 1) {
8319     // Tell the user interleaving is not beneficial.
8320     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8321     IntDiagMsg = std::make_pair(
8322         "InterleavingNotBeneficial",
8323         "the cost-model indicates that interleaving is not beneficial");
8324     InterleaveLoop = false;
8325     if (UserIC == 1) {
8326       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8327       IntDiagMsg.second +=
8328           " and is explicitly disabled or interleave count is set to 1";
8329     }
8330   } else if (IC > 1 && UserIC == 1) {
8331     // Tell the user interleaving is beneficial but explicitly disabled.
8332     LLVM_DEBUG(
8333         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
8334     IntDiagMsg = std::make_pair(
8335         "InterleavingBeneficialButDisabled",
8336         "the cost-model indicates that interleaving is beneficial "
8337         "but is explicitly disabled or interleave count is set to 1");
8338     InterleaveLoop = false;
8339   }
8340 
8341   // Override IC if user provided an interleave count.
8342   IC = UserIC > 0 ? UserIC : IC;
8343 
8344   // Emit diagnostic messages, if any.
8345   const char *VAPassName = Hints.vectorizeAnalysisPassName();
8346   if (!VectorizeLoop && !InterleaveLoop) {
8347     // Do not vectorize or interleave the loop.
8348     ORE->emit([&]() {
8349       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
8350                                       L->getStartLoc(), L->getHeader())
8351              << VecDiagMsg.second;
8352     });
8353     ORE->emit([&]() {
8354       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8355                                       L->getStartLoc(), L->getHeader())
8356              << IntDiagMsg.second;
8357     });
8358     return false;
8359   } else if (!VectorizeLoop && InterleaveLoop) {
8360     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8361     ORE->emit([&]() {
8362       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
8363                                         L->getStartLoc(), L->getHeader())
8364              << VecDiagMsg.second;
8365     });
8366   } else if (VectorizeLoop && !InterleaveLoop) {
8367     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8368                       << ") in " << DebugLocStr << '\n');
8369     ORE->emit([&]() {
8370       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8371                                         L->getStartLoc(), L->getHeader())
8372              << IntDiagMsg.second;
8373     });
8374   } else if (VectorizeLoop && InterleaveLoop) {
8375     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8376                       << ") in " << DebugLocStr << '\n');
8377     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8378   }
8379 
8380   LVP.setBestPlan(VF.Width, IC);
8381 
8382   using namespace ore;
8383   bool DisableRuntimeUnroll = false;
8384   MDNode *OrigLoopID = L->getLoopID();
8385 
8386   if (!VectorizeLoop) {
8387     assert(IC > 1 && "interleave count should not be 1 or 0");
8388     // If we decided that it is not legal to vectorize the loop, then
8389     // interleave it.
8390     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
8391                                BFI, PSI);
8392     LVP.executePlan(Unroller, DT);
8393 
8394     ORE->emit([&]() {
8395       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8396                                 L->getHeader())
8397              << "interleaved loop (interleaved count: "
8398              << NV("InterleaveCount", IC) << ")";
8399     });
8400   } else {
8401     // If we decided that it is *legal* to vectorize the loop, then do it.
8402     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
8403                            &LVL, &CM, BFI, PSI);
8404     LVP.executePlan(LB, DT);
8405     ++LoopsVectorized;
8406 
8407     // Add metadata to disable runtime unrolling a scalar loop when there are
8408     // no runtime checks about strides and memory. A scalar loop that is
8409     // rarely used is not worth unrolling.
8410     if (!LB.areSafetyChecksAdded())
8411       DisableRuntimeUnroll = true;
8412 
8413     // Report the vectorization decision.
8414     ORE->emit([&]() {
8415       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
8416                                 L->getHeader())
8417              << "vectorized loop (vectorization width: "
8418              << NV("VectorizationFactor", VF.Width)
8419              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
8420     });
8421   }
8422 
8423   Optional<MDNode *> RemainderLoopID =
8424       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
8425                                       LLVMLoopVectorizeFollowupEpilogue});
8426   if (RemainderLoopID.hasValue()) {
8427     L->setLoopID(RemainderLoopID.getValue());
8428   } else {
8429     if (DisableRuntimeUnroll)
8430       AddRuntimeUnrollDisableMetaData(L);
8431 
8432     // Mark the loop as already vectorized to avoid vectorizing again.
8433     Hints.setAlreadyVectorized();
8434   }
8435 
8436   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8437   return true;
8438 }
8439 
8440 LoopVectorizeResult LoopVectorizePass::runImpl(
8441     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8442     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8443     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
8444     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8445     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Report whether any loop was transformed and whether the CFG was modified.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
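  // MemorySSA is only requested when EnableMSSALoopDependency is set; if
  // available it is passed into the loop-standard analysis bundle below.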
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

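  // LoopAccessInfo is a loop analysis, so fetch it lazily through the inner
  // loop analysis manager; it is only computed for the loops we actually
  // query.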
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
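  // ProfileSummaryInfo is a module analysis; a function pass can only use it
  // if it has already been computed, hence getCachedResult rather than
  // getResult.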
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
