1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
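//
// For example (pseudocode, for illustration only), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + C[i];
//
// is conceptually rewritten so that each 'wide' iteration processes VF
// consecutive elements, with the remaining iterations handled separately:
//
//   for (i = 0; i + VF <= n; i += VF)
//     A[i:i+VF-1] = B[i:i+VF-1] + C[i:i+VF-1]; // one wide SIMD iteration
//   for (; i < n; ++i)                         // leftover scalar iterations
//     A[i] = B[i] + C[i];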
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing development effort to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
// Indicates that an epilogue is undesired and that predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185     cl::desc("Indicate that an epilogue is undesired, predication should be "
186              "used instead."));
187 
188 static cl::opt<bool> MaximizeBandwidth(
189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190     cl::desc("Maximize bandwidth when selecting vectorization factor which "
191              "will be determined by the smallest type in loop."));
192 
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
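/// For example (illustrative only), the stores below form an interleave group
/// with factor 3 and a gap at member index 1; emitting one wide store for the
/// group requires masking away the lanes that would otherwise write A[3*i+1]:
///
///   for (int i = 0; i < n; ++i) {
///     A[3 * i]     = x;
///     A[3 * i + 2] = y;
///   }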
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
202 
203 /// We don't interleave loops with a known constant trip count below this
204 /// number.
205 static const unsigned TinyTripCountInterleaveThreshold = 128;
206 
207 static cl::opt<unsigned> ForceTargetNumScalarRegs(
208     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
209     cl::desc("A flag that overrides the target's number of scalar registers."));
210 
211 static cl::opt<unsigned> ForceTargetNumVectorRegs(
212     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
213     cl::desc("A flag that overrides the target's number of vector registers."));
214 
215 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
216     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
217     cl::desc("A flag that overrides the target's max interleave factor for "
218              "scalar loops."));
219 
220 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
221     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
222     cl::desc("A flag that overrides the target's max interleave factor for "
223              "vectorized loops."));
224 
225 static cl::opt<unsigned> ForceTargetInstructionCost(
226     "force-target-instruction-cost", cl::init(0), cl::Hidden,
227     cl::desc("A flag that overrides the target's expected cost for "
228              "an instruction to a single constant value. Mostly "
229              "useful for getting consistent testing."));
230 
231 static cl::opt<unsigned> SmallLoopCost(
232     "small-loop-cost", cl::init(20), cl::Hidden,
233     cl::desc(
234         "The cost of a loop that is considered 'small' by the interleaver."));
235 
236 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
237     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
238     cl::desc("Enable the use of the block frequency analysis to access PGO "
239              "heuristics minimizing code growth in cold regions and being more "
240              "aggressive in hot regions."));
241 
242 // Runtime interleave loops for load/store throughput.
243 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
244     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
245     cl::desc(
246         "Enable runtime interleaving until load/store ports are saturated"));
247 
248 /// The number of stores in a loop that are allowed to need predication.
249 static cl::opt<unsigned> NumberOfStoresToPredicate(
250     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
251     cl::desc("Max number of stores to be predicated behind an if."));
252 
253 static cl::opt<bool> EnableIndVarRegisterHeur(
254     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
255     cl::desc("Count the induction variable only once when interleaving"));
256 
257 static cl::opt<bool> EnableCondStoresVectorization(
258     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
259     cl::desc("Enable if predication of stores during vectorization."));
260 
261 static cl::opt<unsigned> MaxNestedScalarReductionIC(
262     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
263     cl::desc("The maximum interleave count to use when interleaving a scalar "
264              "reduction in a nested loop."));
265 
266 cl::opt<bool> EnableVPlanNativePath(
267     "enable-vplan-native-path", cl::init(false), cl::Hidden,
268     cl::desc("Enable VPlan-native vectorization path with "
269              "support for outer loop vectorization."));
270 
271 // FIXME: Remove this switch once we have divergence analysis. Currently we
272 // assume divergent non-backedge branches when this switch is true.
273 cl::opt<bool> EnableVPlanPredication(
274     "enable-vplan-predication", cl::init(false), cl::Hidden,
275     cl::desc("Enable VPlan-native vectorization path predicator with "
276              "support for outer loop vectorization."));
277 
278 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
280 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
281 // verification of the H-CFGs built.
282 static cl::opt<bool> VPlanBuildStressTest(
283     "vplan-build-stress-test", cl::init(false), cl::Hidden,
284     cl::desc(
285         "Build VPlan for every supported loop nest in the function and bail "
286         "out right after the build (stress test the VPlan H-CFG construction "
287         "in the VPlan-native vectorization path)."));
288 
289 cl::opt<bool> llvm::EnableLoopInterleaving(
290     "interleave-loops", cl::init(true), cl::Hidden,
291     cl::desc("Enable loop interleaving in Loop vectorization passes"));
292 cl::opt<bool> llvm::EnableLoopVectorization(
293     "vectorize-loops", cl::init(true), cl::Hidden,
294     cl::desc("Run the Loop vectorization passes"));
295 
296 /// A helper function for converting Scalar types to vector types.
297 /// If the incoming type is void, we return void. If the VF is 1, we return
298 /// the scalar type.
299 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
300   if (Scalar->isVoidTy() || VF == 1)
301     return Scalar;
302   return VectorType::get(Scalar, VF);
303 }
304 
305 /// A helper function that returns the type of loaded or stored value.
306 static Type *getMemInstValueType(Value *I) {
307   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
308          "Expected Load or Store instruction");
309   if (auto *LI = dyn_cast<LoadInst>(I))
310     return LI->getType();
311   return cast<StoreInst>(I)->getValueOperand()->getType();
312 }
313 
314 /// A helper function that returns true if the given type is irregular. The
315 /// type is irregular if its allocated size doesn't equal the store size of an
316 /// element of the corresponding vector type at the given vectorization factor.
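/// For example (assuming a typical data layout), i1 is irregular because its
/// 1-bit size is padded to a 1-byte alloc size, and x86_fp80 is irregular
/// because its 80-bit value is padded to a larger alloc size in arrays.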
317 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
318   // Determine if an array of VF elements of type Ty is "bitcast compatible"
319   // with a <VF x Ty> vector.
320   if (VF > 1) {
321     auto *VectorTy = VectorType::get(Ty, VF);
322     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
323   }
324 
325   // If the vectorization factor is one, we just check if an array of type Ty
326   // requires padding between elements.
327   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
328 }
329 
330 /// A helper function that returns the reciprocal of the block probability of
331 /// predicated blocks. If we return X, we are assuming the predicated block
332 /// will execute once for every X iterations of the loop header.
333 ///
334 /// TODO: We should use actual block probability here, if available. Currently,
335 ///       we always assume predicated blocks have a 50% chance of executing.
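/// For example, with the current value of 2 the cost model roughly halves the
/// cost attributed to instructions in a predicated block, since such a block
/// is assumed to execute on only every other iteration of the loop header.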
336 static unsigned getReciprocalPredBlockProb() { return 2; }
337 
338 /// A helper function that adds a 'fast' flag to floating-point operations.
339 static Value *addFastMathFlag(Value *V) {
340   if (isa<FPMathOperator>(V))
341     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
342   return V;
343 }
344 
345 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
346   if (isa<FPMathOperator>(V))
347     cast<Instruction>(V)->setFastMathFlags(FMF);
348   return V;
349 }
350 
351 /// A helper function that returns an integer or floating-point constant with
352 /// value C.
353 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
354   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
355                            : ConstantFP::get(Ty, C);
356 }
357 
358 namespace llvm {
359 
360 /// InnerLoopVectorizer vectorizes loops which contain only one basic
361 /// block to a specified vectorization factor (VF).
362 /// This class performs the widening of scalars into vectors, or multiple
363 /// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops whose iteration counts
///   are not known to be a multiple of the vectorization factor.
366 /// * It handles the code generation for reduction variables.
367 /// * Scalarization (implementation using scalars) of un-vectorizable
368 ///   instructions.
369 /// InnerLoopVectorizer does not perform any vectorization-legality
370 /// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the LoopVectorizationLegality
/// class to provide information about the induction and reduction variables
/// that were found for a given vectorization factor.
374 class InnerLoopVectorizer {
375 public:
376   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
377                       LoopInfo *LI, DominatorTree *DT,
378                       const TargetLibraryInfo *TLI,
379                       const TargetTransformInfo *TTI, AssumptionCache *AC,
380                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
381                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
382                       LoopVectorizationCostModel *CM)
383       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
384         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
385         Builder(PSE.getSE()->getContext()),
386         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
387   virtual ~InnerLoopVectorizer() = default;
388 
389   /// Create a new empty loop. Unlink the old loop and connect the new one.
390   /// Return the pre-header block of the new loop.
391   BasicBlock *createVectorizedLoopSkeleton();
392 
393   /// Widen a single instruction within the innermost loop.
394   void widenInstruction(Instruction &I);
395 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
397   void fixVectorizedLoop();
398 
399   // Return true if any runtime check is added.
400   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
401 
402   /// A type for vectorized values in the new loop. Each value from the
403   /// original loop, when vectorized, is represented by UF vector values in the
404   /// new unrolled loop, where UF is the unroll factor.
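  /// For example, with UF = 2 and VF = 4, a single i32 value from the original
  /// loop is represented by two <4 x i32> values, one per unroll part.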
405   using VectorParts = SmallVector<Value *, 2>;
406 
407   /// Vectorize a single PHINode in a block. This method handles the induction
408   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
409   /// arbitrary length vectors.
410   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
411 
  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates the scalar instance of \p Instr for the unroll part and
  /// vector lane specified by \p Instance, predicating the instruction if
  /// \p IfPredicateInstr is true.
416   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
417                             bool IfPredicateInstr);
418 
419   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
420   /// is provided, the integer induction variable will first be truncated to
421   /// the corresponding type.
422   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
423 
424   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
425   /// vector or scalar value on-demand if one is not yet available. When
426   /// vectorizing a loop, we visit the definition of an instruction before its
427   /// uses. When visiting the definition, we either vectorize or scalarize the
428   /// instruction, creating an entry for it in the corresponding map. (In some
429   /// cases, such as induction variables, we will create both vector and scalar
430   /// entries.) Then, as we encounter uses of the definition, we derive values
431   /// for each scalar or vector use unless such a value is already available.
432   /// For example, if we scalarize a definition and one of its uses is vector,
433   /// we build the required vector on-demand with an insertelement sequence
434   /// when visiting the use. Otherwise, if the use is scalar, we can use the
435   /// existing scalar definition.
436   ///
437   /// Return a value in the new loop corresponding to \p V from the original
438   /// loop at unroll index \p Part. If the value has already been vectorized,
439   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
440   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
441   /// a new vector value on-demand by inserting the scalar values into a vector
442   /// with an insertelement sequence. If the value has been neither vectorized
443   /// nor scalarized, it must be loop invariant, so we simply broadcast the
444   /// value into a vector.
445   Value *getOrCreateVectorValue(Value *V, unsigned Part);
446 
447   /// Return a value in the new loop corresponding to \p V from the original
448   /// loop at unroll and vector indices \p Instance. If the value has been
449   /// vectorized but not scalarized, the necessary extractelement instruction
450   /// will be generated.
451   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
452 
453   /// Construct the vector value of a scalarized value \p V one lane at a time.
454   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
455 
456   /// Try to vectorize the interleaved access group that \p Instr belongs to,
457   /// optionally masking the vector operations if \p BlockInMask is non-null.
458   void vectorizeInterleaveGroup(Instruction *Instr,
459                                 VectorParts *BlockInMask = nullptr);
460 
461   /// Vectorize Load and Store instructions, optionally masking the vector
462   /// operations if \p BlockInMask is non-null.
463   void vectorizeMemoryInstruction(Instruction *Instr,
464                                   VectorParts *BlockInMask = nullptr);
465 
466   /// Set the debug location in the builder using the debug location in
467   /// the instruction.
468   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
469 
470   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
472 
473 protected:
474   friend class LoopVectorizationPlanner;
475 
476   /// A small list of PHINodes.
477   using PhiVector = SmallVector<PHINode *, 4>;
478 
479   /// A type for scalarized values in the new loop. Each value from the
480   /// original loop, when scalarized, is represented by UF x VF scalar values
481   /// in the new unrolled loop, where UF is the unroll factor and VF is the
482   /// vectorization factor.
483   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
484 
485   /// Set up the values of the IVs correctly when exiting the vector loop.
486   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
487                     Value *CountRoundDown, Value *EndValue,
488                     BasicBlock *MiddleBlock);
489 
490   /// Create a new induction variable inside L.
491   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
492                                    Value *Step, Instruction *DL);
493 
494   /// Handle all cross-iteration phis in the header.
495   void fixCrossIterationPHIs();
496 
497   /// Fix a first-order recurrence. This is the second phase of vectorizing
498   /// this phi node.
499   void fixFirstOrderRecurrence(PHINode *Phi);
500 
501   /// Fix a reduction cross-iteration phi. This is the second phase of
502   /// vectorizing this phi node.
503   void fixReduction(PHINode *Phi);
504 
  /// The loop exit block may have single-value PHI nodes with some incoming
  /// value. While vectorizing, we only handle real values that were defined
  /// inside the loop, and we should have one value for each predecessor of
  /// its parent basic block. See PR14725.
509   void fixLCSSAPHIs();
510 
511   /// Iteratively sink the scalarized operands of a predicated instruction into
512   /// the block that was created for it.
513   void sinkScalarOperands(Instruction *PredInst);
514 
515   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
516   /// represented as.
517   void truncateToMinimalBitwidths();
518 
  /// Insert the new loop into the loop hierarchy and pass manager,
  /// and update the analysis passes.
521   void updateAnalysis();
522 
523   /// Create a broadcast instruction. This method generates a broadcast
524   /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
  /// element.
528   virtual Value *getBroadcastInstrs(Value *V);
529 
  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
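  /// For example (illustrative only), if every lane of \p Val holds %iv,
  /// StartIdx is 0 and Step is 1, the result is
  /// <%iv + 0, %iv + 1, %iv + 2, %iv + 3> for a 4-element vector.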
533   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
534                                Instruction::BinaryOps Opcode =
535                                Instruction::BinaryOpsEnd);
536 
537   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
538   /// variable on which to base the steps, \p Step is the size of the step, and
539   /// \p EntryVal is the value from the original loop that maps to the steps.
540   /// Note that \p EntryVal doesn't have to be an induction variable - it
541   /// can also be a truncate instruction.
542   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
543                         const InductionDescriptor &ID);
544 
545   /// Create a vector induction phi node based on an existing scalar one. \p
546   /// EntryVal is the value from the original loop that maps to the vector phi
547   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
548   /// truncate instruction, instead of widening the original IV, we widen a
549   /// version of the IV truncated to \p EntryVal's type.
550   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
551                                        Value *Step, Instruction *EntryVal);
552 
553   /// Returns true if an instruction \p I should be scalarized instead of
554   /// vectorized for the chosen vectorization factor.
555   bool shouldScalarizeInstruction(Instruction *I) const;
556 
557   /// Returns true if we should generate a scalar version of \p IV.
558   bool needsScalarInduction(Instruction *IV) const;
559 
560   /// If there is a cast involved in the induction variable \p ID, which should
561   /// be ignored in the vectorized loop body, this function records the
562   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We have already proved that the casted Phi is equal to the uncasted
564   /// Phi in the vectorized loop (under a runtime guard), and therefore
565   /// there is no need to vectorize the cast - the same value can be used in the
566   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
568   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
569   ///
570   /// \p EntryVal is the value from the original loop that maps to the vector
571   /// phi node and is used to distinguish what is the IV currently being
572   /// processed - original one (if \p EntryVal is a phi corresponding to the
573   /// original IV) or the "newly-created" one based on the proof mentioned above
574   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
575   /// latter case \p EntryVal is a TruncInst and we must not record anything for
576   /// that IV, but it's error-prone to expect callers of this routine to care
577   /// about that, hence this explicit parameter.
578   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
579                                              const Instruction *EntryVal,
580                                              Value *VectorLoopValue,
581                                              unsigned Part,
582                                              unsigned Lane = UINT_MAX);
583 
584   /// Generate a shuffle sequence that will reverse the vector Vec.
585   virtual Value *reverseVector(Value *Vec);
586 
587   /// Returns (and creates if needed) the original loop trip count.
588   Value *getOrCreateTripCount(Loop *NewLoop);
589 
590   /// Returns (and creates if needed) the trip count of the widened loop.
591   Value *getOrCreateVectorTripCount(Loop *NewLoop);
592 
593   /// Returns a bitcasted value to the requested vector type.
594   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
595   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
596                                 const DataLayout &DL);
597 
598   /// Emit a bypass check to see if the vector trip count is zero, including if
599   /// it overflows.
600   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
601 
602   /// Emit a bypass check to see if all of the SCEV assumptions we've
603   /// had to make are correct.
604   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
605 
606   /// Emit bypass checks to check any memory assumptions we may have made.
607   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
608 
609   /// Compute the transformed value of Index at offset StartValue using step
610   /// StepValue.
611   /// For integer induction, returns StartValue + Index * StepValue.
612   /// For pointer induction, returns StartValue[Index * StepValue].
613   /// FIXME: The newly created binary instructions should contain nsw/nuw
614   /// flags, which can be found from the original scalar operations.
615   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
616                               const DataLayout &DL,
617                               const InductionDescriptor &ID) const;
618 
619   /// Add additional metadata to \p To that was not present on \p Orig.
620   ///
621   /// Currently this is used to add the noalias annotations based on the
622   /// inserted memchecks.  Use this for instructions that are *cloned* into the
623   /// vector loop.
624   void addNewMetadata(Instruction *To, const Instruction *Orig);
625 
626   /// Add metadata from one instruction to another.
627   ///
628   /// This includes both the original MDs from \p From and additional ones (\see
629   /// addNewMetadata).  Use this for *newly created* instructions in the vector
630   /// loop.
631   void addMetadata(Instruction *To, Instruction *From);
632 
633   /// Similar to the previous function but it adds the metadata to a
634   /// vector of instructions.
635   void addMetadata(ArrayRef<Value *> To, Instruction *From);
636 
637   /// The original loop.
638   Loop *OrigLoop;
639 
640   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
641   /// dynamic knowledge to simplify SCEV expressions and converts them to a
642   /// more usable form.
643   PredicatedScalarEvolution &PSE;
644 
645   /// Loop Info.
646   LoopInfo *LI;
647 
648   /// Dominator Tree.
649   DominatorTree *DT;
650 
651   /// Alias Analysis.
652   AliasAnalysis *AA;
653 
654   /// Target Library Info.
655   const TargetLibraryInfo *TLI;
656 
657   /// Target Transform Info.
658   const TargetTransformInfo *TTI;
659 
660   /// Assumption Cache.
661   AssumptionCache *AC;
662 
663   /// Interface to emit optimization remarks.
664   OptimizationRemarkEmitter *ORE;
665 
666   /// LoopVersioning.  It's only set up (non-null) if memchecks were
667   /// used.
668   ///
669   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
671   std::unique_ptr<LoopVersioning> LVer;
672 
673   /// The vectorization SIMD factor to use. Each vector will have this many
674   /// vector elements.
675   unsigned VF;
676 
677   /// The vectorization unroll factor to use. Each scalar is vectorized to this
678   /// many different vector instructions.
679   unsigned UF;
680 
681   /// The builder that we use
682   IRBuilder<> Builder;
683 
684   // --- Vectorization state ---
685 
686   /// The vector-loop preheader.
687   BasicBlock *LoopVectorPreHeader;
688 
689   /// The scalar-loop preheader.
690   BasicBlock *LoopScalarPreHeader;
691 
692   /// Middle Block between the vector and the scalar.
693   BasicBlock *LoopMiddleBlock;
694 
695   /// The ExitBlock of the scalar loop.
696   BasicBlock *LoopExitBlock;
697 
698   /// The vector loop body.
699   BasicBlock *LoopVectorBody;
700 
701   /// The scalar loop body.
702   BasicBlock *LoopScalarBody;
703 
704   /// A list of all bypass blocks. The first block is the entry of the loop.
705   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
706 
707   /// The new Induction variable which was added to the new block.
708   PHINode *Induction = nullptr;
709 
710   /// The induction variable of the old basic block.
711   PHINode *OldInduction = nullptr;
712 
713   /// Maps values from the original loop to their corresponding values in the
714   /// vectorized loop. A key value can map to either vector values, scalar
715   /// values or both kinds of values, depending on whether the key was
716   /// vectorized and scalarized.
717   VectorizerValueMap VectorLoopValueMap;
718 
719   /// Store instructions that were predicated.
720   SmallVector<Instruction *, 4> PredicatedInstructions;
721 
722   /// Trip count of the original loop.
723   Value *TripCount = nullptr;
724 
725   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
726   Value *VectorTripCount = nullptr;
727 
728   /// The legality analysis.
729   LoopVectorizationLegality *Legal;
730 
  /// The profitability analysis.
732   LoopVectorizationCostModel *Cost;
733 
734   // Record whether runtime checks are added.
735   bool AddedSafetyChecks = false;
736 
737   // Holds the end values for each induction variable. We save the end values
738   // so we can later fix-up the external users of the induction variables.
739   DenseMap<PHINode *, Value *> IVEndValues;
740 
741   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
742   // fixed up at the end of vector code generation.
743   SmallVector<PHINode *, 8> OrigPHIsToFix;
744 };
745 
746 class InnerLoopUnroller : public InnerLoopVectorizer {
747 public:
748   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
749                     LoopInfo *LI, DominatorTree *DT,
750                     const TargetLibraryInfo *TLI,
751                     const TargetTransformInfo *TTI, AssumptionCache *AC,
752                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
753                     LoopVectorizationLegality *LVL,
754                     LoopVectorizationCostModel *CM)
755       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
756                             UnrollFactor, LVL, CM) {}
757 
758 private:
759   Value *getBroadcastInstrs(Value *V) override;
760   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
761                        Instruction::BinaryOps Opcode =
762                        Instruction::BinaryOpsEnd) override;
763   Value *reverseVector(Value *Vec) override;
764 };
765 
766 } // end namespace llvm
767 
/// Look for a meaningful debug location on the instruction or its
/// operands.
770 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
771   if (!I)
772     return I;
773 
774   DebugLoc Empty;
775   if (I->getDebugLoc() != Empty)
776     return I;
777 
  for (Value *Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }
783 
784   return I;
785 }
786 
787 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
788   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
789     const DILocation *DIL = Inst->getDebugLoc();
790     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
791         !isa<DbgInfoIntrinsic>(Inst)) {
792       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
793       if (NewDIL)
794         B.SetCurrentDebugLocation(NewDIL.getValue());
795       else
796         LLVM_DEBUG(dbgs()
797                    << "Failed to create new discriminator: "
798                    << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
802   } else
803     B.SetCurrentDebugLocation(DebugLoc());
804 }
805 
806 /// Write a record \p DebugMsg about vectorization failure to the debug
807 /// output stream. If \p I is passed, it is an instruction that prevents
808 /// vectorization.
809 #ifndef NDEBUG
810 static void debugVectorizationFailure(const StringRef DebugMsg,
811     Instruction *I) {
812   dbgs() << "LV: Not vectorizing: " << DebugMsg;
813   if (I != nullptr)
814     dbgs() << " " << *I;
815   else
816     dbgs() << '.';
817   dbgs() << '\n';
818 }
819 #endif
820 
821 /// Create an analysis remark that explains why vectorization failed
822 ///
823 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
824 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
825 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
826 /// the location of the remark.  \return the remark object that can be
827 /// streamed to.
828 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
829     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
830   Value *CodeRegion = TheLoop->getHeader();
831   DebugLoc DL = TheLoop->getStartLoc();
832 
833   if (I) {
834     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
837     if (I->getDebugLoc())
838       DL = I->getDebugLoc();
839   }
840 
841   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
842   R << "loop not vectorized: ";
843   return R;
844 }
845 
846 namespace llvm {
847 
848 void reportVectorizationFailure(const StringRef DebugMsg,
849     const StringRef OREMsg, const StringRef ORETag,
850     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
851   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
852   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
853   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
854                 ORETag, TheLoop, I) << OREMsg);
855 }
856 
857 } // end namespace llvm
858 
859 #ifndef NDEBUG
860 /// \return string containing a file name and a line # for the given loop.
861 static std::string getDebugLocString(const Loop *L) {
862   std::string Result;
863   if (L) {
864     raw_string_ostream OS(Result);
865     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
866       LoopDbgLoc.print(OS);
867     else
868       // Just print the module name.
869       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
870     OS.flush();
871   }
872   return Result;
873 }
874 #endif
875 
876 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
877                                          const Instruction *Orig) {
878   // If the loop was versioned with memchecks, add the corresponding no-alias
879   // metadata.
880   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
881     LVer->annotateInstWithNoAlias(To, Orig);
882 }
883 
884 void InnerLoopVectorizer::addMetadata(Instruction *To,
885                                       Instruction *From) {
886   propagateMetadata(To, From);
887   addNewMetadata(To, From);
888 }
889 
890 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
891                                       Instruction *From) {
892   for (Value *V : To) {
893     if (Instruction *I = dyn_cast<Instruction>(V))
894       addMetadata(I, From);
895   }
896 }
897 
898 namespace llvm {
899 
// Options for how the loop vectorization cost model should lower the scalar
// epilogue loop.
902 enum ScalarEpilogueLowering {
903 
904   // The default: allowing scalar epilogues.
905   CM_ScalarEpilogueAllowed,
906 
907   // Vectorization with OptForSize: don't allow epilogues.
908   CM_ScalarEpilogueNotAllowedOptSize,
909 
  // A special case of vectorization with OptForSize: loops with a very small
911   // trip count are considered for vectorization under OptForSize, thereby
912   // making sure the cost of their loop body is dominant, free of runtime
913   // guards and scalar iteration overheads.
914   CM_ScalarEpilogueNotAllowedLowTripLoop,
915 
916   // Loop hint predicate indicating an epilogue is undesired.
917   CM_ScalarEpilogueNotNeededUsePredicate
918 };
919 
920 /// LoopVectorizationCostModel - estimates the expected speedups due to
921 /// vectorization.
922 /// In many cases vectorization is not profitable. This can happen because of
923 /// a number of reasons. In this class we mainly attempt to predict the
924 /// expected speedup/slowdowns due to the supported instruction set. We use the
925 /// TargetTransformInfo to query the different backends for the cost of
926 /// different operations.
927 class LoopVectorizationCostModel {
928 public:
929   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
930                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
931                              LoopVectorizationLegality *Legal,
932                              const TargetTransformInfo &TTI,
933                              const TargetLibraryInfo *TLI, DemandedBits *DB,
934                              AssumptionCache *AC,
935                              OptimizationRemarkEmitter *ORE, const Function *F,
936                              const LoopVectorizeHints *Hints,
937                              InterleavedAccessInfo &IAI)
938       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
939         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
940         Hints(Hints), InterleaveInfo(IAI) {}
941 
942   /// \return An upper bound for the vectorization factor, or None if
943   /// vectorization and interleaving should be avoided up front.
944   Optional<unsigned> computeMaxVF();
945 
946   /// \return True if runtime checks are required for vectorization, and false
947   /// otherwise.
948   bool runtimeChecksRequired();
949 
950   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
  /// then this vectorization factor will be selected if vectorization is
953   /// possible.
954   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
955 
956   /// Setup cost-based decisions for user vectorization factor.
957   void selectUserVectorizationFactor(unsigned UserVF) {
958     collectUniformsAndScalars(UserVF);
959     collectInstsToScalarize(UserVF);
960   }
961 
962   /// \return The size (in bits) of the smallest and widest types in the code
963   /// that needs to be vectorized. We ignore values that remain scalar such as
964   /// 64 bit loop indices.
965   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
966 
967   /// \return The desired interleave count.
968   /// If interleave count has been specified by metadata it will be returned.
969   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
970   /// are the selected vectorization factor and the cost of the selected VF.
971   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
972 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
980   void setCostBasedWideningDecision(unsigned VF);
981 
982   /// A struct that represents some properties of the register usage
983   /// of a loop.
984   struct RegisterUsage {
985     /// Holds the number of loop invariant values that are used in the loop.
986     unsigned LoopInvariantRegs;
987 
988     /// Holds the maximum number of concurrent live intervals in the loop.
989     unsigned MaxLocalUsers;
990   };
991 
  /// \return Information about the register usage of the loop for the
  /// given vectorization factors.
994   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
995 
996   /// Collect values we want to ignore in the cost model.
997   void collectValuesToIgnore();
998 
999   /// \returns The smallest bitwidth each instruction can be represented with.
1000   /// The vector equivalents of these instructions should be truncated to this
1001   /// type.
1002   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1003     return MinBWs;
1004   }
1005 
1006   /// \returns True if it is more profitable to scalarize instruction \p I for
1007   /// vectorization factor \p VF.
1008   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1009     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1010 
1011     // Cost model is not run in the VPlan-native path - return conservative
1012     // result until this changes.
1013     if (EnableVPlanNativePath)
1014       return false;
1015 
1016     auto Scalars = InstsToScalarize.find(VF);
1017     assert(Scalars != InstsToScalarize.end() &&
1018            "VF not yet analyzed for scalarization profitability");
1019     return Scalars->second.find(I) != Scalars->second.end();
1020   }
1021 
1022   /// Returns true if \p I is known to be uniform after vectorization.
1023   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1024     if (VF == 1)
1025       return true;
1026 
1027     // Cost model is not run in the VPlan-native path - return conservative
1028     // result until this changes.
1029     if (EnableVPlanNativePath)
1030       return false;
1031 
1032     auto UniformsPerVF = Uniforms.find(VF);
1033     assert(UniformsPerVF != Uniforms.end() &&
1034            "VF not yet analyzed for uniformity");
1035     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1036   }
1037 
1038   /// Returns true if \p I is known to be scalar after vectorization.
1039   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1040     if (VF == 1)
1041       return true;
1042 
1043     // Cost model is not run in the VPlan-native path - return conservative
1044     // result until this changes.
1045     if (EnableVPlanNativePath)
1046       return false;
1047 
1048     auto ScalarsPerVF = Scalars.find(VF);
1049     assert(ScalarsPerVF != Scalars.end() &&
1050            "Scalar values are not calculated for VF");
1051     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1052   }
1053 
1054   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1055   /// for vectorization factor \p VF.
1056   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1057     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1058            !isProfitableToScalarize(I, VF) &&
1059            !isScalarAfterVectorization(I, VF);
1060   }
1061 
1062   /// Decision that was taken during cost calculation for memory instruction.
1063   enum InstWidening {
1064     CM_Unknown,
1065     CM_Widen,         // For consecutive accesses with stride +1.
1066     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1067     CM_Interleave,
1068     CM_GatherScatter,
1069     CM_Scalarize
1070   };
1071 
1072   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1073   /// instruction \p I and vector width \p VF.
1074   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1075                            unsigned Cost) {
1076     assert(VF >= 2 && "Expected VF >=2");
1077     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1078   }
1079 
1080   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1081   /// interleaving group \p Grp and vector width \p VF.
1082   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1083                            InstWidening W, unsigned Cost) {
1084     assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
1087     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1088       if (auto *I = Grp->getMember(i)) {
1089         if (Grp->getInsertPos() == I)
1090           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1091         else
1092           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1093       }
1094     }
1095   }
1096 
1097   /// Return the cost model decision for the given instruction \p I and vector
1098   /// width \p VF. Return CM_Unknown if this instruction did not pass
1099   /// through the cost modeling.
1100   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1101     assert(VF >= 2 && "Expected VF >=2");
1102 
1103     // Cost model is not run in the VPlan-native path - return conservative
1104     // result until this changes.
1105     if (EnableVPlanNativePath)
1106       return CM_GatherScatter;
1107 
1108     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1109     auto Itr = WideningDecisions.find(InstOnVF);
1110     if (Itr == WideningDecisions.end())
1111       return CM_Unknown;
1112     return Itr->second.first;
1113   }
1114 
1115   /// Return the vectorization cost for the given instruction \p I and vector
1116   /// width \p VF.
1117   unsigned getWideningCost(Instruction *I, unsigned VF) {
1118     assert(VF >= 2 && "Expected VF >=2");
1119     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1120     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1121            "The cost is not calculated");
1122     return WideningDecisions[InstOnVF].second;
1123   }
1124 
1125   /// Return True if instruction \p I is an optimizable truncate whose operand
1126   /// is an induction variable. Such a truncate will be removed by adding a new
1127   /// induction variable with the destination type.
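  /// For example (illustrative only), in
  ///
  ///   for (long i = 0; i < n; ++i)
  ///     use((int)i);
  ///
  /// the truncate of the i64 induction variable to i32 can be removed by
  /// introducing a new i32 induction variable, unless the truncate is free
  /// for the target (the primary induction variable is treated specially;
  /// see below).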
1128   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1129     // If the instruction is not a truncate, return false.
1130     auto *Trunc = dyn_cast<TruncInst>(I);
1131     if (!Trunc)
1132       return false;
1133 
1134     // Get the source and destination types of the truncate.
1135     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1136     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1137 
1138     // If the truncate is free for the given types, return false. Replacing a
1139     // free truncate with an induction variable would add an induction variable
1140     // update instruction to each iteration of the loop. We exclude from this
1141     // check the primary induction variable since it will need an update
1142     // instruction regardless.
1143     Value *Op = Trunc->getOperand(0);
1144     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1145       return false;
1146 
1147     // If the truncated value is not an induction variable, return false.
1148     return Legal->isInductionPhi(Op);
1149   }
1150 
1151   /// Collects the instructions to scalarize for each predicated instruction in
1152   /// the loop.
1153   void collectInstsToScalarize(unsigned VF);
1154 
1155   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decisions for Load/Store instructions
  /// that may be vectorized as interleaved, gather-scatter or scalarized
  /// accesses.
1158   void collectUniformsAndScalars(unsigned VF) {
1159     // Do the analysis once.
1160     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1161       return;
1162     setCostBasedWideningDecision(VF);
1163     collectLoopUniforms(VF);
1164     collectLoopScalars(VF);
1165   }
1166 
1167   /// Returns true if the target machine supports masked store operation
1168   /// for the given \p DataType and kind of access to \p Ptr.
1169   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1170     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1171   }
1172 
1173   /// Returns true if the target machine supports masked load operation
1174   /// for the given \p DataType and kind of access to \p Ptr.
1175   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1176     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1177   }
1178 
1179   /// Returns true if the target machine supports masked scatter operation
1180   /// for the given \p DataType.
1181   bool isLegalMaskedScatter(Type *DataType) {
1182     return TTI.isLegalMaskedScatter(DataType);
1183   }
1184 
1185   /// Returns true if the target machine supports masked gather operation
1186   /// for the given \p DataType.
1187   bool isLegalMaskedGather(Type *DataType) {
1188     return TTI.isLegalMaskedGather(DataType);
1189   }
1190 
1191   /// Returns true if the target machine can represent \p V as a masked gather
1192   /// or scatter operation.
1193   bool isLegalGatherOrScatter(Value *V) {
1194     bool LI = isa<LoadInst>(V);
1195     bool SI = isa<StoreInst>(V);
1196     if (!LI && !SI)
1197       return false;
1198     auto *Ty = getMemInstValueType(V);
1199     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1200   }
1201 
1202   /// Returns true if \p I is an instruction that will be scalarized with
1203   /// predication. Such instructions include conditional stores and
1204   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be scalarized
  /// with predication for that VF.
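  /// For example (illustrative only), in
  ///
  ///   for (int i = 0; i < n; ++i)
  ///     if (B[i] != 0)
  ///       A[i] = C[i] / B[i];
  ///
  /// both the conditional store to A[i] and the division, which could trap on
  /// a zero divisor in masked-off lanes, may need to be scalarized with
  /// predication, depending on what the target supports.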
1207   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1208 
1209   // Returns true if \p I is an instruction that will be predicated either
1210   // through scalar predication or masked load/store or masked gather/scatter.
1211   // Superset of instructions that return true for isScalarWithPredication.
1212   bool isPredicatedInst(Instruction *I) {
1213     if (!blockNeedsPredication(I->getParent()))
1214       return false;
1215     // Loads and stores that need some form of masked operation are predicated
1216     // instructions.
1217     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1218       return Legal->isMaskRequired(I);
1219     return isScalarWithPredication(I);
1220   }
1221 
1222   /// Returns true if \p I is a memory instruction with consecutive memory
1223   /// access that can be widened.
1224   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1225 
1226   /// Returns true if \p I is a memory instruction in an interleaved-group
1227   /// of memory accesses that can be vectorized with wide vector loads/stores
1228   /// and shuffles.
1229   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1230 
1231   /// Check if \p Instr belongs to any interleaved access group.
1232   bool isAccessInterleaved(Instruction *Instr) {
1233     return InterleaveInfo.isInterleaved(Instr);
1234   }
1235 
1236   /// Get the interleaved access group that \p Instr belongs to.
1237   const InterleaveGroup<Instruction> *
1238   getInterleavedAccessGroup(Instruction *Instr) {
1239     return InterleaveInfo.getInterleaveGroup(Instr);
1240   }
1241 
1242   /// Returns true if an interleaved group requires a scalar iteration
1243   /// to handle accesses with gaps, and there is nothing preventing us from
1244   /// creating a scalar epilogue.
1245   bool requiresScalarEpilogue() const {
1246     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1247   }
1248 
1249   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1250   /// loop hint annotation.
1251   bool isScalarEpilogueAllowed() const {
1252     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1253   }
1254 
1255   /// Returns true if all loop blocks should be masked to fold tail loop.
1256   bool foldTailByMasking() const { return FoldTailByMasking; }
1257 
1258   bool blockNeedsPredication(BasicBlock *BB) {
1259     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1260   }
1261 
1262   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1263   /// with factor VF.  Return the cost of the instruction, including
1264   /// scalarization overhead if it's needed.
1265   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1266 
  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or is too
  /// expensive.
1272   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1273 
1274 private:
1275   unsigned NumPredStores = 0;
1276 
1277   /// \return An upper bound for the vectorization factor, larger than zero.
1278   /// One is returned if vectorization should best be avoided due to cost.
1279   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1280 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1288   using VectorizationCostTy = std::pair<unsigned, bool>;
1289 
1290   /// Returns the expected execution cost. The unit of the cost does
1291   /// not matter because we use the 'cost' units to compare different
1292   /// vector widths. The cost that is returned is *not* normalized by
1293   /// the factor width.
1294   VectorizationCostTy expectedCost(unsigned VF);
1295 
1296   /// Returns the execution time cost of an instruction for a given vector
1297   /// width. Vector width of one means scalar.
1298   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1299 
1300   /// The cost-computation logic from getInstructionCost which provides
1301   /// the vector type as an output parameter.
1302   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1303 
1304   /// Calculate vectorization cost of memory instruction \p I.
1305   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1306 
1307   /// The cost computation for scalarized memory instruction.
1308   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1309 
1310   /// The cost computation for interleaving group of memory instructions.
1311   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1312 
1313   /// The cost computation for Gather/Scatter instruction.
1314   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1315 
1316   /// The cost computation for widening instruction \p I with consecutive
1317   /// memory access.
1318   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1319 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
1324   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1325 
1326   /// Estimate the overhead of scalarizing an instruction. This is a
1327   /// convenience wrapper for the type-based getScalarizationOverhead API.
1328   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1329 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1332   bool isConsecutiveLoadOrStore(Instruction *I);
1333 
1334   /// Returns true if an artificially high cost for emulated masked memrefs
1335   /// should be used.
1336   bool useEmulatedMaskMemRefHack(Instruction *I);
1337 
1338   /// Map of scalar integer values to the smallest bitwidth they can be legally
1339   /// represented as. The vector equivalents of these values should be truncated
1340   /// to this type.
1341   MapVector<Instruction *, uint64_t> MinBWs;
1342 
1343   /// A type representing the costs for instructions if they were to be
1344   /// scalarized rather than vectorized. The entries are Instruction-Cost
1345   /// pairs.
1346   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1347 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1350   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1351 
1352   /// Records whether it is allowed to have the original scalar loop execute at
1353   /// least once. This may be needed as a fallback loop in case runtime
1354   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not divisible by the VF,
1356   /// or as a peel-loop to handle gaps in interleave-groups.
1357   /// Under optsize and when the trip count is very small we don't allow any
1358   /// iterations to execute in the scalar loop.
1359   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1360 
1361   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1362   bool FoldTailByMasking = false;
1363 
1364   /// A map holding scalar costs for different vectorization factors. The
1365   /// presence of a cost for an instruction in the mapping indicates that the
1366   /// instruction will be scalarized when vectorizing with the associated
1367   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1368   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1369 
1370   /// Holds the instructions known to be uniform after vectorization.
1371   /// The data is collected per VF.
1372   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1373 
1374   /// Holds the instructions known to be scalar after vectorization.
1375   /// The data is collected per VF.
1376   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1377 
1378   /// Holds the instructions (address computations) that are forced to be
1379   /// scalarized.
1380   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1381 
1382   /// Returns the expected difference in cost from scalarizing the expression
1383   /// feeding a predicated instruction \p PredInst. The instructions to
1384   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1385   /// non-negative return value implies the expression will be scalarized.
1386   /// Currently, only single-use chains are considered for scalarization.
1387   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1388                               unsigned VF);
1389 
1390   /// Collect the instructions that are uniform after vectorization. An
1391   /// instruction is uniform if we represent it with a single scalar value in
1392   /// the vectorized loop corresponding to each vector iteration. Examples of
1393   /// uniform instructions include pointer operands of consecutive or
1394   /// interleaved memory accesses. Note that although uniformity implies an
1395   /// instruction will be scalar, the reverse is not true. In general, a
1396   /// scalarized instruction will be represented by VF scalar values in the
1397   /// vectorized loop, each corresponding to an iteration of the original
1398   /// scalar loop.
1399   void collectLoopUniforms(unsigned VF);
1400 
1401   /// Collect the instructions that are scalar after vectorization. An
1402   /// instruction is scalar if it is known to be uniform or will be scalarized
1403   /// during vectorization. Non-uniform scalarized instructions will be
1404   /// represented by VF values in the vectorized loop, each corresponding to an
1405   /// iteration of the original scalar loop.
1406   void collectLoopScalars(unsigned VF);
1407 
1408   /// Keeps cost model vectorization decision and cost for instructions.
1409   /// Right now it is used for memory instructions only.
1410   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1411                                 std::pair<InstWidening, unsigned>>;
1412 
1413   DecisionList WideningDecisions;
1414 
1415   /// Returns true if \p V is expected to be vectorized and it needs to be
1416   /// extracted.
1417   bool needsExtract(Value *V, unsigned VF) const {
1418     Instruction *I = dyn_cast<Instruction>(V);
1419     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1420       return false;
1421 
1422     // Assume we can vectorize V (and hence we need extraction) if the
1423     // scalars are not computed yet. This can happen, because it is called
1424     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1425     // the scalars are collected. That should be a safe assumption in most
1426     // cases, because we check if the operands have vectorizable types
1427     // beforehand in LoopVectorizationLegality.
1428     return Scalars.find(VF) == Scalars.end() ||
1429            !isScalarAfterVectorization(I, VF);
1430   };
1431 
1432   /// Returns a range containing only operands needing to be extracted.
1433   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1434                                                    unsigned VF) {
1435     return SmallVector<Value *, 4>(make_filter_range(
1436         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1437   }
1438 
1439 public:
1440   /// The loop that we evaluate.
1441   Loop *TheLoop;
1442 
1443   /// Predicated scalar evolution analysis.
1444   PredicatedScalarEvolution &PSE;
1445 
1446   /// Loop Info analysis.
1447   LoopInfo *LI;
1448 
1449   /// Vectorization legality.
1450   LoopVectorizationLegality *Legal;
1451 
1452   /// Vector target information.
1453   const TargetTransformInfo &TTI;
1454 
1455   /// Target Library Info.
1456   const TargetLibraryInfo *TLI;
1457 
1458   /// Demanded bits analysis.
1459   DemandedBits *DB;
1460 
1461   /// Assumption cache.
1462   AssumptionCache *AC;
1463 
1464   /// Interface to emit optimization remarks.
1465   OptimizationRemarkEmitter *ORE;
1466 
1467   const Function *TheFunction;
1468 
1469   /// Loop Vectorize Hint.
1470   const LoopVectorizeHints *Hints;
1471 
1472   /// The interleave access information contains groups of interleaved accesses
1473   /// with the same stride and close to each other.
1474   InterleavedAccessInfo &InterleaveInfo;
1475 
1476   /// Values to ignore in the cost model.
1477   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1478 
1479   /// Values to ignore in the cost model when VF > 1.
1480   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1481 };
1482 
1483 } // end namespace llvm
1484 
1485 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1486 // vectorization. The loop needs to be annotated with #pragma omp simd
1487 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1488 // vector length information is not provided, vectorization is not considered
1489 // explicit. Interleave hints are not allowed either. These limitations will be
1490 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1492 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1493 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1494 // provides *explicit vectorization hints* (LV can bypass legal checks and
1495 // assume that vectorization is legal). However, both hints are implemented
1496 // using the same metadata (llvm.loop.vectorize, processed by
1497 // LoopVectorizeHints). This will be fixed in the future when the native IR
1498 // representation for pragma 'omp simd' is introduced.
1499 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1500                                    OptimizationRemarkEmitter *ORE) {
1501   assert(!OuterLp->empty() && "This is not an outer loop");
1502   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1503 
1504   // Only outer loops with an explicit vectorization hint are supported.
1505   // Unannotated outer loops are ignored.
1506   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1507     return false;
1508 
1509   Function *Fn = OuterLp->getHeader()->getParent();
1510   if (!Hints.allowVectorization(Fn, OuterLp,
1511                                 true /*VectorizeOnlyWhenForced*/)) {
1512     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1513     return false;
1514   }
1515 
1516   if (Hints.getInterleave() > 1) {
1517     // TODO: Interleave support is future work.
1518     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1519                          "outer loops.\n");
1520     Hints.emitRemarkWithHints();
1521     return false;
1522   }
1523 
1524   return true;
1525 }
1526 
1527 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1528                                   OptimizationRemarkEmitter *ORE,
1529                                   SmallVectorImpl<Loop *> &V) {
1530   // Collect inner loops and outer loops without irreducible control flow. For
1531   // now, only collect outer loops that have explicit vectorization hints. If we
1532   // are stress testing the VPlan H-CFG construction, we collect the outermost
1533   // loop of every loop nest.
1534   if (L.empty() || VPlanBuildStressTest ||
1535       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1536     LoopBlocksRPO RPOT(&L);
1537     RPOT.perform(LI);
1538     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1539       V.push_back(&L);
1540       // TODO: Collect inner loops inside marked outer loops in case
1541       // vectorization fails for the outer loop. Do not invoke
1542       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1543       // already known to be reducible. We can use an inherited attribute for
1544       // that.
1545       return;
1546     }
1547   }
1548   for (Loop *InnerL : L)
1549     collectSupportedLoops(*InnerL, LI, ORE, V);
1550 }
1551 
1552 namespace {
1553 
1554 /// The LoopVectorize Pass.
1555 struct LoopVectorize : public FunctionPass {
1556   /// Pass identification, replacement for typeid
1557   static char ID;
1558 
1559   LoopVectorizePass Impl;
1560 
1561   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1562                          bool VectorizeOnlyWhenForced = false)
1563       : FunctionPass(ID) {
1564     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1565     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1566     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1567   }
1568 
1569   bool runOnFunction(Function &F) override {
1570     if (skipFunction(F))
1571       return false;
1572 
1573     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1574     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1575     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1576     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1577     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1578     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1579     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1580     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1581     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1582     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1583     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1584     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1585     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1586 
1587     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1588         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1589 
1590     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1591                         GetLAA, *ORE, PSI);
1592   }
1593 
1594   void getAnalysisUsage(AnalysisUsage &AU) const override {
1595     AU.addRequired<AssumptionCacheTracker>();
1596     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1597     AU.addRequired<DominatorTreeWrapperPass>();
1598     AU.addRequired<LoopInfoWrapperPass>();
1599     AU.addRequired<ScalarEvolutionWrapperPass>();
1600     AU.addRequired<TargetTransformInfoWrapperPass>();
1601     AU.addRequired<AAResultsWrapperPass>();
1602     AU.addRequired<LoopAccessLegacyAnalysis>();
1603     AU.addRequired<DemandedBitsWrapperPass>();
1604     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1605 
1606     // We currently do not preserve loopinfo/dominator analyses with outer loop
1607     // vectorization. Until this is addressed, mark these analyses as preserved
1608     // only for non-VPlan-native path.
1609     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1610     if (!EnableVPlanNativePath) {
1611       AU.addPreserved<LoopInfoWrapperPass>();
1612       AU.addPreserved<DominatorTreeWrapperPass>();
1613     }
1614 
1615     AU.addPreserved<BasicAAWrapperPass>();
1616     AU.addPreserved<GlobalsAAWrapperPass>();
1617     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1618   }
1619 };
1620 
1621 } // end anonymous namespace
1622 
1623 //===----------------------------------------------------------------------===//
1624 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1625 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1626 //===----------------------------------------------------------------------===//
1627 
1628 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1629   // We need to place the broadcast of invariant variables outside the loop,
1630   // but only if it's proven safe to do so. Else, broadcast will be inside
1631   // vector loop body.
1632   Instruction *Instr = dyn_cast<Instruction>(V);
1633   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1634                      (!Instr ||
1635                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1636   // Place the code for broadcasting invariant variables in the new preheader.
1637   IRBuilder<>::InsertPointGuard Guard(Builder);
1638   if (SafeToHoist)
1639     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1640 
1641   // Broadcast the scalar into all locations in the vector.
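  // E.g., for VF = 4 CreateVectorSplat typically emits an insertelement into
  // lane zero followed by a shufflevector with a zero mask (constants may be
  // folded by the IRBuilder).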
1642   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1643 
1644   return Shuf;
1645 }
1646 
1647 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1648     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1649   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1650          "Expected either an induction phi-node or a truncate of it!");
1651   Value *Start = II.getStartValue();
1652 
  // Construct the initial value of the vector IV in the vector loop preheader.
1654   auto CurrIP = Builder.saveIP();
1655   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1656   if (isa<TruncInst>(EntryVal)) {
1657     assert(Start->getType()->isIntegerTy() &&
1658            "Truncation requires an integer type");
1659     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1660     Step = Builder.CreateTrunc(Step, TruncType);
1661     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1662   }
1663   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1664   Value *SteppedStart =
1665       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1666 
1667   // We create vector phi nodes for both integer and floating-point induction
1668   // variables. Here, we determine the kind of arithmetic we will perform.
1669   Instruction::BinaryOps AddOp;
1670   Instruction::BinaryOps MulOp;
1671   if (Step->getType()->isIntegerTy()) {
1672     AddOp = Instruction::Add;
1673     MulOp = Instruction::Mul;
1674   } else {
1675     AddOp = II.getInductionOpcode();
1676     MulOp = Instruction::FMul;
1677   }
1678 
1679   // Multiply the vectorization factor by the step using integer or
1680   // floating-point arithmetic as appropriate.
1681   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1682   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1683 
1684   // Create a vector splat to use in the induction update.
1685   //
1686   // FIXME: If the step is non-constant, we create the vector splat with
1687   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1688   //        handle a constant vector splat.
1689   Value *SplatVF = isa<Constant>(Mul)
1690                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1691                        : Builder.CreateVectorSplat(VF, Mul);
1692   Builder.restoreIP(CurrIP);
1693 
1694   // We may need to add the step a number of times, depending on the unroll
1695   // factor. The last of those goes into the PHI.
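  // E.g., for an i32 IV starting at 0 with step 1, UF = 2 and VF = 4, this
  // roughly produces:
  //   %vec.ind      = phi <4 x i32> [ <0, 1, 2, 3>, %preheader ],
  //                                 [ %vec.ind.next, %latch ]
  //   %step.add     = add <4 x i32> %vec.ind, <4, 4, 4, 4>
  //   %vec.ind.next = add <4 x i32> %step.add, <4, 4, 4, 4>
  // where part 0 uses %vec.ind and part 1 uses %step.add.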
1696   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1697                                     &*LoopVectorBody->getFirstInsertionPt());
1698   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1699   Instruction *LastInduction = VecInd;
1700   for (unsigned Part = 0; Part < UF; ++Part) {
1701     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1702 
1703     if (isa<TruncInst>(EntryVal))
1704       addMetadata(LastInduction, EntryVal);
1705     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1706 
1707     LastInduction = cast<Instruction>(addFastMathFlag(
1708         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1709     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1710   }
1711 
1712   // Move the last step to the end of the latch block. This ensures consistent
1713   // placement of all induction updates.
1714   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1715   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1716   auto *ICmp = cast<Instruction>(Br->getCondition());
1717   LastInduction->moveBefore(ICmp);
1718   LastInduction->setName("vec.ind.next");
1719 
1720   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1721   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1722 }
1723 
1724 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1725   return Cost->isScalarAfterVectorization(I, VF) ||
1726          Cost->isProfitableToScalarize(I, VF);
1727 }
1728 
1729 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1730   if (shouldScalarizeInstruction(IV))
1731     return true;
1732   auto isScalarInst = [&](User *U) -> bool {
1733     auto *I = cast<Instruction>(U);
1734     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1735   };
1736   return llvm::any_of(IV->users(), isScalarInst);
1737 }
1738 
1739 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1740     const InductionDescriptor &ID, const Instruction *EntryVal,
1741     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1742   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1743          "Expected either an induction phi-node or a truncate of it!");
1744 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1751   if (isa<TruncInst>(EntryVal))
1752     return;
1753 
1754   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1755   if (Casts.empty())
1756     return;
1757   // Only the first Cast instruction in the Casts vector is of interest.
1758   // The rest of the Casts (if exist) have no uses outside the
1759   // induction update chain itself.
1760   Instruction *CastInst = *Casts.begin();
1761   if (Lane < UINT_MAX)
1762     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1763   else
1764     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1765 }
1766 
1767 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1768   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1769          "Primary induction variable must have an integer type");
1770 
1771   auto II = Legal->getInductionVars()->find(IV);
1772   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1773 
1774   auto ID = II->second;
1775   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1776 
1777   // The scalar value to broadcast. This will be derived from the canonical
1778   // induction variable.
1779   Value *ScalarIV = nullptr;
1780 
1781   // The value from the original loop to which we are mapping the new induction
1782   // variable.
1783   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1784 
1785   // True if we have vectorized the induction variable.
1786   auto VectorizedIV = false;
1787 
1788   // Determine if we want a scalar version of the induction variable. This is
1789   // true if the induction variable itself is not widened, or if it has at
1790   // least one user in the loop that is not widened.
1791   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1792 
1793   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1795   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1796          "Induction step should be loop invariant");
1797   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1798   Value *Step = nullptr;
1799   if (PSE.getSE()->isSCEVable(IV->getType())) {
1800     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1801     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1802                              LoopVectorPreHeader->getTerminator());
1803   } else {
1804     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1805   }
1806 
1807   // Try to create a new independent vector induction variable. If we can't
1808   // create the phi node, we will splat the scalar induction variable in each
1809   // loop iteration.
1810   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1811     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1812     VectorizedIV = true;
1813   }
1814 
1815   // If we haven't yet vectorized the induction variable, or if we will create
1816   // a scalar one, we need to define the scalar induction variable and step
1817   // values. If we were given a truncation type, truncate the canonical
1818   // induction variable and step. Otherwise, derive these values from the
1819   // induction descriptor.
1820   if (!VectorizedIV || NeedsScalarIV) {
1821     ScalarIV = Induction;
1822     if (IV != OldInduction) {
1823       ScalarIV = IV->getType()->isIntegerTy()
1824                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1825                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1826                                           IV->getType());
1827       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1828       ScalarIV->setName("offset.idx");
1829     }
1830     if (Trunc) {
1831       auto *TruncType = cast<IntegerType>(Trunc->getType());
1832       assert(Step->getType()->isIntegerTy() &&
1833              "Truncation requires an integer step");
1834       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1835       Step = Builder.CreateTrunc(Step, TruncType);
1836     }
1837   }
1838 
1839   // If we haven't yet vectorized the induction variable, splat the scalar
1840   // induction variable, and build the necessary step vectors.
1841   // TODO: Don't do it unless the vectorized IV is really required.
1842   if (!VectorizedIV) {
1843     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1844     for (unsigned Part = 0; Part < UF; ++Part) {
1845       Value *EntryPart =
1846           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1847       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1848       if (Trunc)
1849         addMetadata(EntryPart, Trunc);
1850       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1851     }
1852   }
1853 
1854   // If an induction variable is only used for counting loop iterations or
1855   // calculating addresses, it doesn't need to be widened. Create scalar steps
1856   // that can be used by instructions we will later scalarize. Note that the
1857   // addition of the scalar steps will not increase the number of instructions
1858   // in the loop in the common case prior to InstCombine. We will be trading
1859   // one vector extract for each scalar step.
1860   if (NeedsScalarIV)
1861     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1862 }
1863 
1864 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1865                                           Instruction::BinaryOps BinOp) {
1866   // Create and check the types.
1867   assert(Val->getType()->isVectorTy() && "Must be a vector");
1868   int VLen = Val->getType()->getVectorNumElements();
1869 
1870   Type *STy = Val->getType()->getScalarType();
1871   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1872          "Induction Step must be an integer or FP");
1873   assert(Step->getType() == STy && "Step has wrong type");
1874 
1875   SmallVector<Constant *, 8> Indices;
1876 
1877   if (STy->isIntegerTy()) {
1878     // Create a vector of consecutive numbers from zero to VF.
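    // E.g., with VF = 4, StartIdx = 0 and integer step S this builds
    // <0, 1, 2, 3>, which is multiplied by the splatted step and added to
    // Val, so lane i gets Val[i] + (StartIdx + i) * S.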
1879     for (int i = 0; i < VLen; ++i)
1880       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1881 
1882     // Add the consecutive indices to the vector value.
1883     Constant *Cv = ConstantVector::get(Indices);
1884     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1885     Step = Builder.CreateVectorSplat(VLen, Step);
1886     assert(Step->getType() == Val->getType() && "Invalid step vec");
1887     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1888     // which can be found from the original scalar operations.
1889     Step = Builder.CreateMul(Cv, Step);
1890     return Builder.CreateAdd(Val, Step, "induction");
1891   }
1892 
1893   // Floating point induction.
1894   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1895          "Binary Opcode should be specified for FP induction");
1896   // Create a vector of consecutive numbers from zero to VF.
1897   for (int i = 0; i < VLen; ++i)
1898     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1899 
1900   // Add the consecutive indices to the vector value.
1901   Constant *Cv = ConstantVector::get(Indices);
1902 
1903   Step = Builder.CreateVectorSplat(VLen, Step);
1904 
1905   // Floating point operations had to be 'fast' to enable the induction.
1906   FastMathFlags Flags;
1907   Flags.setFast();
1908 
1909   Value *MulOp = Builder.CreateFMul(Cv, Step);
1910   if (isa<Instruction>(MulOp))
    // Have to check because MulOp may have been folded to a constant.
1912     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1913 
1914   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1915   if (isa<Instruction>(BOp))
1916     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1917   return BOp;
1918 }
1919 
1920 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1921                                            Instruction *EntryVal,
1922                                            const InductionDescriptor &ID) {
1923   // We shouldn't have to build scalar steps if we aren't vectorizing.
1924   assert(VF > 1 && "VF should be greater than one");
1925 
1926   // Get the value type and ensure it and the step have the same integer type.
1927   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1928   assert(ScalarIVTy == Step->getType() &&
1929          "Val and Step should have the same type");
1930 
1931   // We build scalar steps for both integer and floating-point induction
1932   // variables. Here, we determine the kind of arithmetic we will perform.
1933   Instruction::BinaryOps AddOp;
1934   Instruction::BinaryOps MulOp;
1935   if (ScalarIVTy->isIntegerTy()) {
1936     AddOp = Instruction::Add;
1937     MulOp = Instruction::Mul;
1938   } else {
1939     AddOp = ID.getInductionOpcode();
1940     MulOp = Instruction::FMul;
1941   }
1942 
1943   // Determine the number of scalars we need to generate for each unroll
1944   // iteration. If EntryVal is uniform, we only need to generate the first
1945   // lane. Otherwise, we generate all VF values.
1946   unsigned Lanes =
1947       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1948                                                                          : VF;
1949   // Compute the scalar steps and save the results in VectorLoopValueMap.
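  // E.g., with UF = 2 and VF = 4, the scalar for lane Lane of part Part is
  // ScalarIV + (Part * 4 + Lane) * Step, using the FP opcodes from the
  // induction descriptor when the IV is floating point.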
1950   for (unsigned Part = 0; Part < UF; ++Part) {
1951     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1952       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1953       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1954       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1955       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1956       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1957     }
1958   }
1959 }
1960 
1961 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1962   assert(V != Induction && "The new induction variable should not be used.");
1963   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1964   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1965 
1966   // If we have a stride that is replaced by one, do it here. Defer this for
1967   // the VPlan-native path until we start running Legal checks in that path.
1968   if (!EnableVPlanNativePath && Legal->hasStride(V))
1969     V = ConstantInt::get(V->getType(), 1);
1970 
1971   // If we have a vector mapped to this value, return it.
1972   if (VectorLoopValueMap.hasVectorValue(V, Part))
1973     return VectorLoopValueMap.getVectorValue(V, Part);
1974 
1975   // If the value has not been vectorized, check if it has been scalarized
1976   // instead. If it has been scalarized, and we actually need the value in
1977   // vector form, we will construct the vector values on demand.
1978   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1979     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1980 
1981     // If we've scalarized a value, that value should be an instruction.
1982     auto *I = cast<Instruction>(V);
1983 
1984     // If we aren't vectorizing, we can just copy the scalar map values over to
1985     // the vector map.
1986     if (VF == 1) {
1987       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1988       return ScalarValue;
1989     }
1990 
1991     // Get the last scalar instruction we generated for V and Part. If the value
1992     // is known to be uniform after vectorization, this corresponds to lane zero
1993     // of the Part unroll iteration. Otherwise, the last instruction is the one
1994     // we created for the last vector lane of the Part unroll iteration.
1995     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1996     auto *LastInst = cast<Instruction>(
1997         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1998 
1999     // Set the insert point after the last scalarized instruction. This ensures
2000     // the insertelement sequence will directly follow the scalar definitions.
2001     auto OldIP = Builder.saveIP();
2002     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2003     Builder.SetInsertPoint(&*NewIP);
2004 
2005     // However, if we are vectorizing, we need to construct the vector values.
2006     // If the value is known to be uniform after vectorization, we can just
2007     // broadcast the scalar value corresponding to lane zero for each unroll
2008     // iteration. Otherwise, we construct the vector values using insertelement
2009     // instructions. Since the resulting vectors are stored in
2010     // VectorLoopValueMap, we will only generate the insertelements once.
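    // E.g., packing four scalars starts from undef and emits a chain of four
    // insertelement instructions, one per lane, with the final value recorded
    // in VectorLoopValueMap.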
2011     Value *VectorValue = nullptr;
2012     if (Cost->isUniformAfterVectorization(I, VF)) {
2013       VectorValue = getBroadcastInstrs(ScalarValue);
2014       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2015     } else {
2016       // Initialize packing with insertelements to start from undef.
2017       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2018       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2019       for (unsigned Lane = 0; Lane < VF; ++Lane)
2020         packScalarIntoVectorValue(V, {Part, Lane});
2021       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2022     }
2023     Builder.restoreIP(OldIP);
2024     return VectorValue;
2025   }
2026 
2027   // If this scalar is unknown, assume that it is a constant or that it is
2028   // loop invariant. Broadcast V and save the value for future uses.
2029   Value *B = getBroadcastInstrs(V);
2030   VectorLoopValueMap.setVectorValue(V, Part, B);
2031   return B;
2032 }
2033 
2034 Value *
2035 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2036                                             const VPIteration &Instance) {
2037   // If the value is not an instruction contained in the loop, it should
2038   // already be scalar.
2039   if (OrigLoop->isLoopInvariant(V))
2040     return V;
2041 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2045 
2046   // If the value from the original loop has not been vectorized, it is
2047   // represented by UF x VF scalar values in the new loop. Return the requested
2048   // scalar value.
2049   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2050     return VectorLoopValueMap.getScalarValue(V, Instance);
2051 
2052   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2053   // for the given unroll part. If this entry is not a vector type (i.e., the
2054   // vectorization factor is one), there is no need to generate an
2055   // extractelement instruction.
2056   auto *U = getOrCreateVectorValue(V, Instance.Part);
2057   if (!U->getType()->isVectorTy()) {
2058     assert(VF == 1 && "Value not scalarized has non-vector type");
2059     return U;
2060   }
2061 
2062   // Otherwise, the value from the original loop has been vectorized and is
2063   // represented by UF vector values. Extract and return the requested scalar
2064   // value from the appropriate vector lane.
2065   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2066 }
2067 
2068 void InnerLoopVectorizer::packScalarIntoVectorValue(
2069     Value *V, const VPIteration &Instance) {
2070   assert(V != Induction && "The new induction variable should not be used.");
2071   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2072   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2073 
2074   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2075   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2076   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2077                                             Builder.getInt32(Instance.Lane));
2078   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2079 }
2080 
2081 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2082   assert(Vec->getType()->isVectorTy() && "Invalid type");
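  // Build the mask <VF-1, ..., 1, 0>; e.g., for VF = 4 the shuffle uses
  // <3, 2, 1, 0> to reverse the lane order.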
2083   SmallVector<Constant *, 8> ShuffleMask;
2084   for (unsigned i = 0; i < VF; ++i)
2085     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2086 
2087   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2088                                      ConstantVector::get(ShuffleMask),
2089                                      "reverse");
2090 }
2091 
2092 // Return whether we allow using masked interleave-groups (for dealing with
2093 // strided loads/stores that reside in predicated blocks, or for dealing
2094 // with gaps).
2095 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2096   // If an override option has been passed in for interleaved accesses, use it.
2097   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2098     return EnableMaskedInterleavedMemAccesses;
2099 
2100   return TTI.enableMaskedInterleavedAccessVectorization();
2101 }
2102 
2103 // Try to vectorize the interleave group that \p Instr belongs to.
2104 //
2105 // E.g. Translate following interleaved load group (factor = 3):
2106 //   for (i = 0; i < N; i+=3) {
2107 //     R = Pic[i];             // Member of index 0
2108 //     G = Pic[i+1];           // Member of index 1
2109 //     B = Pic[i+2];           // Member of index 2
2110 //     ... // do something to R, G, B
2111 //   }
2112 // To:
2113 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2114 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2115 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2116 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2117 //
2118 // Or translate following interleaved store group (factor = 3):
2119 //   for (i = 0; i < N; i+=3) {
2120 //     ... do something to R, G, B
2121 //     Pic[i]   = R;           // Member of index 0
2122 //     Pic[i+1] = G;           // Member of index 1
2123 //     Pic[i+2] = B;           // Member of index 2
2124 //   }
2125 // To:
2126 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2127 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2128 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2129 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2130 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2131 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2132                                                    VectorParts *BlockInMask) {
2133   const InterleaveGroup<Instruction> *Group =
2134       Cost->getInterleavedAccessGroup(Instr);
2135   assert(Group && "Fail to get an interleaved access group.");
2136 
2137   // Skip if current instruction is not the insert position.
2138   if (Instr != Group->getInsertPos())
2139     return;
2140 
2141   const DataLayout &DL = Instr->getModule()->getDataLayout();
2142   Value *Ptr = getLoadStorePointerOperand(Instr);
2143 
2144   // Prepare for the vector type of the interleaved load/store.
2145   Type *ScalarTy = getMemInstValueType(Instr);
2146   unsigned InterleaveFactor = Group->getFactor();
2147   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2148   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2149 
2150   // Prepare for the new pointers.
2151   setDebugLocFromInst(Builder, Ptr);
2152   SmallVector<Value *, 2> NewPtrs;
2153   unsigned Index = Group->getIndex(Instr);
2154 
2155   VectorParts Mask;
2156   bool IsMaskForCondRequired = BlockInMask;
2157   if (IsMaskForCondRequired) {
2158     Mask = *BlockInMask;
2159     // TODO: extend the masked interleaved-group support to reversed access.
2160     assert(!Group->isReverse() && "Reversed masked interleave-group "
2161                                   "not supported.");
2162   }
2163 
2164   // If the group is reverse, adjust the index to refer to the last vector lane
2165   // instead of the first. We adjust the index from the first vector lane,
2166   // rather than directly getting the pointer for lane VF - 1, because the
2167   // pointer operand of the interleaved access is supposed to be uniform. For
2168   // uniform instructions, we're only required to generate a value for the
2169   // first vector lane in each unroll iteration.
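  // E.g., for VF = 4 and an interleave factor of 2, a member at index 1 is
  // adjusted to index 1 + (4 - 1) * 2 = 7, the last occurrence of that
  // member in the reversed wide vector.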
2170   if (Group->isReverse())
2171     Index += (VF - 1) * Group->getFactor();
2172 
2173   bool InBounds = false;
2174   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2175     InBounds = gep->isInBounds();
2176 
2177   for (unsigned Part = 0; Part < UF; Part++) {
2178     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2179 
    // Notice that the current instruction could be at any index. We need to
    // adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2191     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2192     if (InBounds)
2193       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2194 
2195     // Cast to the vector pointer type.
2196     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2197   }
2198 
2199   setDebugLocFromInst(Builder, Instr);
2200   Value *UndefVec = UndefValue::get(VecTy);
2201 
2202   Value *MaskForGaps = nullptr;
2203   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2204     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2205     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2206   }
2207 
2208   // Vectorize the interleaved load group.
2209   if (isa<LoadInst>(Instr)) {
2210     // For each unroll part, create a wide load for the group.
2211     SmallVector<Value *, 2> NewLoads;
2212     for (unsigned Part = 0; Part < UF; Part++) {
2213       Instruction *NewLoad;
2214       if (IsMaskForCondRequired || MaskForGaps) {
2215         assert(useMaskedInterleavedAccesses(*TTI) &&
2216                "masked interleaved groups are not allowed.");
2217         Value *GroupMask = MaskForGaps;
2218         if (IsMaskForCondRequired) {
2219           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2220           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2221           Value *ShuffledMask = Builder.CreateShuffleVector(
2222               Mask[Part], Undefs, RepMask, "interleaved.mask");
2223           GroupMask = MaskForGaps
2224                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2225                                                 MaskForGaps)
2226                           : ShuffledMask;
2227         }
2228         NewLoad =
2229             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2230                                      GroupMask, UndefVec, "wide.masked.vec");
2231       }
2232       else
2233         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2234                                             Group->getAlignment(), "wide.vec");
2235       Group->addMetadata(NewLoad);
2236       NewLoads.push_back(NewLoad);
2237     }
2238 
2239     // For each member in the group, shuffle out the appropriate data from the
2240     // wide loads.
2241     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2242       Instruction *Member = Group->getMember(I);
2243 
2244       // Skip the gaps in the group.
2245       if (!Member)
2246         continue;
2247 
2248       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2249       for (unsigned Part = 0; Part < UF; Part++) {
2250         Value *StridedVec = Builder.CreateShuffleVector(
2251             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2252 
        // If this member has a different type, cast the result type.
2254         if (Member->getType() != ScalarTy) {
2255           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2256           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2257         }
2258 
2259         if (Group->isReverse())
2260           StridedVec = reverseVector(StridedVec);
2261 
2262         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2263       }
2264     }
2265     return;
2266   }
2267 
  // The sub vector type for the current instruction.
2269   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2270 
2271   // Vectorize the interleaved store group.
2272   for (unsigned Part = 0; Part < UF; Part++) {
2273     // Collect the stored vector from each member.
2274     SmallVector<Value *, 4> StoredVecs;
2275     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2277       Instruction *Member = Group->getMember(i);
2278       assert(Member && "Fail to get a member from an interleaved store group");
2279 
2280       Value *StoredVec = getOrCreateVectorValue(
2281           cast<StoreInst>(Member)->getValueOperand(), Part);
2282       if (Group->isReverse())
2283         StoredVec = reverseVector(StoredVec);
2284 
      // If this member has a different type, cast it to a unified type.
2287       if (StoredVec->getType() != SubVT)
2288         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2289 
2290       StoredVecs.push_back(StoredVec);
2291     }
2292 
2293     // Concatenate all vectors into a wide vector.
2294     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2295 
2296     // Interleave the elements in the wide vector.
2297     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2298     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2299                                               "interleaved.vec");
2300 
2301     Instruction *NewStoreInstr;
2302     if (IsMaskForCondRequired) {
2303       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2304       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2305       Value *ShuffledMask = Builder.CreateShuffleVector(
2306           Mask[Part], Undefs, RepMask, "interleaved.mask");
2307       NewStoreInstr = Builder.CreateMaskedStore(
2308           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2309     }
2310     else
2311       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2312         Group->getAlignment());
2313 
2314     Group->addMetadata(NewStoreInstr);
2315   }
2316 }
2317 
2318 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2319                                                      VectorParts *BlockInMask) {
2320   // Attempt to issue a wide load.
2321   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2322   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2323 
2324   assert((LI || SI) && "Invalid Load/Store instruction");
2325 
2326   LoopVectorizationCostModel::InstWidening Decision =
2327       Cost->getWideningDecision(Instr, VF);
2328   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2329          "CM decision should be taken at this point");
2330   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2331     return vectorizeInterleaveGroup(Instr);
2332 
2333   Type *ScalarDataTy = getMemInstValueType(Instr);
2334   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2335   Value *Ptr = getLoadStorePointerOperand(Instr);
2336   unsigned Alignment = getLoadStoreAlignment(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2339   const DataLayout &DL = Instr->getModule()->getDataLayout();
2340   if (!Alignment)
2341     Alignment = DL.getABITypeAlignment(ScalarDataTy);
2342   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2343 
2344   // Determine if the pointer operand of the access is either consecutive or
2345   // reverse consecutive.
2346   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2347   bool ConsecutiveStride =
2348       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2349   bool CreateGatherScatter =
2350       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2351 
2352   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2353   // gather/scatter. Otherwise Decision should have been to Scalarize.
2354   assert((ConsecutiveStride || CreateGatherScatter) &&
2355          "The instruction should be scalarized");
2356 
2357   // Handle consecutive loads/stores.
2358   if (ConsecutiveStride)
2359     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2360 
2361   VectorParts Mask;
2362   bool isMaskRequired = BlockInMask;
2363   if (isMaskRequired)
2364     Mask = *BlockInMask;
2365 
2366   bool InBounds = false;
2367   if (auto *gep = dyn_cast<GetElementPtrInst>(
2368           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2369     InBounds = gep->isInBounds();
2370 
2371   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2372     // Calculate the pointer for the specific unroll-part.
2373     GetElementPtrInst *PartPtr = nullptr;
2374 
2375     if (Reverse) {
2376       // If the address is consecutive but reversed, then the
2377       // wide store needs to start at the last vector element.
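      // E.g., with VF = 4 the pointer for part Part becomes
      // Ptr - Part * 4 - 3, so the wide access covers
      // Ptr[-(Part * 4 + 3)] .. Ptr[-Part * 4], which is then paired with a
      // reversed value or mask.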
2378       PartPtr = cast<GetElementPtrInst>(
2379           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2380       PartPtr->setIsInBounds(InBounds);
2381       PartPtr = cast<GetElementPtrInst>(
2382           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2383       PartPtr->setIsInBounds(InBounds);
2384       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2385         Mask[Part] = reverseVector(Mask[Part]);
2386     } else {
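      // Forward consecutive access: part Part starts at Ptr + Part * VF
      // (e.g., part 1 with VF = 4 covers Ptr[4] .. Ptr[7]).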
2387       PartPtr = cast<GetElementPtrInst>(
2388           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2389       PartPtr->setIsInBounds(InBounds);
2390     }
2391 
2392     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2393   };
2394 
2395   // Handle Stores:
2396   if (SI) {
2397     setDebugLocFromInst(Builder, SI);
2398 
2399     for (unsigned Part = 0; Part < UF; ++Part) {
2400       Instruction *NewSI = nullptr;
2401       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2402       if (CreateGatherScatter) {
2403         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2404         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2405         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2406                                             MaskPart);
2407       } else {
2408         if (Reverse) {
2409           // If we store to reverse consecutive memory locations, then we need
2410           // to reverse the order of elements in the stored value.
2411           StoredVal = reverseVector(StoredVal);
2412           // We don't want to update the value in the map as it might be used in
2413           // another expression. So don't call resetVectorValue(StoredVal).
2414         }
2415         auto *VecPtr = CreateVecPtr(Part, Ptr);
2416         if (isMaskRequired)
2417           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2418                                             Mask[Part]);
2419         else
2420           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2421       }
2422       addMetadata(NewSI, SI);
2423     }
2424     return;
2425   }
2426 
2427   // Handle loads.
2428   assert(LI && "Must have a load instruction");
2429   setDebugLocFromInst(Builder, LI);
2430   for (unsigned Part = 0; Part < UF; ++Part) {
2431     Value *NewLI;
2432     if (CreateGatherScatter) {
2433       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2434       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2435       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2436                                          nullptr, "wide.masked.gather");
2437       addMetadata(NewLI, LI);
2438     } else {
2439       auto *VecPtr = CreateVecPtr(Part, Ptr);
2440       if (isMaskRequired)
2441         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2442                                          UndefValue::get(DataTy),
2443                                          "wide.masked.load");
2444       else
2445         NewLI =
2446             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2447 
2448       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2449       addMetadata(NewLI, LI);
2450       if (Reverse)
2451         NewLI = reverseVector(NewLI);
2452     }
2453     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2454   }
2455 }
2456 
2457 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2458                                                const VPIteration &Instance,
2459                                                bool IfPredicateInstr) {
2460   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2461 
2462   setDebugLocFromInst(Builder, Instr);
2463 
2464   // Does this instruction return a value?
2465   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2466 
2467   Instruction *Cloned = Instr->clone();
2468   if (!IsVoidRetTy)
2469     Cloned->setName(Instr->getName() + ".cloned");
2470 
2471   // Replace the operands of the cloned instructions with their scalar
2472   // equivalents in the new loop.
2473   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2474     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2475     Cloned->setOperand(op, NewOp);
2476   }
2477   addNewMetadata(Cloned, Instr);
2478 
2479   // Place the cloned scalar in the new loop.
2480   Builder.Insert(Cloned);
2481 
2482   // Add the cloned scalar to the scalar map entry.
2483   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2484 
2485   // If we just cloned a new assumption, add it the assumption cache.
2486   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2487     if (II->getIntrinsicID() == Intrinsic::assume)
2488       AC->registerAssumption(II);
2489 
2490   // End if-block.
2491   if (IfPredicateInstr)
2492     PredicatedInstructions.push_back(Cloned);
2493 }
2494 
2495 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2496                                                       Value *End, Value *Step,
2497                                                       Instruction *DL) {
2498   BasicBlock *Header = L->getHeader();
2499   BasicBlock *Latch = L->getLoopLatch();
2500   // As we're just creating this loop, it's possible no latch exists
2501   // yet. If so, use the header as this will be a single block loop.
2502   if (!Latch)
2503     Latch = Header;
2504 
2505   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2506   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2507   setDebugLocFromInst(Builder, OldInst);
2508   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2509 
2510   Builder.SetInsertPoint(Latch->getTerminator());
2511   setDebugLocFromInst(Builder, OldInst);
2512 
2513   // Create i+1 and fill the PHINode.
2514   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2515   Induction->addIncoming(Start, L->getLoopPreheader());
2516   Induction->addIncoming(Next, Latch);
2517   // Create the compare.
2518   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2519   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2520 
2521   // Now we have two terminators. Remove the old one from the block.
2522   Latch->getTerminator()->eraseFromParent();
2523 
2524   return Induction;
2525 }
2526 
2527 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2528   if (TripCount)
2529     return TripCount;
2530 
2531   assert(L && "Create Trip Count for null loop.");
2532   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2533   // Find the loop boundaries.
2534   ScalarEvolution *SE = PSE.getSE();
2535   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2536   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2537          "Invalid loop count");
2538 
2539   Type *IdxTy = Legal->getWidestInductionType();
2540   assert(IdxTy && "No type for induction");
2541 
2542   // The exit count might have type i64 while the phi is i32. This can
2543   // happen if we have an induction variable that is sign extended before the
2544   // compare. The only way we get a backedge-taken count in that case is if
2545   // the induction variable was signed and therefore will not overflow, so
2546   // truncation is legal.
2547   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2548       IdxTy->getPrimitiveSizeInBits())
2549     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2550   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2551 
2552   // Get the total trip count from the count by adding 1.
2553   const SCEV *ExitCount = SE->getAddExpr(
2554       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2555 
2556   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2557 
2558   // Expand the trip count and place the new instructions in the preheader.
2559   // Notice that the pre-header does not change, only the loop body.
2560   SCEVExpander Exp(*SE, DL, "induction");
2561 
2562   // Count holds the overall loop count (N).
2563   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2564                                 L->getLoopPreheader()->getTerminator());
2565 
2566   if (TripCount->getType()->isPointerTy())
2567     TripCount =
2568         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2569                                     L->getLoopPreheader()->getTerminator());
2570 
2571   return TripCount;
2572 }
2573 
2574 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2575   if (VectorTripCount)
2576     return VectorTripCount;
2577 
2578   Value *TC = getOrCreateTripCount(L);
2579   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2580 
2581   Type *Ty = TC->getType();
2582   Constant *Step = ConstantInt::get(Ty, VF * UF);
2583 
2584   // If the tail is to be folded by masking, round the number of iterations N
2585   // up to a multiple of Step instead of rounding down. This is done by first
2586   // adding Step-1 and then rounding down. Note that it's ok if this addition
2587   // overflows: the vector induction variable will eventually wrap to zero given
2588   // that it starts at zero and its Step is a power of two; the loop will then
2589   // exit, with the last early-exit vector comparison also producing all-true.
2590   if (Cost->foldTailByMasking()) {
2591     assert(isPowerOf2_32(VF * UF) &&
2592            "VF*UF must be a power of 2 when folding tail by masking");
2593     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2594   }
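       // For example (illustrative numbers only): with a trip count of 10 and
       // VF * UF = 8, folding the tail computes TC = 10 + 7 = 17, so n.vec below
       // becomes 17 - (17 % 8) = 16 and the masked vector loop runs two
       // iterations that cover all 10 original iterations, with the remaining
       // lanes masked off.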
2595 
2596   // Now we need to generate the expression for the part of the loop that the
2597   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2598   // iterations are not required for correctness, or N - Step, otherwise. Step
2599   // is equal to the vectorization factor (number of SIMD elements) times the
2600   // unroll factor (number of SIMD instructions).
2601   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2602 
2603   // If there is a non-reversed interleaved group that may speculatively access
2604   // memory out-of-bounds, we need to ensure that there will be at least one
2605   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2606   // the trip count, we set the remainder to be equal to the step. If the step
2607   // does not evenly divide the trip count, no adjustment is necessary since
2608   // there will already be scalar iterations. Note that the minimum iterations
2609   // check ensures that N >= Step.
2610   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2611     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2612     R = Builder.CreateSelect(IsZero, Step, R);
2613   }
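       // For example (a sketch): with a trip count of 16 and Step = 8 the
       // remainder R is 0; if a scalar epilogue is required it is bumped to 8,
       // so n.vec becomes 8 and the last 8 iterations run in the scalar loop.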
2614 
2615   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2616 
2617   return VectorTripCount;
2618 }
2619 
2620 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2621                                                    const DataLayout &DL) {
2622   // Verify that V is a vector type with the same number of elements as DstVTy.
2623   unsigned VF = DstVTy->getNumElements();
2624   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2625   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2626   Type *SrcElemTy = SrcVecTy->getElementType();
2627   Type *DstElemTy = DstVTy->getElementType();
2628   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2629          "Vector elements must have same size");
2630 
2631   // Do a direct cast if element types are castable.
2632   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2633     return Builder.CreateBitOrPointerCast(V, DstVTy);
2634   }
2635   // V cannot be cast directly to the desired vector type. This may happen
2636   // when V is a floating point vector but DstVTy is a vector of pointers, or
2637   // vice-versa. Handle this with a two-step cast through an intermediate
2638   // integer vector type, i.e. Ptr <-> Int <-> Float.
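       // For example (a sketch, assuming a target with 64-bit pointers): casting
       // <2 x double> to <2 x i8*> goes through <2 x i64>, i.e. a bitcast to the
       // integer vector followed by an inttoptr to the pointer vector.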
2639   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2640          "Only one type should be a pointer type");
2641   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2642          "Only one type should be a floating point type");
2643   Type *IntTy =
2644       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2645   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2646   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2647   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2648 }
2649 
2650 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2651                                                          BasicBlock *Bypass) {
2652   Value *Count = getOrCreateTripCount(L);
2653   BasicBlock *BB = L->getLoopPreheader();
2654   IRBuilder<> Builder(BB->getTerminator());
2655 
2656   // Generate code to check if the loop's trip count is less than VF * UF, or
2657   // equal to it in case a scalar epilogue is required; this implies that the
2658   // vector trip count is zero. This check also covers the case where adding one
2659   // to the backedge-taken count overflowed leading to an incorrect trip count
2660   // of zero. In this case we will also jump to the scalar loop.
2661   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2662                                           : ICmpInst::ICMP_ULT;
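       // For example (a sketch): with VF * UF = 8 and a required scalar epilogue,
       // a trip count of exactly 8 must take the bypass (hence ICMP_ULE), since
       // the vector trip count would be reduced to zero to leave scalar
       // iterations for the epilogue.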
2663 
2664   // If the tail is to be folded, the vector loop takes care of all iterations.
2665   Value *CheckMinIters = Builder.getFalse();
2666   if (!Cost->foldTailByMasking())
2667     CheckMinIters = Builder.CreateICmp(
2668         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2669         "min.iters.check");
2670 
2671   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2672   // Update dominator tree immediately if the generated block is a
2673   // LoopBypassBlock because SCEV expansions to generate loop bypass
2674   // checks may query it before the current function is finished.
2675   DT->addNewBlock(NewBB, BB);
2676   if (L->getParentLoop())
2677     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2678   ReplaceInstWithInst(BB->getTerminator(),
2679                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2680   LoopBypassBlocks.push_back(BB);
2681 }
2682 
2683 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2684   BasicBlock *BB = L->getLoopPreheader();
2685 
2686   // Generate the code that checks the SCEV assumptions we made.
2687   // We want the new basic block to start at the first instruction in a
2688   // sequence of instructions that form a check.
2689   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2690                    "scev.check");
2691   Value *SCEVCheck =
2692       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2693 
2694   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2695     if (C->isZero())
2696       return;
2697 
2698   assert(!BB->getParent()->hasOptSize() &&
2699          "Cannot SCEV check stride or overflow when optimizing for size");
2700 
2701   // Create a new block containing the stride check.
2702   BB->setName("vector.scevcheck");
2703   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2704   // Update dominator tree immediately if the generated block is a
2705   // LoopBypassBlock because SCEV expansions to generate loop bypass
2706   // checks may query it before the current function is finished.
2707   DT->addNewBlock(NewBB, BB);
2708   if (L->getParentLoop())
2709     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2710   ReplaceInstWithInst(BB->getTerminator(),
2711                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2712   LoopBypassBlocks.push_back(BB);
2713   AddedSafetyChecks = true;
2714 }
2715 
2716 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2717   // VPlan-native path does not do any analysis for runtime checks currently.
2718   if (EnableVPlanNativePath)
2719     return;
2720 
2721   BasicBlock *BB = L->getLoopPreheader();
2722 
2723   // Generate the code that checks at runtime whether arrays overlap. We put
2724   // the checks into a separate block to make the more common case of few
2725   // elements faster.
2726   Instruction *FirstCheckInst;
2727   Instruction *MemRuntimeCheck;
2728   std::tie(FirstCheckInst, MemRuntimeCheck) =
2729       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2730   if (!MemRuntimeCheck)
2731     return;
2732 
2733   assert(!BB->getParent()->hasOptSize() &&
2734          "Cannot emit memory checks when optimizing for size");
2735 
2736   // Create a new block containing the memory check.
2737   BB->setName("vector.memcheck");
2738   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2739   // Update dominator tree immediately if the generated block is a
2740   // LoopBypassBlock because SCEV expansions to generate loop bypass
2741   // checks may query it before the current function is finished.
2742   DT->addNewBlock(NewBB, BB);
2743   if (L->getParentLoop())
2744     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2745   ReplaceInstWithInst(BB->getTerminator(),
2746                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2747   LoopBypassBlocks.push_back(BB);
2748   AddedSafetyChecks = true;
2749 
2750   // We currently don't use LoopVersioning for the actual loop cloning but we
2751   // still use it to add the noalias metadata.
2752   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2753                                            PSE.getSE());
2754   LVer->prepareNoAliasMetadata();
2755 }
2756 
2757 Value *InnerLoopVectorizer::emitTransformedIndex(
2758     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2759     const InductionDescriptor &ID) const {
2760 
2761   SCEVExpander Exp(*SE, DL, "induction");
2762   auto Step = ID.getStep();
2763   auto StartValue = ID.getStartValue();
2764   assert(Index->getType() == Step->getType() &&
2765          "Index type does not match StepValue type");
2766 
2767   // Note: the IR at this point is broken. We cannot use SE to create any new
2768   // SCEV and then expand it, hoping that SCEV's simplification will give us
2769   // better code. Unfortunately, attempting to do so on invalid IR may lead to
2770   // various SCEV crashes. So all we can do is use the builder and rely on
2771   // InstCombine for future simplifications. Here we handle only some trivial
2772   // cases.
2773   auto CreateAdd = [&B](Value *X, Value *Y) {
2774     assert(X->getType() == Y->getType() && "Types don't match!");
2775     if (auto *CX = dyn_cast<ConstantInt>(X))
2776       if (CX->isZero())
2777         return Y;
2778     if (auto *CY = dyn_cast<ConstantInt>(Y))
2779       if (CY->isZero())
2780         return X;
2781     return B.CreateAdd(X, Y);
2782   };
2783 
2784   auto CreateMul = [&B](Value *X, Value *Y) {
2785     assert(X->getType() == Y->getType() && "Types don't match!");
2786     if (auto *CX = dyn_cast<ConstantInt>(X))
2787       if (CX->isOne())
2788         return Y;
2789     if (auto *CY = dyn_cast<ConstantInt>(Y))
2790       if (CY->isOne())
2791         return X;
2792     return B.CreateMul(X, Y);
2793   };
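       // For illustration (a sketch): for an integer induction with start value
       // %a and step 2, an index %i is transformed into %a + 2 * %i; for a
       // pointer induction it becomes a GEP of the start pointer by
       // Index * Step elements.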
2794 
2795   switch (ID.getKind()) {
2796   case InductionDescriptor::IK_IntInduction: {
2797     assert(Index->getType() == StartValue->getType() &&
2798            "Index type does not match StartValue type");
2799     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2800       return B.CreateSub(StartValue, Index);
2801     auto *Offset = CreateMul(
2802         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2803     return CreateAdd(StartValue, Offset);
2804   }
2805   case InductionDescriptor::IK_PtrInduction: {
2806     assert(isa<SCEVConstant>(Step) &&
2807            "Expected constant step for pointer induction");
2808     return B.CreateGEP(
2809         StartValue->getType()->getPointerElementType(), StartValue,
2810         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2811                                            &*B.GetInsertPoint())));
2812   }
2813   case InductionDescriptor::IK_FpInduction: {
2814     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2815     auto InductionBinOp = ID.getInductionBinOp();
2816     assert(InductionBinOp &&
2817            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2818             InductionBinOp->getOpcode() == Instruction::FSub) &&
2819            "Original bin op should be defined for FP induction");
2820 
2821     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2822 
2823     // Floating point operations had to be 'fast' to enable the induction.
2824     FastMathFlags Flags;
2825     Flags.setFast();
2826 
2827     Value *MulExp = B.CreateFMul(StepValue, Index);
2828     if (isa<Instruction>(MulExp))
2829       // We have to check because MulExp may have been folded to a constant.
2830       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2831 
2832     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2833                                "induction");
2834     if (isa<Instruction>(BOp))
2835       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2836 
2837     return BOp;
2838   }
2839   case InductionDescriptor::IK_NoInduction:
2840     return nullptr;
2841   }
2842   llvm_unreachable("invalid enum");
2843 }
2844 
2845 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2846   /*
2847    In this function we generate a new loop. The new loop will contain
2848    the vectorized instructions while the old loop will continue to run the
2849    scalar remainder.
2850 
2851        [ ] <-- loop iteration number check.
2852     /   |
2853    /    v
2854   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2855   |  /  |
2856   | /   v
2857   ||   [ ]     <-- vector pre header.
2858   |/    |
2859   |     v
2860   |    [  ] \
2861   |    [  ]_|   <-- vector loop.
2862   |     |
2863   |     v
2864   |   -[ ]   <--- middle-block.
2865   |  /  |
2866   | /   v
2867   -|- >[ ]     <--- new preheader.
2868    |    |
2869    |    v
2870    |   [ ] \
2871    |   [ ]_|   <-- old scalar loop to handle remainder.
2872     \   |
2873      \  v
2874       >[ ]     <-- exit block.
2875    ...
2876    */
2877 
2878   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2879   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2880   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2881   MDNode *OrigLoopID = OrigLoop->getLoopID();
2882   assert(VectorPH && "Invalid loop structure");
2883   assert(ExitBlock && "Must have an exit block");
2884 
2885   // Some loops have a single integer induction variable, while other loops
2886   // don't. One example is C++ iterators, which often have multiple pointer
2887   // induction variables. The code below also supports the case where we
2888   // don't have a single induction variable.
2889   //
2890   // We try as hard as possible to obtain an induction variable from the
2891   // original loop. However, if we don't find one that:
2892   //   - is an integer
2893   //   - counts from zero, stepping by one
2894   //   - is the size of the widest induction variable type
2895   // then we create a new one.
2896   OldInduction = Legal->getPrimaryInduction();
2897   Type *IdxTy = Legal->getWidestInductionType();
2898 
2899   // Split the single block loop into the two loop structure described above.
2900   BasicBlock *VecBody =
2901       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2902   BasicBlock *MiddleBlock =
2903       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2904   BasicBlock *ScalarPH =
2905       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2906 
2907   // Create and register the new vector loop.
2908   Loop *Lp = LI->AllocateLoop();
2909   Loop *ParentLoop = OrigLoop->getParentLoop();
2910 
2911   // Insert the new loop into the loop nest and register the new basic blocks
2912   // before calling any utilities such as SCEV that require valid LoopInfo.
2913   if (ParentLoop) {
2914     ParentLoop->addChildLoop(Lp);
2915     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2916     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2917   } else {
2918     LI->addTopLevelLoop(Lp);
2919   }
2920   Lp->addBasicBlockToLoop(VecBody, *LI);
2921 
2922   // Find the loop boundaries.
2923   Value *Count = getOrCreateTripCount(Lp);
2924 
2925   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2926 
2927   // Now, compare the new count to zero. If it is zero skip the vector loop and
2928   // jump to the scalar loop. This check also covers the case where the
2929   // backedge-taken count is uint##_max: adding one to it will overflow leading
2930   // to an incorrect trip count of zero. In this (rare) case we will also jump
2931   // to the scalar loop.
2932   emitMinimumIterationCountCheck(Lp, ScalarPH);
2933 
2934   // Generate the code to check any assumptions that we've made for SCEV
2935   // expressions.
2936   emitSCEVChecks(Lp, ScalarPH);
2937 
2938   // Generate the code that checks at runtime whether arrays overlap. We put
2939   // the checks into a separate block to make the more common case of few
2940   // elements faster.
2941   emitMemRuntimeChecks(Lp, ScalarPH);
2942 
2943   // Generate the induction variable.
2944   // The loop step is equal to the vectorization factor (num of SIMD elements)
2945   // times the unroll factor (num of SIMD instructions).
2946   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2947   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2948   Induction =
2949       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2950                               getDebugLocFromInstOrOperands(OldInduction));
2951 
2952   // We are going to resume the execution of the scalar loop.
2953   // Go over all of the induction variables that we found and fix the
2954   // PHIs that are left in the scalar version of the loop.
2955   // The starting values of PHI nodes depend on the counter of the last
2956   // iteration in the vectorized loop.
2957   // If we come from a bypass edge then we need to start from the original
2958   // start value.
2959 
2960   // This variable saves the new starting index for the scalar loop. It is used
2961   // to test if there are any tail iterations left once the vector loop has
2962   // completed.
2963   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2964   for (auto &InductionEntry : *List) {
2965     PHINode *OrigPhi = InductionEntry.first;
2966     InductionDescriptor II = InductionEntry.second;
2967 
2968     // Create phi nodes to merge from the backedge-taken check block.
2969     PHINode *BCResumeVal = PHINode::Create(
2970         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2971     // Copy original phi DL over to the new one.
2972     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2973     Value *&EndValue = IVEndValues[OrigPhi];
2974     if (OrigPhi == OldInduction) {
2975       // We know what the end value is.
2976       EndValue = CountRoundDown;
2977     } else {
2978       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2979       Type *StepType = II.getStep()->getType();
2980       Instruction::CastOps CastOp =
2981         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2982       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2983       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2984       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2985       EndValue->setName("ind.end");
2986     }
2987 
2988     // The new PHI merges the original incoming value, in case of a bypass,
2989     // or the value at the end of the vectorized loop.
2990     BCResumeVal->addIncoming(EndValue, MiddleBlock);
2991 
2992     // Fix the scalar body counter (PHI node).
2993     // The old induction's phi node in the scalar body needs the truncated
2994     // value.
2995     for (BasicBlock *BB : LoopBypassBlocks)
2996       BCResumeVal->addIncoming(II.getStartValue(), BB);
2997     OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
2998   }
2999 
3000   // We need the OrigLoop (scalar loop part) latch terminator to help
3001   // produce correct debug info for the middle block BB instructions.
3002   // The legality check stage guarantees that the loop will have a single
3003   // latch.
3004   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3005          "Scalar loop latch terminator isn't a branch");
3006   BranchInst *ScalarLatchBr =
3007       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3008 
3009   // Add a check in the middle block to see if we have completed
3010   // all of the iterations in the first vector loop.
3011   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3012   // If tail is to be folded, we know we don't need to run the remainder.
3013   Value *CmpN = Builder.getTrue();
3014   if (!Cost->foldTailByMasking()) {
3015     CmpN =
3016         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3017                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3018 
3019     // Here we use the same DebugLoc as the scalar loop latch branch instead
3020     // of the corresponding compare because they may have ended up with
3021     // different line numbers and we want to avoid awkward line stepping while
3022     // debugging. E.g., if the compare has a line number inside the loop.
3023     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3024   }
3025 
3026   BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3027   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3028   ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3029 
3030   // Get ready to start creating new instructions into the vectorized body.
3031   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3032 
3033   // Save the state.
3034   LoopVectorPreHeader = Lp->getLoopPreheader();
3035   LoopScalarPreHeader = ScalarPH;
3036   LoopMiddleBlock = MiddleBlock;
3037   LoopExitBlock = ExitBlock;
3038   LoopVectorBody = VecBody;
3039   LoopScalarBody = OldBasicBlock;
3040 
3041   Optional<MDNode *> VectorizedLoopID =
3042       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3043                                       LLVMLoopVectorizeFollowupVectorized});
3044   if (VectorizedLoopID.hasValue()) {
3045     Lp->setLoopID(VectorizedLoopID.getValue());
3046 
3047     // Do not setAlreadyVectorized if loop attributes have been defined
3048     // explicitly.
3049     return LoopVectorPreHeader;
3050   }
3051 
3052   // Keep all loop hints from the original loop on the vector loop (we'll
3053   // replace the vectorizer-specific hints below).
3054   if (MDNode *LID = OrigLoop->getLoopID())
3055     Lp->setLoopID(LID);
3056 
3057   LoopVectorizeHints Hints(Lp, true, *ORE);
3058   Hints.setAlreadyVectorized();
3059 
3060   return LoopVectorPreHeader;
3061 }
3062 
3063 // Fix up external users of the induction variable. At this point, we are
3064 // in LCSSA form, with all external PHIs that use the IV having one input value,
3065 // coming from the remainder loop. We need those PHIs to also have a correct
3066 // value for the IV when arriving directly from the middle block.
3067 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3068                                        const InductionDescriptor &II,
3069                                        Value *CountRoundDown, Value *EndValue,
3070                                        BasicBlock *MiddleBlock) {
3071   // There are two kinds of external IV usages - those that use the value
3072   // computed in the last iteration (the PHI) and those that use the penultimate
3073   // value (the value that feeds into the phi from the loop latch).
3074   // We allow both, but they, obviously, have different values.
3075 
3076   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3077 
3078   DenseMap<Value *, Value *> MissingVals;
3079 
3080   // An external user of the last iteration's value should see the value that
3081   // the remainder loop uses to initialize its own IV.
3082   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3083   for (User *U : PostInc->users()) {
3084     Instruction *UI = cast<Instruction>(U);
3085     if (!OrigLoop->contains(UI)) {
3086       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3087       MissingVals[UI] = EndValue;
3088     }
3089   }
3090 
3091   // An external user of the penultimate value needs to see EndValue - Step.
3092   // The simplest way to get this is to recompute it from the constituent SCEVs,
3093   // that is Start + (Step * (CRD - 1)).
3094   for (User *U : OrigPhi->users()) {
3095     auto *UI = cast<Instruction>(U);
3096     if (!OrigLoop->contains(UI)) {
3097       const DataLayout &DL =
3098           OrigLoop->getHeader()->getModule()->getDataLayout();
3099       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3100 
3101       IRBuilder<> B(MiddleBlock->getTerminator());
3102       Value *CountMinusOne = B.CreateSub(
3103           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3104       Value *CMO =
3105           !II.getStep()->getType()->isIntegerTy()
3106               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3107                              II.getStep()->getType())
3108               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3109       CMO->setName("cast.cmo");
3110       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3111       Escape->setName("ind.escape");
3112       MissingVals[UI] = Escape;
3113     }
3114   }
3115 
3116   for (auto &I : MissingVals) {
3117     PHINode *PHI = cast<PHINode>(I.first);
3118     // One corner case we have to handle is two IVs "chasing" each other,
3119     // that is %IV2 = phi [...], [ %IV1, %latch ]
3120     // In this case, if IV1 has an external use, we need to avoid adding both
3121     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3122     // don't already have an incoming value for the middle block.
3123     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3124       PHI->addIncoming(I.second, MiddleBlock);
3125   }
3126 }
3127 
3128 namespace {
3129 
3130 struct CSEDenseMapInfo {
3131   static bool canHandle(const Instruction *I) {
3132     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3133            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3134   }
3135 
3136   static inline Instruction *getEmptyKey() {
3137     return DenseMapInfo<Instruction *>::getEmptyKey();
3138   }
3139 
3140   static inline Instruction *getTombstoneKey() {
3141     return DenseMapInfo<Instruction *>::getTombstoneKey();
3142   }
3143 
3144   static unsigned getHashValue(const Instruction *I) {
3145     assert(canHandle(I) && "Unknown instruction!");
3146     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3147                                                            I->value_op_end()));
3148   }
3149 
3150   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3151     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3152         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3153       return LHS == RHS;
3154     return LHS->isIdenticalTo(RHS);
3155   }
3156 };
3157 
3158 } // end anonymous namespace
3159 
3160 /// Perform CSE of induction variable instructions.
3161 static void cse(BasicBlock *BB) {
3162   // Perform simple CSE.
3163   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3164   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3165     Instruction *In = &*I++;
3166 
3167     if (!CSEDenseMapInfo::canHandle(In))
3168       continue;
3169 
3170     // Check if we can replace this instruction with any of the
3171     // visited instructions.
3172     if (Instruction *V = CSEMap.lookup(In)) {
3173       In->replaceAllUsesWith(V);
3174       In->eraseFromParent();
3175       continue;
3176     }
3177 
3178     CSEMap[In] = In;
3179   }
3180 }
3181 
3182 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3183                                                        unsigned VF,
3184                                                        bool &NeedToScalarize) {
3185   Function *F = CI->getCalledFunction();
3186   StringRef FnName = CI->getCalledFunction()->getName();
3187   Type *ScalarRetTy = CI->getType();
3188   SmallVector<Type *, 4> Tys, ScalarTys;
3189   for (auto &ArgOp : CI->arg_operands())
3190     ScalarTys.push_back(ArgOp->getType());
3191 
3192   // Estimate cost of scalarized vector call. The source operands are assumed
3193   // to be vectors, so we need to extract individual elements from there,
3194   // execute VF scalar calls, and then gather the result into the vector return
3195   // value.
3196   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3197   if (VF == 1)
3198     return ScalarCallCost;
3199 
3200   // Compute corresponding vector type for return value and arguments.
3201   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3202   for (Type *ScalarTy : ScalarTys)
3203     Tys.push_back(ToVectorTy(ScalarTy, VF));
3204 
3205   // Compute costs of unpacking argument values for the scalar calls and
3206   // packing the return values to a vector.
3207   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3208 
3209   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
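       // For example (illustrative numbers only): with VF = 4, a scalar call
       // cost of 10 and a scalarization overhead of 6, the scalarized estimate
       // is 4 * 10 + 6 = 46; a vectorized library call costing less than that
       // is preferred below and NeedToScalarize is cleared.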
3210 
3211   // If we can't emit a vector call for this function, then the currently found
3212   // cost is the cost we need to return.
3213   NeedToScalarize = true;
3214   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3215     return Cost;
3216 
3217   // If the corresponding vector cost is cheaper, return its cost.
3218   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3219   if (VectorCallCost < Cost) {
3220     NeedToScalarize = false;
3221     return VectorCallCost;
3222   }
3223   return Cost;
3224 }
3225 
3226 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3227                                                             unsigned VF) {
3228   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3229   assert(ID && "Expected intrinsic call!");
3230 
3231   FastMathFlags FMF;
3232   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3233     FMF = FPMO->getFastMathFlags();
3234 
3235   SmallVector<Value *, 4> Operands(CI->arg_operands());
3236   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3237 }
3238 
3239 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3240   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3241   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3242   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3243 }
3244 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3245   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3246   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3247   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3248 }
3249 
3250 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3251   // For every instruction `I` in MinBWs, truncate the operands, create a
3252   // truncated version of `I` and reextend its result. InstCombine runs
3253   // later and will remove any ext/trunc pairs.
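       // For example (shorthand IR, a sketch): if MinBWs records that an i32 add
       // only needs 8 bits, then
       //   %a = add <4 x i32> %x, %y
       // becomes
       //   %x.tr = trunc <4 x i32> %x to <4 x i8>
       //   %y.tr = trunc <4 x i32> %y to <4 x i8>
       //   %a.tr = add <4 x i8> %x.tr, %y.tr
       //   %a    = zext <4 x i8> %a.tr to <4 x i32>
       // with redundant ext/trunc pairs cleaned up by InstCombine later.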
3254   SmallPtrSet<Value *, 4> Erased;
3255   for (const auto &KV : Cost->getMinimalBitwidths()) {
3256     // If the value wasn't vectorized, we must maintain the original scalar
3257     // type. The absence of the value from VectorLoopValueMap indicates that it
3258     // wasn't vectorized.
3259     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3260       continue;
3261     for (unsigned Part = 0; Part < UF; ++Part) {
3262       Value *I = getOrCreateVectorValue(KV.first, Part);
3263       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3264           !isa<Instruction>(I))
3265         continue;
3266       Type *OriginalTy = I->getType();
3267       Type *ScalarTruncatedTy =
3268           IntegerType::get(OriginalTy->getContext(), KV.second);
3269       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3270                                           OriginalTy->getVectorNumElements());
3271       if (TruncatedTy == OriginalTy)
3272         continue;
3273 
3274       IRBuilder<> B(cast<Instruction>(I));
3275       auto ShrinkOperand = [&](Value *V) -> Value * {
3276         if (auto *ZI = dyn_cast<ZExtInst>(V))
3277           if (ZI->getSrcTy() == TruncatedTy)
3278             return ZI->getOperand(0);
3279         return B.CreateZExtOrTrunc(V, TruncatedTy);
3280       };
3281 
3282       // The actual instruction modification depends on the instruction type,
3283       // unfortunately.
3284       Value *NewI = nullptr;
3285       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3286         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3287                              ShrinkOperand(BO->getOperand(1)));
3288 
3289         // Any wrapping introduced by shrinking this operation shouldn't be
3290         // considered undefined behavior. So, we can't unconditionally copy
3291         // arithmetic wrapping flags to NewI.
3292         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3293       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3294         NewI =
3295             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3296                          ShrinkOperand(CI->getOperand(1)));
3297       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3298         NewI = B.CreateSelect(SI->getCondition(),
3299                               ShrinkOperand(SI->getTrueValue()),
3300                               ShrinkOperand(SI->getFalseValue()));
3301       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3302         switch (CI->getOpcode()) {
3303         default:
3304           llvm_unreachable("Unhandled cast!");
3305         case Instruction::Trunc:
3306           NewI = ShrinkOperand(CI->getOperand(0));
3307           break;
3308         case Instruction::SExt:
3309           NewI = B.CreateSExtOrTrunc(
3310               CI->getOperand(0),
3311               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3312           break;
3313         case Instruction::ZExt:
3314           NewI = B.CreateZExtOrTrunc(
3315               CI->getOperand(0),
3316               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3317           break;
3318         }
3319       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3320         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3321         auto *O0 = B.CreateZExtOrTrunc(
3322             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3323         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3324         auto *O1 = B.CreateZExtOrTrunc(
3325             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3326 
3327         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3328       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3329         // Don't do anything with the operands, just extend the result.
3330         continue;
3331       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3332         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3333         auto *O0 = B.CreateZExtOrTrunc(
3334             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3335         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3336         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3337       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3338         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3339         auto *O0 = B.CreateZExtOrTrunc(
3340             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3341         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3342       } else {
3343         // If we don't know what to do, be conservative and don't do anything.
3344         continue;
3345       }
3346 
3347       // Lastly, extend the result.
3348       NewI->takeName(cast<Instruction>(I));
3349       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3350       I->replaceAllUsesWith(Res);
3351       cast<Instruction>(I)->eraseFromParent();
3352       Erased.insert(I);
3353       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3354     }
3355   }
3356 
3357   // We'll have created a bunch of ZExts that are now dead. Clean them up.
3358   for (const auto &KV : Cost->getMinimalBitwidths()) {
3359     // If the value wasn't vectorized, we must maintain the original scalar
3360     // type. The absence of the value from VectorLoopValueMap indicates that it
3361     // wasn't vectorized.
3362     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3363       continue;
3364     for (unsigned Part = 0; Part < UF; ++Part) {
3365       Value *I = getOrCreateVectorValue(KV.first, Part);
3366       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3367       if (Inst && Inst->use_empty()) {
3368         Value *NewI = Inst->getOperand(0);
3369         Inst->eraseFromParent();
3370         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3371       }
3372     }
3373   }
3374 }
3375 
3376 void InnerLoopVectorizer::fixVectorizedLoop() {
3377   // Insert truncates and extends for any truncated instructions as hints to
3378   // InstCombine.
3379   if (VF > 1)
3380     truncateToMinimalBitwidths();
3381 
3382   // Fix widened non-induction PHIs by setting up the PHI operands.
3383   if (OrigPHIsToFix.size()) {
3384     assert(EnableVPlanNativePath &&
3385            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3386     fixNonInductionPHIs();
3387   }
3388 
3389   // At this point every instruction in the original loop is widened to a
3390   // vector form. Now we need to fix the recurrences in the loop. These PHI
3391   // nodes are currently empty because we did not want to introduce cycles.
3392   // This is the second stage of vectorizing recurrences.
3393   fixCrossIterationPHIs();
3394 
3395   // Update the dominator tree.
3396   //
3397   // FIXME: After creating the structure of the new loop, the dominator tree is
3398   //        no longer up-to-date, and it remains that way until we update it
3399   //        here. An out-of-date dominator tree is problematic for SCEV,
3400   //        because SCEVExpander uses it to guide code generation. The
3401   //        vectorizer uses SCEVExpanders in several places. Instead, we should
3402   //        keep the dominator tree up-to-date as we go.
3403   updateAnalysis();
3404 
3405   // Fix-up external users of the induction variables.
3406   for (auto &Entry : *Legal->getInductionVars())
3407     fixupIVUsers(Entry.first, Entry.second,
3408                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3409                  IVEndValues[Entry.first], LoopMiddleBlock);
3410 
3411   fixLCSSAPHIs();
3412   for (Instruction *PI : PredicatedInstructions)
3413     sinkScalarOperands(&*PI);
3414 
3415   // Remove redundant induction instructions.
3416   cse(LoopVectorBody);
3417 }
3418 
3419 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3420   // In order to support recurrences we need to be able to vectorize Phi nodes.
3421   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3422   // stage #2: We now need to fix the recurrences by adding incoming edges to
3423   // the currently empty PHI nodes. At this point every instruction in the
3424   // original loop is widened to a vector form so we can use them to construct
3425   // the incoming edges.
3426   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3427     // Handle first-order recurrences and reductions that need to be fixed.
3428     if (Legal->isFirstOrderRecurrence(&Phi))
3429       fixFirstOrderRecurrence(&Phi);
3430     else if (Legal->isReductionVariable(&Phi))
3431       fixReduction(&Phi);
3432   }
3433 }
3434 
3435 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3436   // This is the second phase of vectorizing first-order recurrences. An
3437   // overview of the transformation is described below. Suppose we have the
3438   // following loop.
3439   //
3440   //   for (int i = 0; i < n; ++i)
3441   //     b[i] = a[i] - a[i - 1];
3442   //
3443   // There is a first-order recurrence on "a". For this loop, the shorthand
3444   // scalar IR looks like:
3445   //
3446   //   scalar.ph:
3447   //     s_init = a[-1]
3448   //     br scalar.body
3449   //
3450   //   scalar.body:
3451   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3452   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3453   //     s2 = a[i]
3454   //     b[i] = s2 - s1
3455   //     br cond, scalar.body, ...
3456   //
3457   // In this example, s1 is a recurrence because its value depends on the
3458   // previous iteration. In the first phase of vectorization, we created a
3459   // temporary value for s1. We now complete the vectorization and produce the
3460   // shorthand vector IR shown below (for VF = 4, UF = 1).
3461   //
3462   //   vector.ph:
3463   //     v_init = vector(..., ..., ..., a[-1])
3464   //     br vector.body
3465   //
3466   //   vector.body
3467   //     i = phi [0, vector.ph], [i+4, vector.body]
3468   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3469   //     v2 = a[i, i+1, i+2, i+3];
3470   //     v3 = vector(v1(3), v2(0, 1, 2))
3471   //     b[i, i+1, i+2, i+3] = v2 - v3
3472   //     br cond, vector.body, middle.block
3473   //
3474   //   middle.block:
3475   //     x = v2(3)
3476   //     br scalar.ph
3477   //
3478   //   scalar.ph:
3479   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3480   //     br scalar.body
3481   //
3482   // After execution completes the vector loop, we extract the next value of
3483   // the recurrence (x) to use as the initial value in the scalar loop.
3484 
3485   // Get the original loop preheader and single loop latch.
3486   auto *Preheader = OrigLoop->getLoopPreheader();
3487   auto *Latch = OrigLoop->getLoopLatch();
3488 
3489   // Get the initial and previous values of the scalar recurrence.
3490   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3491   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3492 
3493   // Create a vector from the initial value.
3494   auto *VectorInit = ScalarInit;
3495   if (VF > 1) {
3496     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3497     VectorInit = Builder.CreateInsertElement(
3498         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3499         Builder.getInt32(VF - 1), "vector.recur.init");
3500   }
3501 
3502   // We constructed a temporary phi node in the first phase of vectorization.
3503   // This phi node will eventually be deleted.
3504   Builder.SetInsertPoint(
3505       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3506 
3507   // Create a phi node for the new recurrence. The current value will either be
3508   // the initial value inserted into a vector or loop-varying vector value.
3509   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3510   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3511 
3512   // Get the vectorized previous value of the last part UF - 1. It appears last
3513   // among all unrolled iterations, due to the order of their construction.
3514   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3515 
3516   // Set the insertion point after the previous value if it is an instruction.
3517   // Note that the previous value may have been constant-folded so it is not
3518   // guaranteed to be an instruction in the vector loop. Also, if the previous
3519   // value is a phi node, we should insert after all the phi nodes to avoid
3520   // breaking basic block verification.
3521   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3522       isa<PHINode>(PreviousLastPart))
3523     Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3524   else
3525     Builder.SetInsertPoint(
3526         &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3527 
3528   // We will construct a vector for the recurrence by combining the values for
3529   // the current and previous iterations. This is the required shuffle mask.
3530   SmallVector<Constant *, 8> ShuffleMask(VF);
3531   ShuffleMask[0] = Builder.getInt32(VF - 1);
3532   for (unsigned I = 1; I < VF; ++I)
3533     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
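       // For example (a sketch): with VF = 4 the mask is <3, 4, 5, 6>, i.e. the
       // shuffle takes the last element of the incoming vector followed by the
       // first three elements of the previous value's vector part.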
3534 
3535   // The vector from which to take the initial value for the current iteration
3536   // (actual or unrolled). Initially, this is the vector phi node.
3537   Value *Incoming = VecPhi;
3538 
3539   // Shuffle the current and previous vector and update the vector parts.
3540   for (unsigned Part = 0; Part < UF; ++Part) {
3541     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3542     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3543     auto *Shuffle =
3544         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3545                                              ConstantVector::get(ShuffleMask))
3546                : Incoming;
3547     PhiPart->replaceAllUsesWith(Shuffle);
3548     cast<Instruction>(PhiPart)->eraseFromParent();
3549     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3550     Incoming = PreviousPart;
3551   }
3552 
3553   // Fix the latch value of the new recurrence in the vector loop.
3554   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3555 
3556   // Extract the last vector element in the middle block. This will be the
3557   // initial value for the recurrence when jumping to the scalar loop.
3558   auto *ExtractForScalar = Incoming;
3559   if (VF > 1) {
3560     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3561     ExtractForScalar = Builder.CreateExtractElement(
3562         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3563   }
3564   // Extract the second last element in the middle block if the
3565   // Phi is used outside the loop. We need to extract the phi itself
3566   // and not the last element (the phi update in the current iteration). This
3567   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3568   // when the scalar loop is not run at all.
3569   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3570   if (VF > 1)
3571     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3572         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3573   // When the loop is unrolled without vectorizing, initialize
3574   // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3575   // `Incoming`. This is analogous to the vectorized case above: extracting the
3576   // second-to-last element when VF > 1.
3577   else if (UF > 1)
3578     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3579 
3580   // Fix the initial value of the original recurrence in the scalar loop.
3581   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3582   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3583   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3584     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3585     Start->addIncoming(Incoming, BB);
3586   }
3587 
3588   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3589   Phi->setName("scalar.recur");
3590 
3591   // Finally, fix users of the recurrence outside the loop. The users will need
3592   // either the last value of the scalar recurrence or the last value of the
3593   // vector recurrence we extracted in the middle block. Since the loop is in
3594   // LCSSA form, we just need to find all the phi nodes for the original scalar
3595   // recurrence in the exit block, and then add an edge for the middle block.
3596   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3597     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3598       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3599     }
3600   }
3601 }
3602 
3603 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3604   Constant *Zero = Builder.getInt32(0);
3605 
3606   // Get its reduction variable descriptor.
3607   assert(Legal->isReductionVariable(Phi) &&
3608          "Unable to find the reduction variable");
3609   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3610 
3611   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3612   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3613   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3614   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3615     RdxDesc.getMinMaxRecurrenceKind();
3616   setDebugLocFromInst(Builder, ReductionStartValue);
3617 
3618   // We need to generate a reduction vector from the incoming scalar.
3619   // To do so, we need to generate the 'identity' vector and override
3620   // one of the elements with the incoming scalar reduction. We need
3621   // to do it in the vector-loop preheader.
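       // For example (a sketch): for an integer add reduction with start value
       // %s and VF = 4, the identity vector is <0, 0, 0, 0> and the start vector
       // is <%s, 0, 0, 0>; only the first unroll part uses the start vector.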
3622   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3623 
3624   // This is the vector-clone of the value that leaves the loop.
3625   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3626 
3627   // Find the reduction identity value: zero for addition, or and xor;
3628   // one for multiplication; -1 (all ones) for and.
3629   Value *Identity;
3630   Value *VectorStart;
3631   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3632       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3633     // MinMax reductions have the start value as their identity.
3634     if (VF == 1) {
3635       VectorStart = Identity = ReductionStartValue;
3636     } else {
3637       VectorStart = Identity =
3638         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3639     }
3640   } else {
3641     // Handle other reduction kinds:
3642     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3643         RK, VecTy->getScalarType());
3644     if (VF == 1) {
3645       Identity = Iden;
3646       // In the scalar case (VF == 1) the start value is used directly; there
3647       // is no identity vector to insert it into.
3648       VectorStart = ReductionStartValue;
3649     } else {
3650       Identity = ConstantVector::getSplat(VF, Iden);
3651 
3652       // This vector is the Identity vector where the first element is the
3653       // incoming scalar reduction.
3654       VectorStart =
3655         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3656     }
3657   }
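       // For example (illustrative): for an i32 add reduction with VF == 4 and
       // start value %s, Identity is the <0, 0, 0, 0> splat and VectorStart is
       // that splat with %s inserted into lane 0.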
3658 
3659   // Fix the vector-loop phi.
3660 
3661   // Reductions do not have to start at zero. They can start with
3662   // any loop invariant values.
3663   BasicBlock *Latch = OrigLoop->getLoopLatch();
3664   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3665   for (unsigned Part = 0; Part < UF; ++Part) {
3666     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3667     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3668     // Make sure to add the reduction start value only to the
3669     // first unroll part.
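         // The remaining parts are seeded with the identity value, so combining
         // all UF parts below still yields the correct reduction result.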
3670     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3671     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3672     cast<PHINode>(VecRdxPhi)
3673       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3674   }
3675 
3676   // Before each round, move the insertion point right between
3677   // the PHIs and the values we are going to write.
3678   // This allows us to write both PHINodes and the extractelement
3679   // instructions.
3680   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3681 
3682   setDebugLocFromInst(Builder, LoopExitInst);
3683 
3684   // If the tail is folded by masking, the vector value that leaves the loop
3685   // should be the Select choosing between the vectorized LoopExitInst and the
3686   // vectorized Phi, rather than the vectorized LoopExitInst alone.
3687   if (Cost->foldTailByMasking()) {
3688     for (unsigned Part = 0; Part < UF; ++Part) {
3689       Value *VecLoopExitInst =
3690           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3691       Value *Sel = nullptr;
3692       for (User *U : VecLoopExitInst->users()) {
3693         if (isa<SelectInst>(U)) {
3694           assert(!Sel && "Reduction exit feeding two selects");
3695           Sel = U;
3696         } else
3697           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3698       }
3699       assert(Sel && "Reduction exit feeds no select");
3700       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3701     }
3702   }
3703 
3704   // If the vector reduction can be performed in a smaller type, we truncate
3705   // then extend the loop exit value to enable InstCombine to evaluate the
3706   // entire expression in the smaller type.
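       // For example (illustrative): an i32 add reduction known to need only 8
       // bits is truncated to <VF x i8> and extended back in the loop latch; the
       // parts are then truncated again in the middle block so the final
       // reduction happens in the narrow type and is extended to i32 afterwards.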
3707   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3708     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3709     Builder.SetInsertPoint(
3710         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3711     VectorParts RdxParts(UF);
3712     for (unsigned Part = 0; Part < UF; ++Part) {
3713       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3714       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3715       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3716                                         : Builder.CreateZExt(Trunc, VecTy);
3717       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3718            UI != RdxParts[Part]->user_end();)
3719         if (*UI != Trunc) {
3720           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3721           RdxParts[Part] = Extnd;
3722         } else {
3723           ++UI;
3724         }
3725     }
3726     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3727     for (unsigned Part = 0; Part < UF; ++Part) {
3728       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3729       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3730     }
3731   }
3732 
3733   // Reduce all of the unrolled parts into a single vector.
3734   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3735   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3736 
3737   // The middle block terminator has already been assigned a DebugLoc here (the
3738   // OrigLoop's single latch terminator). We want the whole middle block to
3739   // appear to execute on this line because: (a) it is all compiler generated,
3740   // (b) these instructions are always executed after evaluating the latch
3741   // conditional branch, and (c) other passes may add new predecessors which
3742   // terminate on this line. This is the easiest way to ensure we don't
3743   // accidentally cause an extra step back into the loop while debugging.
3744   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3745   for (unsigned Part = 1; Part < UF; ++Part) {
3746     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3747     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3748       // Floating point operations had to be 'fast' to enable the reduction.
3749       ReducedPartRdx = addFastMathFlag(
3750           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3751                               ReducedPartRdx, "bin.rdx"),
3752           RdxDesc.getFastMathFlags());
3753     else
3754       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3755                                       RdxPart);
3756   }
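       // At this point ReducedPartRdx combines all UF parts; e.g. (illustrative),
       // with UF == 2 and an integer add reduction it is the 'bin.rdx' add of
       // part 1 and part 0.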
3757 
3758   if (VF > 1) {
3759     bool NoNaN = Legal->hasFunNoNaNAttr();
3760     ReducedPartRdx =
3761         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3762     // If the reduction can be performed in a smaller type, we need to extend
3763     // the reduction to the wider type before we branch to the original loop.
3764     if (Phi->getType() != RdxDesc.getRecurrenceType())
3765       ReducedPartRdx =
3766         RdxDesc.isSigned()
3767         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3768         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3769   }
3770 
3771   // Create a phi node that merges control-flow from the backedge-taken check
3772   // block and the middle block.
3773   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3774                                         LoopScalarPreHeader->getTerminator());
3775   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3776     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3777   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3778 
3779   // Now, we need to fix the users of the reduction variable
3780   // inside and outside of the scalar remainder loop.
3781   // We know that the loop is in LCSSA form. We need to update the
3782   // PHI nodes in the exit blocks.
3783   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3784     // All PHINodes need to have a single entry edge, or two if
3785     // we already fixed them.
3786     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3787 
3788     // We found a reduction value exit-PHI. Update it with the
3789     // incoming bypass edge.
3790     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3791       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3792   } // end of the LCSSA phi scan.
3793 
3794   // Fix the scalar loop reduction variable with the incoming reduction sum
3795   // from the vector body and from the backedge value.
3796   int IncomingEdgeBlockIdx =
3797     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3798   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3799   // Pick the other block.
3800   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3801   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3802   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3803 }
3804 
3805 void InnerLoopVectorizer::fixLCSSAPHIs() {
3806   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3807     if (LCSSAPhi.getNumIncomingValues() == 1) {
3808       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3809       // Non-instruction incoming values are uniform and need only lane zero.
3810       unsigned LastLane = 0;
3811       if (isa<Instruction>(IncomingValue))
3812           LastLane = Cost->isUniformAfterVectorization(
3813                          cast<Instruction>(IncomingValue), VF)
3814                          ? 0
3815                          : VF - 1;
3816       // Can be a loop invariant incoming value or the last scalar value to be
3817       // extracted from the vectorized loop.
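           // For example (illustrative): with VF == 4, a non-uniform incoming
           // value is taken from lane 3 of the last unroll part below.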
3818       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3819       Value *lastIncomingValue =
3820           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3821       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3822     }
3823   }
3824 }
3825 
3826 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3827   // The basic block and loop containing the predicated instruction.
3828   auto *PredBB = PredInst->getParent();
3829   auto *VectorLoop = LI->getLoopFor(PredBB);
3830 
3831   // Initialize a worklist with the operands of the predicated instruction.
3832   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3833 
3834   // Holds instructions that we need to analyze again. An instruction may be
3835   // reanalyzed if we don't yet know if we can sink it or not.
3836   SmallVector<Instruction *, 8> InstsToReanalyze;
3837 
3838   // Returns true if a given use occurs in the predicated block. Phi nodes use
3839   // their operands in their corresponding predecessor blocks.
3840   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3841     auto *I = cast<Instruction>(U.getUser());
3842     BasicBlock *BB = I->getParent();
3843     if (auto *Phi = dyn_cast<PHINode>(I))
3844       BB = Phi->getIncomingBlock(
3845           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3846     return BB == PredBB;
3847   };
3848 
3849   // Iteratively sink the scalarized operands of the predicated instruction
3850   // into the block we created for it. When an instruction is sunk, its
3851   // operands are then added to the worklist. The algorithm ends after one pass
3852   // through the worklist doesn't sink a single instruction.
3853   bool Changed;
3854   do {
3855     // Add the instructions that need to be reanalyzed to the worklist, and
3856     // reset the changed indicator.
3857     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3858     InstsToReanalyze.clear();
3859     Changed = false;
3860 
3861     while (!Worklist.empty()) {
3862       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3863 
3864       // We can't sink an instruction if it is a phi node, is already in the
3865       // predicated block, is not in the loop, or may have side effects.
3866       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3867           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3868         continue;
3869 
3870       // It's legal to sink the instruction if all its uses occur in the
3871       // predicated block. Otherwise, there's nothing to do yet, and we may
3872       // need to reanalyze the instruction.
3873       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3874         InstsToReanalyze.push_back(I);
3875         continue;
3876       }
3877 
3878       // Move the instruction to the beginning of the predicated block, and add
3879       // its operands to the worklist.
3880       I->moveBefore(&*PredBB->getFirstInsertionPt());
3881       Worklist.insert(I->op_begin(), I->op_end());
3882 
3883       // The sinking may have enabled other instructions to be sunk, so we will
3884       // need to iterate.
3885       Changed = true;
3886     }
3887   } while (Changed);
3888 }
3889 
3890 void InnerLoopVectorizer::fixNonInductionPHIs() {
3891   for (PHINode *OrigPhi : OrigPHIsToFix) {
3892     PHINode *NewPhi =
3893         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3894     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3895 
3896     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3897         predecessors(OrigPhi->getParent()));
3898     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3899         predecessors(NewPhi->getParent()));
3900     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3901            "Scalar and Vector BB should have the same number of predecessors");
3902 
3903     // The insertion point in Builder may be invalidated by the time we get
3904     // here. Force the Builder insertion point to something valid so that we do
3905     // not run into issues during insertion point restore in
3906     // getOrCreateVectorValue calls below.
3907     Builder.SetInsertPoint(NewPhi);
3908 
3909     // The predecessor order is preserved, so we can rely on the mapping
3910     // between scalar and vector block predecessors.
3911     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3912       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3913 
3914       // When looking up the new scalar/vector values to fix up, use incoming
3915       // values from original phi.
3916       Value *ScIncV =
3917           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3918 
3919       // The scalar incoming value may need a broadcast.
3920       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3921       NewPhi->addIncoming(NewIncV, NewPredBB);
3922     }
3923   }
3924 }
3925 
3926 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3927                                               unsigned VF) {
3928   PHINode *P = cast<PHINode>(PN);
3929   if (EnableVPlanNativePath) {
3930     // Currently we enter here in the VPlan-native path for non-induction
3931     // PHIs where all control flow is uniform. We simply widen these PHIs.
3932     // Create a vector phi with no operands - the vector phi operands will be
3933     // set at the end of vector code generation.
3934     Type *VecTy =
3935         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3936     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3937     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3938     OrigPHIsToFix.push_back(P);
3939 
3940     return;
3941   }
3942 
3943   assert(PN->getParent() == OrigLoop->getHeader() &&
3944          "Non-header phis should have been handled elsewhere");
3945 
3946   // In order to support recurrences we need to be able to vectorize Phi nodes.
3947   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3948   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3949   // this value when we vectorize all of the instructions that use the PHI.
3950   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3951     for (unsigned Part = 0; Part < UF; ++Part) {
3952       // This is phase one of vectorizing PHIs.
3953       Type *VecTy =
3954           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3955       Value *EntryPart = PHINode::Create(
3956           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3957       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3958     }
3959     return;
3960   }
3961 
3962   setDebugLocFromInst(Builder, P);
3963 
3964   // This PHINode must be an induction variable.
3965   // Make sure that we know about it.
3966   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3967 
3968   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3969   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3970 
3971   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3972   // which can be found from the original scalar operations.
3973   switch (II.getKind()) {
3974   case InductionDescriptor::IK_NoInduction:
3975     llvm_unreachable("Unknown induction");
3976   case InductionDescriptor::IK_IntInduction:
3977   case InductionDescriptor::IK_FpInduction:
3978     llvm_unreachable("Integer/fp induction is handled elsewhere.");
3979   case InductionDescriptor::IK_PtrInduction: {
3980     // Handle the pointer induction variable case.
3981     assert(P->getType()->isPointerTy() && "Unexpected type.");
3982     // This is the normalized GEP that starts counting at zero.
3983     Value *PtrInd = Induction;
3984     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3985     // Determine the number of scalars we need to generate for each unroll
3986     // iteration. If the instruction is uniform, we only need to generate the
3987     // first lane. Otherwise, we generate all VF values.
3988     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3989     // These are the scalar results. Notice that we don't generate vector GEPs
3990     // because scalar GEPs result in better code.
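         // For example (illustrative): with VF == 4 and UF == 2, part 1 produces
         // the scalars at indices PtrInd + 4 .. PtrInd + 7 (Lane + Part * VF),
         // each turned into a 'next.gep' through the induction descriptor.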
3991     for (unsigned Part = 0; Part < UF; ++Part) {
3992       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3993         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3994         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3995         Value *SclrGep =
3996             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3997         SclrGep->setName("next.gep");
3998         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3999       }
4000     }
4001     return;
4002   }
4003   }
4004 }
4005 
4006 /// A helper function for checking whether an integer division-related
4007 /// instruction may divide by zero (in which case it must be predicated if
4008 /// executed conditionally in the scalar code).
4009 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4010 /// Non-zero divisors that are not compile-time constants will not be
4011 /// converted into multiplication, so we will still end up scalarizing
4012 /// the division, but can do so w/o predication.
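     /// For example, 'udiv i32 %a, %b' with a non-constant %b is treated as
     /// possibly dividing by zero, while 'udiv i32 %a, 7' is not.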
4013 static bool mayDivideByZero(Instruction &I) {
4014   assert((I.getOpcode() == Instruction::UDiv ||
4015           I.getOpcode() == Instruction::SDiv ||
4016           I.getOpcode() == Instruction::URem ||
4017           I.getOpcode() == Instruction::SRem) &&
4018          "Unexpected instruction");
4019   Value *Divisor = I.getOperand(1);
4020   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4021   return !CInt || CInt->isZero();
4022 }
4023 
4024 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4025   switch (I.getOpcode()) {
4026   case Instruction::Br:
4027   case Instruction::PHI:
4028     llvm_unreachable("This instruction is handled by a different recipe.");
4029   case Instruction::GetElementPtr: {
4030     // Construct a vector GEP by widening the operands of the scalar GEP as
4031     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4032     // results in a vector of pointers when at least one operand of the GEP
4033     // is vector-typed. Thus, to keep the representation compact, we only use
4034     // vector-typed operands for loop-varying values.
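         // For example (illustrative): for VF == 4, a scalar GEP
         //   getelementptr i32, i32* %base, i64 %iv
         // with a loop-varying %iv is widened into
         //   getelementptr i32, i32* %base, <4 x i64> %vec.iv
         // which produces a <4 x i32*> vector of pointers.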
4035     auto *GEP = cast<GetElementPtrInst>(&I);
4036 
4037     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4038       // If we are vectorizing, but the GEP has only loop-invariant operands,
4039       // the GEP we build (by only using vector-typed operands for
4040       // loop-varying values) would be a scalar pointer. Thus, to ensure we
4041       // produce a vector of pointers, we need to either arbitrarily pick an
4042       // operand to broadcast, or broadcast a clone of the original GEP.
4043       // Here, we broadcast a clone of the original.
4044       //
4045       // TODO: If at some point we decide to scalarize instructions having
4046       //       loop-invariant operands, this special case will no longer be
4047       //       required. We would add the scalarization decision to
4048       //       collectLoopScalars() and teach getVectorValue() to broadcast
4049       //       the lane-zero scalar value.
4050       auto *Clone = Builder.Insert(GEP->clone());
4051       for (unsigned Part = 0; Part < UF; ++Part) {
4052         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4053         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4054         addMetadata(EntryPart, GEP);
4055       }
4056     } else {
4057       // If the GEP has at least one loop-varying operand, we are sure to
4058       // produce a vector of pointers. But if we are only unrolling, we want
4059       // to produce a scalar GEP for each unroll part. Thus, the GEP we
4060       // produce with the code below will be scalar (if VF == 1) or vector
4061       // (otherwise). Note that for the unroll-only case, we still maintain
4062       // values in the vector value map, as we do for other
4063       // instructions.
4064       for (unsigned Part = 0; Part < UF; ++Part) {
4065         // The pointer operand of the new GEP. If it's loop-invariant, we
4066         // won't broadcast it.
4067         auto *Ptr =
4068             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4069                 ? GEP->getPointerOperand()
4070                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4071 
4072         // Collect all the indices for the new GEP. If any index is
4073         // loop-invariant, we won't broadcast it.
4074         SmallVector<Value *, 4> Indices;
4075         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4076           if (OrigLoop->isLoopInvariant(U.get()))
4077             Indices.push_back(U.get());
4078           else
4079             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4080         }
4081 
4082         // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4083         // but it should be a vector, otherwise.
4084         auto *NewGEP =
4085             GEP->isInBounds()
4086                 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4087                                             Indices)
4088                 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4089         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4090                "NewGEP is not a pointer vector");
4091         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4092         addMetadata(NewGEP, GEP);
4093       }
4094     }
4095 
4096     break;
4097   }
4098   case Instruction::UDiv:
4099   case Instruction::SDiv:
4100   case Instruction::SRem:
4101   case Instruction::URem:
4102   case Instruction::Add:
4103   case Instruction::FAdd:
4104   case Instruction::Sub:
4105   case Instruction::FSub:
4106   case Instruction::FNeg:
4107   case Instruction::Mul:
4108   case Instruction::FMul:
4109   case Instruction::FDiv:
4110   case Instruction::FRem:
4111   case Instruction::Shl:
4112   case Instruction::LShr:
4113   case Instruction::AShr:
4114   case Instruction::And:
4115   case Instruction::Or:
4116   case Instruction::Xor: {
4117     // Just widen unops and binops.
4118     setDebugLocFromInst(Builder, &I);
4119 
4120     for (unsigned Part = 0; Part < UF; ++Part) {
4121       SmallVector<Value *, 2> Ops;
4122       for (Value *Op : I.operands())
4123         Ops.push_back(getOrCreateVectorValue(Op, Part));
4124 
4125       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4126 
4127       if (auto *VecOp = dyn_cast<Instruction>(V))
4128         VecOp->copyIRFlags(&I);
4129 
4130       // Use this vector value for all users of the original instruction.
4131       VectorLoopValueMap.setVectorValue(&I, Part, V);
4132       addMetadata(V, &I);
4133     }
4134 
4135     break;
4136   }
4137   case Instruction::Select: {
4138     // Widen selects.
4139     // If the selector is loop invariant we can create a select
4140     // instruction with a scalar condition. Otherwise, use vector-select.
4141     auto *SE = PSE.getSE();
4142     bool InvariantCond =
4143         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4144     setDebugLocFromInst(Builder, &I);
4145 
4146     // The condition can be loop invariant but still defined inside the
4147     // loop. This means that we can't just use the original 'cond' value.
4148     // We have to take the 'vectorized' value and pick the first lane.
4149     // Instcombine will make this a no-op.
4150 
4151     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4152 
4153     for (unsigned Part = 0; Part < UF; ++Part) {
4154       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4155       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4156       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4157       Value *Sel =
4158           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4159       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4160       addMetadata(Sel, &I);
4161     }
4162 
4163     break;
4164   }
4165 
4166   case Instruction::ICmp:
4167   case Instruction::FCmp: {
4168     // Widen compares. Generate vector compares.
4169     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4170     auto *Cmp = dyn_cast<CmpInst>(&I);
4171     setDebugLocFromInst(Builder, Cmp);
4172     for (unsigned Part = 0; Part < UF; ++Part) {
4173       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4174       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4175       Value *C = nullptr;
4176       if (FCmp) {
4177         // Propagate fast math flags.
4178         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4179         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4180         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4181       } else {
4182         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4183       }
4184       VectorLoopValueMap.setVectorValue(&I, Part, C);
4185       addMetadata(C, &I);
4186     }
4187 
4188     break;
4189   }
4190 
4191   case Instruction::ZExt:
4192   case Instruction::SExt:
4193   case Instruction::FPToUI:
4194   case Instruction::FPToSI:
4195   case Instruction::FPExt:
4196   case Instruction::PtrToInt:
4197   case Instruction::IntToPtr:
4198   case Instruction::SIToFP:
4199   case Instruction::UIToFP:
4200   case Instruction::Trunc:
4201   case Instruction::FPTrunc:
4202   case Instruction::BitCast: {
4203     auto *CI = dyn_cast<CastInst>(&I);
4204     setDebugLocFromInst(Builder, CI);
4205 
4206     // Vectorize casts.
4207     Type *DestTy =
4208         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4209 
4210     for (unsigned Part = 0; Part < UF; ++Part) {
4211       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4212       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4213       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4214       addMetadata(Cast, &I);
4215     }
4216     break;
4217   }
4218 
4219   case Instruction::Call: {
4220     // Ignore dbg intrinsics.
4221     if (isa<DbgInfoIntrinsic>(I))
4222       break;
4223     setDebugLocFromInst(Builder, &I);
4224 
4225     Module *M = I.getParent()->getParent()->getParent();
4226     auto *CI = cast<CallInst>(&I);
4227 
4228     StringRef FnName = CI->getCalledFunction()->getName();
4229     Function *F = CI->getCalledFunction();
4230     Type *RetTy = ToVectorTy(CI->getType(), VF);
4231     SmallVector<Type *, 4> Tys;
4232     for (Value *ArgOperand : CI->arg_operands())
4233       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4234 
4235     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4236 
4237     // Decide whether to widen the call with a vector intrinsic or with a call
4238     // to a vectorized library function, i.e. check whether calling the
4239     // intrinsic is at least as cheap as the library call.
4240     bool NeedToScalarize;
4241     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4242     bool UseVectorIntrinsic =
4243         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4244     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4245            "Instruction should be scalarized elsewhere.");
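         // For example (illustrative): with VF == 4, a call to llvm.sqrt.f32 can
         // be widened to llvm.sqrt.v4f32 when the intrinsic cost is no higher
         // than the cost of calling a vectorized library function.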
4246 
4247     for (unsigned Part = 0; Part < UF; ++Part) {
4248       SmallVector<Value *, 4> Args;
4249       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4250         Value *Arg = CI->getArgOperand(i);
4251         // Some intrinsics have a scalar argument - don't replace it with a
4252         // vector.
4253         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4254           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4255         Args.push_back(Arg);
4256       }
4257 
4258       Function *VectorF;
4259       if (UseVectorIntrinsic) {
4260         // Use vector version of the intrinsic.
4261         Type *TysForDecl[] = {CI->getType()};
4262         if (VF > 1)
4263           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4264         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4265       } else {
4266         // Use vector version of the library call.
4267         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4268         assert(!VFnName.empty() && "Vector function name is empty.");
4269         VectorF = M->getFunction(VFnName);
4270         if (!VectorF) {
4271           // Generate a declaration
4272           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4273           VectorF =
4274               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4275           VectorF->copyAttributesFrom(F);
4276         }
4277       }
4278       assert(VectorF && "Can't create vector function.");
4279 
4280       SmallVector<OperandBundleDef, 1> OpBundles;
4281       CI->getOperandBundlesAsDefs(OpBundles);
4282       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4283 
4284       if (isa<FPMathOperator>(V))
4285         V->copyFastMathFlags(CI);
4286 
4287       VectorLoopValueMap.setVectorValue(&I, Part, V);
4288       addMetadata(V, &I);
4289     }
4290 
4291     break;
4292   }
4293 
4294   default:
4295     // This instruction is not vectorized by simple widening.
4296     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4297     llvm_unreachable("Unhandled instruction!");
4298   } // end of switch.
4299 }
4300 
4301 void InnerLoopVectorizer::updateAnalysis() {
4302   // Forget the original basic block.
4303   PSE.getSE()->forgetLoop(OrigLoop);
4304 
4305   // DT is not kept up-to-date for outer loop vectorization
4306   if (EnableVPlanNativePath)
4307     return;
4308 
4309   // Update the dominator tree information.
4310   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4311          "Entry does not dominate exit.");
4312 
4313   DT->addNewBlock(LoopMiddleBlock,
4314                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4315   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4316   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4317   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4318   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4319 }
4320 
4321 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4322   // We should not collect Scalars more than once per VF. Right now, this
4323   // function is called from collectUniformsAndScalars(), which already does
4324   // this check. Collecting Scalars for VF=1 does not make any sense.
4325   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4326          "This function should not be visited twice for the same VF");
4327 
4328   SmallSetVector<Instruction *, 8> Worklist;
4329 
4330   // These sets are used to seed the analysis with pointers used by memory
4331   // accesses that will remain scalar.
4332   SmallSetVector<Instruction *, 8> ScalarPtrs;
4333   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4334 
4335   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4336   // The pointer operands of loads and stores will be scalar as long as the
4337   // memory access is not a gather or scatter operation. The value operand of a
4338   // store will remain scalar if the store is scalarized.
4339   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4340     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4341     assert(WideningDecision != CM_Unknown &&
4342            "Widening decision should be ready at this moment");
4343     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4344       if (Ptr == Store->getValueOperand())
4345         return WideningDecision == CM_Scalarize;
4346     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4347            "Ptr is neither a value or pointer operand");
4348     return WideningDecision != CM_GatherScatter;
4349   };
4350 
4351   // A helper that returns true if the given value is a bitcast or
4352   // getelementptr instruction contained in the loop.
4353   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4354     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4355             isa<GetElementPtrInst>(V)) &&
4356            !TheLoop->isLoopInvariant(V);
4357   };
4358 
4359   // A helper that evaluates a memory access's use of a pointer. If the use
4360   // will be a scalar use, and the pointer is only used by memory accesses, we
4361   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4362   // PossibleNonScalarPtrs.
4363   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4364     // We only care about bitcast and getelementptr instructions contained in
4365     // the loop.
4366     if (!isLoopVaryingBitCastOrGEP(Ptr))
4367       return;
4368 
4369     // If the pointer has already been identified as scalar (e.g., if it was
4370     // also identified as uniform), there's nothing to do.
4371     auto *I = cast<Instruction>(Ptr);
4372     if (Worklist.count(I))
4373       return;
4374 
4375     // If the use of the pointer will be a scalar use, and all users of the
4376     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4377     // place the pointer in PossibleNonScalarPtrs.
4378     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4379           return isa<LoadInst>(U) || isa<StoreInst>(U);
4380         }))
4381       ScalarPtrs.insert(I);
4382     else
4383       PossibleNonScalarPtrs.insert(I);
4384   };
4385 
4386   // We seed the scalars analysis with three classes of instructions: (1)
4387   // instructions marked uniform-after-vectorization, (2) bitcast and
4388   // getelementptr instructions used by memory accesses requiring a scalar use,
4389   // and (3) pointer induction variables and their update instructions (we
4390   // currently only scalarize these).
4391   //
4392   // (1) Add to the worklist all instructions that have been identified as
4393   // uniform-after-vectorization.
4394   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4395 
4396   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4397   // memory accesses requiring a scalar use. The pointer operands of loads and
4398   // stores will be scalar as long as the memory access is not a gather or
4399   // scatter operation. The value operand of a store will remain scalar if the
4400   // store is scalarized.
4401   for (auto *BB : TheLoop->blocks())
4402     for (auto &I : *BB) {
4403       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4404         evaluatePtrUse(Load, Load->getPointerOperand());
4405       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4406         evaluatePtrUse(Store, Store->getPointerOperand());
4407         evaluatePtrUse(Store, Store->getValueOperand());
4408       }
4409     }
4410   for (auto *I : ScalarPtrs)
4411     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4412       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4413       Worklist.insert(I);
4414     }
4415 
4416   // (3) Add to the worklist all pointer induction variables and their update
4417   // instructions.
4418   //
4419   // TODO: Once we are able to vectorize pointer induction variables we should
4420   //       no longer insert them into the worklist here.
4421   auto *Latch = TheLoop->getLoopLatch();
4422   for (auto &Induction : *Legal->getInductionVars()) {
4423     auto *Ind = Induction.first;
4424     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4425     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4426       continue;
4427     Worklist.insert(Ind);
4428     Worklist.insert(IndUpdate);
4429     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4430     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4431                       << "\n");
4432   }
4433 
4434   // Insert the forced scalars.
4435   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4436   // induction variable when the PHI user is scalarized.
4437   auto ForcedScalar = ForcedScalars.find(VF);
4438   if (ForcedScalar != ForcedScalars.end())
4439     for (auto *I : ForcedScalar->second)
4440       Worklist.insert(I);
4441 
4442   // Expand the worklist by looking through any bitcasts and getelementptr
4443   // instructions we've already identified as scalar. This is similar to the
4444   // expansion step in collectLoopUniforms(); however, here we're only
4445   // expanding to include additional bitcasts and getelementptr instructions.
4446   unsigned Idx = 0;
4447   while (Idx != Worklist.size()) {
4448     Instruction *Dst = Worklist[Idx++];
4449     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4450       continue;
4451     auto *Src = cast<Instruction>(Dst->getOperand(0));
4452     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4453           auto *J = cast<Instruction>(U);
4454           return !TheLoop->contains(J) || Worklist.count(J) ||
4455                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4456                   isScalarUse(J, Src));
4457         })) {
4458       Worklist.insert(Src);
4459       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4460     }
4461   }
4462 
4463   // An induction variable will remain scalar if all users of the induction
4464   // variable and induction variable update remain scalar.
4465   for (auto &Induction : *Legal->getInductionVars()) {
4466     auto *Ind = Induction.first;
4467     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4468 
4469     // We already considered pointer induction variables, so there's no reason
4470     // to look at their users again.
4471     //
4472     // TODO: Once we are able to vectorize pointer induction variables we
4473     //       should no longer skip over them here.
4474     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4475       continue;
4476 
4477     // Determine if all users of the induction variable are scalar after
4478     // vectorization.
4479     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4480       auto *I = cast<Instruction>(U);
4481       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4482     });
4483     if (!ScalarInd)
4484       continue;
4485 
4486     // Determine if all users of the induction variable update instruction are
4487     // scalar after vectorization.
4488     auto ScalarIndUpdate =
4489         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4490           auto *I = cast<Instruction>(U);
4491           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4492         });
4493     if (!ScalarIndUpdate)
4494       continue;
4495 
4496     // The induction variable and its update instruction will remain scalar.
4497     Worklist.insert(Ind);
4498     Worklist.insert(IndUpdate);
4499     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4500     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4501                       << "\n");
4502   }
4503 
4504   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4505 }
4506 
4507 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4508   if (!blockNeedsPredication(I->getParent()))
4509     return false;
4510   switch(I->getOpcode()) {
4511   default:
4512     break;
4513   case Instruction::Load:
4514   case Instruction::Store: {
4515     if (!Legal->isMaskRequired(I))
4516       return false;
4517     auto *Ptr = getLoadStorePointerOperand(I);
4518     auto *Ty = getMemInstValueType(I);
4519     // We have already decided how to vectorize this instruction, get that
4520     // result.
4521     if (VF > 1) {
4522       InstWidening WideningDecision = getWideningDecision(I, VF);
4523       assert(WideningDecision != CM_Unknown &&
4524              "Widening decision should be ready at this moment");
4525       return WideningDecision == CM_Scalarize;
4526     }
4527     return isa<LoadInst>(I) ?
4528         !(isLegalMaskedLoad(Ty, Ptr)  || isLegalMaskedGather(Ty))
4529       : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4530   }
4531   case Instruction::UDiv:
4532   case Instruction::SDiv:
4533   case Instruction::SRem:
4534   case Instruction::URem:
4535     return mayDivideByZero(*I);
4536   }
4537   return false;
4538 }
4539 
4540 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4541                                                                unsigned VF) {
4542   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4543   assert(getWideningDecision(I, VF) == CM_Unknown &&
4544          "Decision should not be set yet.");
4545   auto *Group = getInterleavedAccessGroup(I);
4546   assert(Group && "Must have a group.");
4547 
4548   // If the instruction's allocated size doesn't equal its type size, it
4549   // requires padding and will be scalarized.
4550   auto &DL = I->getModule()->getDataLayout();
4551   auto *ScalarTy = getMemInstValueType(I);
4552   if (hasIrregularType(ScalarTy, DL, VF))
4553     return false;
4554 
4555   // Check if masking is required.
4556   // A Group may need masking for one of two reasons: it resides in a block that
4557   // needs predication, or it was decided to use masking to deal with gaps.
4558   bool PredicatedAccessRequiresMasking =
4559       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4560   bool AccessWithGapsRequiresMasking =
4561       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4562   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4563     return true;
4564 
4565   // If masked interleaving is required, we expect that the user/target had
4566   // enabled it, because otherwise it either wouldn't have been created or
4567   // it should have been invalidated by the CostModel.
4568   assert(useMaskedInterleavedAccesses(TTI) &&
4569          "Masked interleave-groups for predicated accesses are not enabled.");
4570 
4571   auto *Ty = getMemInstValueType(I);
4572   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4573                           : TTI.isLegalMaskedStore(Ty);
4574 }
4575 
4576 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4577                                                                unsigned VF) {
4578   // Get and ensure we have a valid memory instruction.
4579   LoadInst *LI = dyn_cast<LoadInst>(I);
4580   StoreInst *SI = dyn_cast<StoreInst>(I);
4581   assert((LI || SI) && "Invalid memory instruction");
4582 
4583   auto *Ptr = getLoadStorePointerOperand(I);
4584 
4585   // In order to be widened, the pointer should be consecutive, first of all.
4586   if (!Legal->isConsecutivePtr(Ptr))
4587     return false;
4588 
4589   // If the instruction is a store located in a predicated block, it will be
4590   // scalarized.
4591   if (isScalarWithPredication(I))
4592     return false;
4593 
4594   // If the instruction's allocated size doesn't equal its type size, it
4595   // requires padding and will be scalarized.
4596   auto &DL = I->getModule()->getDataLayout();
4597   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4598   if (hasIrregularType(ScalarTy, DL, VF))
4599     return false;
4600 
4601   return true;
4602 }
4603 
4604 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4605   // We should not collect Uniforms more than once per VF. Right now,
4606   // this function is called from collectUniformsAndScalars(), which
4607   // already does this check. Collecting Uniforms for VF=1 does not make any
4608   // sense.
4609 
4610   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4611          "This function should not be visited twice for the same VF");
4612 
4613   // Create the entry for this VF up front: even if no uniform value is found,
4614   // we will not analyze again, and Uniforms.count(VF) will return 1.
4615   Uniforms[VF].clear();
4616 
4617   // We now know that the loop is vectorizable!
4618   // Collect instructions inside the loop that will remain uniform after
4619   // vectorization.
4620 
4621   // Global values, params and instructions outside of current loop are out of
4622   // scope.
4623   auto isOutOfScope = [&](Value *V) -> bool {
4624     Instruction *I = dyn_cast<Instruction>(V);
4625     return (!I || !TheLoop->contains(I));
4626   };
4627 
4628   SetVector<Instruction *> Worklist;
4629   BasicBlock *Latch = TheLoop->getLoopLatch();
4630 
4631   // Start with the conditional branch. If the branch condition is an
4632   // instruction contained in the loop that is only used by the branch, it is
4633   // uniform.
4634   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4635   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4636     Worklist.insert(Cmp);
4637     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4638   }
4639 
4640   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4641   // are pointers that are treated like consecutive pointers during
4642   // vectorization. The pointer operands of interleaved accesses are an
4643   // example.
4644   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4645 
4646   // Holds pointer operands of instructions that are possibly non-uniform.
4647   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4648 
4649   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4650     InstWidening WideningDecision = getWideningDecision(I, VF);
4651     assert(WideningDecision != CM_Unknown &&
4652            "Widening decision should be ready at this moment");
4653 
4654     return (WideningDecision == CM_Widen ||
4655             WideningDecision == CM_Widen_Reverse ||
4656             WideningDecision == CM_Interleave);
4657   };
4658   // Iterate over the instructions in the loop, and collect all
4659   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4660   // that a consecutive-like pointer operand will be scalarized, we collect it
4661   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4662   // getelementptr instruction can be used by both vectorized and scalarized
4663   // memory instructions. For example, if a loop loads and stores from the same
4664   // location, but the store is conditional, the store will be scalarized, and
4665   // the getelementptr won't remain uniform.
4666   for (auto *BB : TheLoop->blocks())
4667     for (auto &I : *BB) {
4668       // If there's no pointer operand, there's nothing to do.
4669       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4670       if (!Ptr)
4671         continue;
4672 
4673       // True if all users of Ptr are memory accesses that have Ptr as their
4674       // pointer operand.
4675       auto UsersAreMemAccesses =
4676           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4677             return getLoadStorePointerOperand(U) == Ptr;
4678           });
4679 
4680       // Ensure the memory instruction will not be scalarized or used by
4681       // gather/scatter, making its pointer operand non-uniform. If the pointer
4682       // operand is used by any instruction other than a memory access, we
4683       // conservatively assume the pointer operand may be non-uniform.
4684       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4685         PossibleNonUniformPtrs.insert(Ptr);
4686 
4687       // If the memory instruction will be vectorized and its pointer operand
4688       // is consecutive-like, or interleaving - the pointer operand should
4689       // remain uniform.
4690       else
4691         ConsecutiveLikePtrs.insert(Ptr);
4692     }
4693 
4694   // Add to the Worklist all consecutive and consecutive-like pointers that
4695   // aren't also identified as possibly non-uniform.
4696   for (auto *V : ConsecutiveLikePtrs)
4697     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4698       LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4699       Worklist.insert(V);
4700     }
4701 
4702   // Expand Worklist in topological order: whenever a new instruction
4703   // is added, its users should already be inside Worklist. This ensures
4704   // a uniform instruction will only be used by uniform instructions.
4705   unsigned idx = 0;
4706   while (idx != Worklist.size()) {
4707     Instruction *I = Worklist[idx++];
4708 
4709     for (auto OV : I->operand_values()) {
4710       // isOutOfScope operands cannot be uniform instructions.
4711       if (isOutOfScope(OV))
4712         continue;
4713       // First order recurrence Phi's should typically be considered
4714       // non-uniform.
4715       auto *OP = dyn_cast<PHINode>(OV);
4716       if (OP && Legal->isFirstOrderRecurrence(OP))
4717         continue;
4718       // If all the users of the operand are uniform, then add the
4719       // operand into the uniform worklist.
4720       auto *OI = cast<Instruction>(OV);
4721       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4722             auto *J = cast<Instruction>(U);
4723             return Worklist.count(J) ||
4724                    (OI == getLoadStorePointerOperand(J) &&
4725                     isUniformDecision(J, VF));
4726           })) {
4727         Worklist.insert(OI);
4728         LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4729       }
4730     }
4731   }
4732 
4733   // Returns true if Ptr is the pointer operand of a memory access instruction
4734   // I, and I is known to not require scalarization.
4735   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4736     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4737   };
4738 
4739   // For an instruction to be added into Worklist above, all its users inside
4740   // the loop should also be in Worklist. However, this condition cannot be
4741   // true for phi nodes that form a cyclic dependence. We must process phi
4742   // nodes separately. An induction variable will remain uniform if all users
4743   // of the induction variable and induction variable update remain uniform.
4744   // The code below handles both pointer and non-pointer induction variables.
4745   for (auto &Induction : *Legal->getInductionVars()) {
4746     auto *Ind = Induction.first;
4747     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4748 
4749     // Determine if all users of the induction variable are uniform after
4750     // vectorization.
4751     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4752       auto *I = cast<Instruction>(U);
4753       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4754              isVectorizedMemAccessUse(I, Ind);
4755     });
4756     if (!UniformInd)
4757       continue;
4758 
4759     // Determine if all users of the induction variable update instruction are
4760     // uniform after vectorization.
4761     auto UniformIndUpdate =
4762         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4763           auto *I = cast<Instruction>(U);
4764           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4765                  isVectorizedMemAccessUse(I, IndUpdate);
4766         });
4767     if (!UniformIndUpdate)
4768       continue;
4769 
4770     // The induction variable and its update instruction will remain uniform.
4771     Worklist.insert(Ind);
4772     Worklist.insert(IndUpdate);
4773     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4774     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4775                       << "\n");
4776   }
4777 
4778   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4779 }
4780 
4781 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4782   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4783 
4784   if (Legal->getRuntimePointerChecking()->Need) {
4785     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4786         "runtime pointer checks needed. Enable vectorization of this "
4787         "loop with '#pragma clang loop vectorize(enable)' when "
4788         "compiling with -Os/-Oz",
4789         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4790     return true;
4791   }
4792 
4793   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4794     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4795         "runtime SCEV checks needed. Enable vectorization of this "
4796         "loop with '#pragma clang loop vectorize(enable)' when "
4797         "compiling with -Os/-Oz",
4798         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4799     return true;
4800   }
4801 
4802   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4803   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4804     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4805         "runtime stride == 1 checks needed. Enable vectorization of "
4806         "this loop with '#pragma clang loop vectorize(enable)' when "
4807         "compiling with -Os/-Oz",
4808         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4809     return true;
4810   }
4811 
4812   return false;
4813 }
4814 
4815 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4816   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4817     // TODO: It may be useful to do this, since the check is still likely to be
4818     // dynamically uniform if the target can skip it.
4819     reportVectorizationFailure(
4820         "Not inserting runtime ptr check for divergent target",
4821         "runtime pointer checks needed. Not enabled for divergent target",
4822         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4823     return None;
4824   }
4825 
4826   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4827   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4828   if (TC == 1) {
4829     reportVectorizationFailure("Single iteration (non) loop",
4830         "loop trip count is one, irrelevant for vectorization",
4831         "SingleIterationLoop", ORE, TheLoop);
4832     return None;
4833   }
4834 
4835   switch (ScalarEpilogueStatus) {
4836   case CM_ScalarEpilogueAllowed:
4837     return computeFeasibleMaxVF(TC);
4838   case CM_ScalarEpilogueNotNeededUsePredicate:
4839     LLVM_DEBUG(
4840         dbgs() << "LV: vector predicate hint/switch found.\n"
4841                << "LV: Not allowing scalar epilogue, creating predicated "
4842                << "vector loop.\n");
4843     break;
4844   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4845     // fallthrough as a special case of OptForSize
4846   case CM_ScalarEpilogueNotAllowedOptSize:
4847     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4848       LLVM_DEBUG(
4849           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4850     else
4851       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4852                         << "count.\n");
4853 
    // Bail out if runtime checks are required; they are undesirable when
    // optimizing for size.
4856     if (runtimeChecksRequired())
4857       return None;
4858     break;
4859   }
4860 
  // Now try to fold the loop tail by masking.
4862 
4863   // Invalidate interleave groups that require an epilogue if we can't mask
4864   // the interleave-group.
4865   if (!useMaskedInterleavedAccesses(TTI))
4866     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4867 
4868   unsigned MaxVF = computeFeasibleMaxVF(TC);
4869   if (TC > 0 && TC % MaxVF == 0) {
4870     // Accept MaxVF if we do not have a tail.
4871     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4872     return MaxVF;
4873   }
4874 
4875   // If we don't know the precise trip count, or if the trip count that we
4876   // found modulo the vectorization factor is not zero, try to fold the tail
4877   // by masking.
4878   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4879   if (Legal->prepareToFoldTailByMasking()) {
4880     FoldTailByMasking = true;
4881     return MaxVF;
4882   }
4883 
4884   if (TC == 0) {
4885     reportVectorizationFailure(
4886         "Unable to calculate the loop count due to complex control flow",
4887         "unable to calculate the loop count due to complex control flow",
4888         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4889     return None;
4890   }
4891 
4892   reportVectorizationFailure(
4893       "Cannot optimize for size and vectorize at the same time.",
4894       "cannot optimize for size and vectorize at the same time. "
4895       "Enable vectorization of this loop with '#pragma clang loop "
4896       "vectorize(enable)' when compiling with -Os/-Oz",
4897       "NoTailLoopWithOptForSize", ORE, TheLoop);
4898   return None;
4899 }
4900 
4901 unsigned
4902 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4903   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4904   unsigned SmallestType, WidestType;
4905   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4906   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4907 
  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
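  // For example, if the most restrictive access is an i32 and at most 4 such
  // elements can safely be processed per vector iteration, MaxSafeRegisterWidth
  // is 4 * 32 = 128 bits.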
4912   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4913 
4914   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4915 
4916   unsigned MaxVectorSize = WidestRegister / WidestType;
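  // For example, a 256-bit widest register with an i32 widest type gives
  // MaxVectorSize = 256 / 32 = 8 lanes.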
4917 
4918   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4919                     << " / " << WidestType << " bits.\n");
4920   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4921                     << WidestRegister << " bits.\n");
4922 
4923   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4924                                  " into one vector!");
4925   if (MaxVectorSize == 0) {
4926     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4927     MaxVectorSize = 1;
4928     return MaxVectorSize;
4929   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4930              isPowerOf2_32(ConstTripCount)) {
4931     // We need to clamp the VF to be the ConstTripCount. There is no point in
4932     // choosing a higher viable VF as done in the loop below.
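    // For example, a constant trip count of 4 with a MaxVectorSize of 8 clamps
    // the VF to 4.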
4933     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4934                       << ConstTripCount << "\n");
4935     MaxVectorSize = ConstTripCount;
4936     return MaxVectorSize;
4937   }
4938 
4939   unsigned MaxVF = MaxVectorSize;
4940   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4941       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4942     // Collect all viable vectorization factors larger than the default MaxVF
4943     // (i.e. MaxVectorSize).
4944     SmallVector<unsigned, 8> VFs;
4945     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4946     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4947       VFs.push_back(VS);
4948 
4949     // For each VF calculate its register usage.
4950     auto RUs = calculateRegisterUsage(VFs);
4951 
4952     // Select the largest VF which doesn't require more registers than existing
4953     // ones.
4954     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4955     for (int i = RUs.size() - 1; i >= 0; --i) {
4956       if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4957         MaxVF = VFs[i];
4958         break;
4959       }
4960     }
4961     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4962       if (MaxVF < MinVF) {
4963         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4964                           << ") with target's minimum: " << MinVF << '\n');
4965         MaxVF = MinVF;
4966       }
4967     }
4968   }
4969   return MaxVF;
4970 }
4971 
4972 VectorizationFactor
4973 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4974   float Cost = expectedCost(1).first;
4975   const float ScalarCost = Cost;
4976   unsigned Width = 1;
4977   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4978 
4979   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4980   if (ForceVectorization && MaxVF > 1) {
4981     // Ignore scalar width, because the user explicitly wants vectorization.
4982     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4983     // evaluation.
4984     Cost = std::numeric_limits<float>::max();
4985   }
4986 
4987   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we divide the cost of the vector loop by the vector width.
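    // For example, a vector loop of width 4 with a total cost of 20 competes
    // as 20 / 4 = 5 against the scalar loop cost.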
4991     VectorizationCostTy C = expectedCost(i);
4992     float VectorCost = C.first / (float)i;
4993     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4994                       << " costs: " << (int)VectorCost << ".\n");
4995     if (!C.second && !ForceVectorization) {
4996       LLVM_DEBUG(
4997           dbgs() << "LV: Not considering vector loop of width " << i
4998                  << " because it will not generate any vector instructions.\n");
4999       continue;
5000     }
5001     if (VectorCost < Cost) {
5002       Cost = VectorCost;
5003       Width = i;
5004     }
5005   }
5006 
5007   if (!EnableCondStoresVectorization && NumPredStores) {
5008     reportVectorizationFailure("There are conditional stores.",
5009         "store that is conditionally executed prevents vectorization",
5010         "ConditionalStore", ORE, TheLoop);
5011     Width = 1;
5012     Cost = ScalarCost;
5013   }
5014 
5015   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5016              << "LV: Vectorization seems to be not beneficial, "
5017              << "but was forced by a user.\n");
5018   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5019   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5020   return Factor;
5021 }
5022 
5023 std::pair<unsigned, unsigned>
5024 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5025   unsigned MinWidth = -1U;
5026   unsigned MaxWidth = 8;
5027   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5028 
5029   // For each block.
5030   for (BasicBlock *BB : TheLoop->blocks()) {
5031     // For each instruction in the loop.
5032     for (Instruction &I : BB->instructionsWithoutDebug()) {
5033       Type *T = I.getType();
5034 
5035       // Skip ignored values.
5036       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5037         continue;
5038 
5039       // Only examine Loads, Stores and PHINodes.
5040       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5041         continue;
5042 
5043       // Examine PHI nodes that are reduction variables. Update the type to
5044       // account for the recurrence type.
5045       if (auto *PN = dyn_cast<PHINode>(&I)) {
5046         if (!Legal->isReductionVariable(PN))
5047           continue;
5048         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5049         T = RdxDesc.getRecurrenceType();
5050       }
5051 
5052       // Examine the stored values.
5053       if (auto *ST = dyn_cast<StoreInst>(&I))
5054         T = ST->getValueOperand()->getType();
5055 
5056       // Ignore loaded pointer types and stored pointer types that are not
5057       // vectorizable.
5058       //
5059       // FIXME: The check here attempts to predict whether a load or store will
5060       //        be vectorized. We only know this for certain after a VF has
5061       //        been selected. Here, we assume that if an access can be
5062       //        vectorized, it will be. We should also look at extending this
5063       //        optimization to non-pointer types.
5064       //
5065       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5066           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5067         continue;
5068 
5069       MinWidth = std::min(MinWidth,
5070                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5071       MaxWidth = std::max(MaxWidth,
5072                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5073     }
5074   }
5075 
5076   return {MinWidth, MaxWidth};
5077 }
5078 
5079 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5080                                                            unsigned LoopCost) {
5081   // -- The interleave heuristics --
5082   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5083   // There are many micro-architectural considerations that we can't predict
5084   // at this level. For example, frontend pressure (on decode or fetch) due to
5085   // code size, or the number and capabilities of the execution ports.
5086   //
5087   // We use the following heuristics to select the interleave count:
5088   // 1. If the code has reductions, then we interleave to break the cross
5089   // iteration dependency.
5090   // 2. If the loop is really small, then we interleave to reduce the loop
5091   // overhead.
5092   // 3. We don't interleave if we think that we will spill registers to memory
5093   // due to the increased register pressure.
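  // For example, interleaving a summation by two (heuristic 1) keeps two
  // partial sums in flight and combines them after the loop, hiding the
  // latency of the cross-iteration add.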
5094 
5095   if (!isScalarEpilogueAllowed())
5096     return 1;
5097 
  // The maximum safe dependence distance was already used to limit the
  // vectorization factor; do not interleave further.
5099   if (Legal->getMaxSafeDepDistBytes() != -1U)
5100     return 1;
5101 
5102   // Do not interleave loops with a relatively small trip count.
5103   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5104   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
5105     return 1;
5106 
5107   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
5108   LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5109                     << " registers\n");
5110 
5111   if (VF == 1) {
5112     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5113       TargetNumRegisters = ForceTargetNumScalarRegs;
5114   } else {
5115     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5116       TargetNumRegisters = ForceTargetNumVectorRegs;
5117   }
5118 
5119   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by this value below, so make sure it is at least one, i.e.
  // assume that at least one instruction uses at least one register.
5122   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5123 
  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that are
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. The result is rounded down, if necessary, to a
  // power of two. We want a power-of-two interleave count to simplify any
  // addressing operations and alignment considerations.
  // A power-of-two interleave count also ensures that the induction variable
  // of the vector loop wraps to zero when the tail is folded by masking; this
  // currently happens under OptForSize, in which case IC is set to 1 above.
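  // For example, with 16 available registers, 2 loop-invariant values and a
  // maximum local usage of 3 registers, IC = PowerOf2Floor((16 - 2) / 3) = 4.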
5135   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5136                               R.MaxLocalUsers);
5137 
5138   // Don't count the induction variable as interleaved.
5139   if (EnableIndVarRegisterHeur)
5140     IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5141                        std::max(1U, (R.MaxLocalUsers - 1)));
5142 
5143   // Clamp the interleave ranges to reasonable counts.
5144   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5145 
5146   // Check if the user has overridden the max.
5147   if (VF == 1) {
5148     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5149       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5150   } else {
5151     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5152       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5153   }
5154 
5155   // If the trip count is constant, limit the interleave count to be less than
5156   // the trip count divided by VF.
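  // For example, a constant trip count of 16 at VF = 4 caps the interleave
  // count at 16 / 4 = 4.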
5157   if (TC > 0) {
5158     assert(TC >= VF && "VF exceeds trip count?");
5159     if ((TC / VF) < MaxInterleaveCount)
5160       MaxInterleaveCount = (TC / VF);
5161   }
5162 
5163   // If we did not calculate the cost for VF (because the user selected the VF)
5164   // then we calculate the cost of VF here.
5165   if (LoopCost == 0)
5166     LoopCost = expectedCost(VF).first;
5167 
5168   assert(LoopCost && "Non-zero loop cost expected");
5169 
  // Clamp the calculated IC to be between 1 and the maximum interleave count
  // that the target and trip count allow.
5172   if (IC > MaxInterleaveCount)
5173     IC = MaxInterleaveCount;
5174   else if (IC < 1)
5175     IC = 1;
5176 
5177   // Interleave if we vectorized this loop and there is a reduction that could
5178   // benefit from interleaving.
5179   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5180     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5181     return IC;
5182   }
5183 
5184   // Note that if we've already vectorized the loop we will have done the
5185   // runtime check and so interleaving won't require further checks.
5186   bool InterleavingRequiresRuntimePointerCheck =
5187       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5188 
5189   // We want to interleave small loops in order to reduce the loop overhead and
5190   // potentially expose ILP opportunities.
5191   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5192   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5193     // We assume that the cost overhead is 1 and we use the cost model
5194     // to estimate the cost of the loop and interleave until the cost of the
5195     // loop overhead is about 5% of the cost of the loop.
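    // For example, if SmallLoopCost were 20 and the loop cost 6, the interleave
    // count would be limited to PowerOf2Floor(20 / 6) = 2.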
5196     unsigned SmallIC =
5197         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5198 
5199     // Interleave until store/load ports (estimated by max interleave count) are
5200     // saturated.
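    // For example, an IC of 8 in a loop with 2 stores and 1 load gives
    // StoresIC = 4 and LoadsIC = 8.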
5201     unsigned NumStores = Legal->getNumStores();
5202     unsigned NumLoads = Legal->getNumLoads();
5203     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5204     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5205 
5206     // If we have a scalar reduction (vector reductions are already dealt with
5207     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit it, by default to 2, so
    // that the critical path only grows by one reduction operation.
5210     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5211       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5212       SmallIC = std::min(SmallIC, F);
5213       StoresIC = std::min(StoresIC, F);
5214       LoadsIC = std::min(LoadsIC, F);
5215     }
5216 
5217     if (EnableLoadStoreRuntimeInterleave &&
5218         std::max(StoresIC, LoadsIC) > SmallIC) {
5219       LLVM_DEBUG(
5220           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5221       return std::max(StoresIC, LoadsIC);
5222     }
5223 
5224     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5225     return SmallIC;
5226   }
5227 
5228   // Interleave if this is a large loop (small loops are already dealt with by
5229   // this point) that could benefit from interleaving.
5230   bool HasReductions = !Legal->getReductionVars()->empty();
5231   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5232     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5233     return IC;
5234   }
5235 
5236   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5237   return 1;
5238 }
5239 
5240 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5241 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5242   // This function calculates the register usage by measuring the highest number
5243   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5245   // assign a number to each instruction. We use RPO to ensure that defs are
5246   // met before their users. We assume that each instruction that has in-loop
5247   // users starts an interval. We record every time that an in-loop value is
5248   // used, so we have a list of the first and last occurrences of each
5249   // instruction. Next, we transpose this data structure into a multi map that
5250   // holds the list of intervals that *end* at a specific location. This multi
5251   // map allows us to perform a linear search. We scan the instructions linearly
5252   // and record each time that a new interval starts, by placing it in a set.
5253   // If we find this value in the multi-map then we remove it from the set.
5254   // The max register usage is the maximum size of the set.
5255   // We also search for instructions that are defined outside the loop, but are
5256   // used inside the loop. We need this number separately from the max-interval
5257   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
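  // For example, a value defined near the top of the loop body whose last use
  // is near the bottom keeps its interval open across all the instructions in
  // between and therefore contributes to the usage count at each of them.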
5259   LoopBlocksDFS DFS(TheLoop);
5260   DFS.perform(LI);
5261 
5262   RegisterUsage RU;
5263 
5264   // Each 'key' in the map opens a new interval. The values
5265   // of the map are the index of the 'last seen' usage of the
5266   // instruction that is the key.
5267   using IntervalMap = DenseMap<Instruction *, unsigned>;
5268 
5269   // Maps instruction to its index.
5270   SmallVector<Instruction *, 64> IdxToInstr;
5271   // Marks the end of each interval.
5272   IntervalMap EndPoint;
5273   // Saves the list of instruction indices that are used in the loop.
5274   SmallPtrSet<Instruction *, 8> Ends;
5275   // Saves the list of values that are used in the loop but are
5276   // defined outside the loop, such as arguments and constants.
5277   SmallPtrSet<Value *, 8> LoopInvariants;
5278 
5279   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5280     for (Instruction &I : BB->instructionsWithoutDebug()) {
5281       IdxToInstr.push_back(&I);
5282 
5283       // Save the end location of each USE.
5284       for (Value *U : I.operands()) {
5285         auto *Instr = dyn_cast<Instruction>(U);
5286 
5287         // Ignore non-instruction values such as arguments, constants, etc.
5288         if (!Instr)
5289           continue;
5290 
5291         // If this instruction is outside the loop then record it and continue.
5292         if (!TheLoop->contains(Instr)) {
5293           LoopInvariants.insert(Instr);
5294           continue;
5295         }
5296 
5297         // Overwrite previous end points.
5298         EndPoint[Instr] = IdxToInstr.size();
5299         Ends.insert(Instr);
5300       }
5301     }
5302   }
5303 
5304   // Saves the list of intervals that end with the index in 'key'.
5305   using InstrList = SmallVector<Instruction *, 2>;
5306   DenseMap<unsigned, InstrList> TransposeEnds;
5307 
5308   // Transpose the EndPoints to a list of values that end at each index.
5309   for (auto &Interval : EndPoint)
5310     TransposeEnds[Interval.second].push_back(Interval.first);
5311 
5312   SmallPtrSet<Instruction *, 8> OpenIntervals;
5313 
5314   // Get the size of the widest register.
5315   unsigned MaxSafeDepDist = -1U;
5316   if (Legal->getMaxSafeDepDistBytes() != -1U)
5317     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5318   unsigned WidestRegister =
5319       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5320   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5321 
5322   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5323   SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5324 
5325   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5326 
5327   // A lambda that gets the register usage for the given type and VF.
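  // For example, with a 128-bit widest register, an i32 value at VF = 8
  // occupies max(1, 8 * 32 / 128) = 2 registers.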
5328   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5329     if (Ty->isTokenTy())
5330       return 0U;
5331     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5332     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5333   };
5334 
5335   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5336     Instruction *I = IdxToInstr[i];
5337 
5338     // Remove all of the instructions that end at this location.
5339     InstrList &List = TransposeEnds[i];
5340     for (Instruction *ToRemove : List)
5341       OpenIntervals.erase(ToRemove);
5342 
5343     // Ignore instructions that are never used within the loop.
5344     if (Ends.find(I) == Ends.end())
5345       continue;
5346 
5347     // Skip ignored values.
5348     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5349       continue;
5350 
5351     // For each VF find the maximum usage of registers.
5352     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5353       if (VFs[j] == 1) {
5354         MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5355         continue;
5356       }
5357       collectUniformsAndScalars(VFs[j]);
5358       // Count the number of live intervals.
5359       unsigned RegUsage = 0;
5360       for (auto Inst : OpenIntervals) {
5361         // Skip ignored values for VF > 1.
5362         if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5363             isScalarAfterVectorization(Inst, VFs[j]))
5364           continue;
5365         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5366       }
5367       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5368     }
5369 
5370     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5371                       << OpenIntervals.size() << '\n');
5372 
5373     // Add the current instruction to the list of open intervals.
5374     OpenIntervals.insert(I);
5375   }
5376 
5377   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5378     unsigned Invariant = 0;
5379     if (VFs[i] == 1)
5380       Invariant = LoopInvariants.size();
5381     else {
5382       for (auto Inst : LoopInvariants)
5383         Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5384     }
5385 
5386     LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5387     LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5388     LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5389                       << '\n');
5390 
5391     RU.LoopInvariantRegs = Invariant;
5392     RU.MaxLocalUsers = MaxUsages[i];
5393     RUs[i] = RU;
5394   }
5395 
5396   return RUs;
5397 }
5398 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5400   // TODO: Cost model for emulated masked load/store is completely
5401   // broken. This hack guides the cost model to use an artificially
5402   // high enough value to practically disable vectorization with such
5403   // operations, except where previously deployed legality hack allowed
5404   // using very low cost values. This is to avoid regressions coming simply
5405   // from moving "masked load/store" check from legality to cost model.
5406   // Masked Load/Gather emulation was previously never allowed.
5407   // Limited number of Masked Store/Scatter emulation was allowed.
5408   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5409   return isa<LoadInst>(I) ||
5410          (isa<StoreInst>(I) &&
5411           NumPredStores > NumberOfStoresToPredicate);
5412 }
5413 
5414 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5415   // If we aren't vectorizing the loop, or if we've already collected the
5416   // instructions to scalarize, there's nothing to do. Collection may already
5417   // have occurred if we have a user-selected VF and are now computing the
5418   // expected cost for interleaving.
5419   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5420     return;
5421 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5423   // not profitable to scalarize any instructions, the presence of VF in the
5424   // map will indicate that we've analyzed it already.
5425   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5426 
5427   // Find all the instructions that are scalar with predication in the loop and
5428   // determine if it would be better to not if-convert the blocks they are in.
5429   // If so, we also record the instructions to scalarize.
5430   for (BasicBlock *BB : TheLoop->blocks()) {
5431     if (!blockNeedsPredication(BB))
5432       continue;
5433     for (Instruction &I : *BB)
5434       if (isScalarWithPredication(&I)) {
5435         ScalarCostsTy ScalarCosts;
        // Do not apply the discount logic if the hacked cost is needed
        // for emulated masked memrefs.
5438         if (!useEmulatedMaskMemRefHack(&I) &&
5439             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5440           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5441         // Remember that BB will remain after vectorization.
5442         PredicatedBBsAfterVectorization.insert(BB);
5443       }
5444   }
5445 }
5446 
5447 int LoopVectorizationCostModel::computePredInstDiscount(
5448     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5449     unsigned VF) {
5450   assert(!isUniformAfterVectorization(PredInst, VF) &&
5451          "Instruction marked uniform-after-vectorization will be predicated");
5452 
5453   // Initialize the discount to zero, meaning that the scalar version and the
5454   // vector version cost the same.
5455   int Discount = 0;
5456 
5457   // Holds instructions to analyze. The instructions we visit are mapped in
5458   // ScalarCosts. Those instructions are the ones that would be scalarized if
5459   // we find that the scalar version costs less.
5460   SmallVector<Instruction *, 8> Worklist;
5461 
5462   // Returns true if the given instruction can be scalarized.
5463   auto canBeScalarized = [&](Instruction *I) -> bool {
5464     // We only attempt to scalarize instructions forming a single-use chain
5465     // from the original predicated block that would otherwise be vectorized.
5466     // Although not strictly necessary, we give up on instructions we know will
5467     // already be scalar to avoid traversing chains that are unlikely to be
5468     // beneficial.
5469     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5470         isScalarAfterVectorization(I, VF))
5471       return false;
5472 
5473     // If the instruction is scalar with predication, it will be analyzed
5474     // separately. We ignore it within the context of PredInst.
5475     if (isScalarWithPredication(I))
5476       return false;
5477 
5478     // If any of the instruction's operands are uniform after vectorization,
5479     // the instruction cannot be scalarized. This prevents, for example, a
5480     // masked load from being scalarized.
5481     //
5482     // We assume we will only emit a value for lane zero of an instruction
5483     // marked uniform after vectorization, rather than VF identical values.
5484     // Thus, if we scalarize an instruction that uses a uniform, we would
5485     // create uses of values corresponding to the lanes we aren't emitting code
5486     // for. This behavior can be changed by allowing getScalarValue to clone
5487     // the lane zero values for uniforms rather than asserting.
5488     for (Use &U : I->operands())
5489       if (auto *J = dyn_cast<Instruction>(U.get()))
5490         if (isUniformAfterVectorization(J, VF))
5491           return false;
5492 
5493     // Otherwise, we can scalarize the instruction.
5494     return true;
5495   };
5496 
5497   // Compute the expected cost discount from scalarizing the entire expression
5498   // feeding the predicated instruction. We currently only consider expressions
5499   // that are single-use instruction chains.
5500   Worklist.push_back(PredInst);
5501   while (!Worklist.empty()) {
5502     Instruction *I = Worklist.pop_back_val();
5503 
5504     // If we've already analyzed the instruction, there's nothing to do.
5505     if (ScalarCosts.find(I) != ScalarCosts.end())
5506       continue;
5507 
5508     // Compute the cost of the vector instruction. Note that this cost already
5509     // includes the scalarization overhead of the predicated instruction.
5510     unsigned VectorCost = getInstructionCost(I, VF).first;
5511 
5512     // Compute the cost of the scalarized instruction. This cost is the cost of
5513     // the instruction as if it wasn't if-converted and instead remained in the
5514     // predicated block. We will scale this cost by block probability after
5515     // computing the scalarization overhead.
5516     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5517 
5518     // Compute the scalarization overhead of needed insertelement instructions
5519     // and phi nodes.
5520     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5521       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5522                                                  true, false);
5523       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5524     }
5525 
5526     // Compute the scalarization overhead of needed extractelement
5527     // instructions. For each of the instruction's operands, if the operand can
5528     // be scalarized, add it to the worklist; otherwise, account for the
5529     // overhead.
5530     for (Use &U : I->operands())
5531       if (auto *J = dyn_cast<Instruction>(U.get())) {
5532         assert(VectorType::isValidElementType(J->getType()) &&
5533                "Instruction has non-scalar type");
5534         if (canBeScalarized(J))
5535           Worklist.push_back(J);
5536         else if (needsExtract(J, VF))
5537           ScalarCost += TTI.getScalarizationOverhead(
                              ToVectorTy(J->getType(), VF), false, true);
5539       }
5540 
5541     // Scale the total scalar cost by block probability.
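    // For example, assuming a reciprocal block probability of 2 (the block
    // executes about half the time), the accumulated scalar cost is halved.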
5542     ScalarCost /= getReciprocalPredBlockProb();
5543 
5544     // Compute the discount. A non-negative discount means the vector version
5545     // of the instruction costs more, and scalarizing would be beneficial.
5546     Discount += VectorCost - ScalarCost;
5547     ScalarCosts[I] = ScalarCost;
5548   }
5549 
5550   return Discount;
5551 }
5552 
5553 LoopVectorizationCostModel::VectorizationCostTy
5554 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5555   VectorizationCostTy Cost;
5556 
5557   // For each block.
5558   for (BasicBlock *BB : TheLoop->blocks()) {
5559     VectorizationCostTy BlockCost;
5560 
5561     // For each instruction in the old loop.
5562     for (Instruction &I : BB->instructionsWithoutDebug()) {
5563       // Skip ignored values.
5564       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5565           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5566         continue;
5567 
5568       VectorizationCostTy C = getInstructionCost(&I, VF);
5569 
5570       // Check if we should override the cost.
5571       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5572         C.first = ForceTargetInstructionCost;
5573 
5574       BlockCost.first += C.first;
5575       BlockCost.second |= C.second;
5576       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5577                         << " for VF " << VF << " For instruction: " << I
5578                         << '\n');
5579     }
5580 
5581     // If we are vectorizing a predicated block, it will have been
5582     // if-converted. This means that the block's instructions (aside from
5583     // stores and instructions that may divide by zero) will now be
5584     // unconditionally executed. For the scalar case, we may not always execute
5585     // the predicated block. Thus, scale the block's cost by the probability of
5586     // executing it.
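    // For example, assuming a reciprocal block probability of 2, a predicated
    // block whose instructions cost 10 contributes only 5 to the loop cost.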
5587     if (VF == 1 && blockNeedsPredication(BB))
5588       BlockCost.first /= getReciprocalPredBlockProb();
5589 
5590     Cost.first += BlockCost.first;
5591     Cost.second |= BlockCost.second;
5592   }
5593 
5594   return Cost;
5595 }
5596 
5597 /// Gets Address Access SCEV after verifying that the access pattern
5598 /// is loop invariant except the induction variable dependence.
5599 ///
5600 /// This SCEV can be sent to the Target in order to estimate the address
5601 /// calculation cost.
5602 static const SCEV *getAddressAccessSCEV(
5603               Value *Ptr,
5604               LoopVectorizationLegality *Legal,
5605               PredicatedScalarEvolution &PSE,
5606               const Loop *TheLoop) {
5607 
5608   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5609   if (!Gep)
5610     return nullptr;
5611 
5612   // We are looking for a gep with all loop invariant indices except for one
5613   // which should be an induction variable.
5614   auto SE = PSE.getSE();
5615   unsigned NumOperands = Gep->getNumOperands();
5616   for (unsigned i = 1; i < NumOperands; ++i) {
5617     Value *Opd = Gep->getOperand(i);
5618     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5619         !Legal->isInductionVariable(Opd))
5620       return nullptr;
5621   }
5622 
5623   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5624   return PSE.getSCEV(Ptr);
5625 }
5626 
5627 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5628   return Legal->hasStride(I->getOperand(0)) ||
5629          Legal->hasStride(I->getOperand(1));
5630 }
5631 
5632 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5633                                                                  unsigned VF) {
5634   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5635   Type *ValTy = getMemInstValueType(I);
5636   auto SE = PSE.getSE();
5637 
5638   unsigned Alignment = getLoadStoreAlignment(I);
5639   unsigned AS = getLoadStoreAddressSpace(I);
5640   Value *Ptr = getLoadStorePointerOperand(I);
5641   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5642 
5643   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5645   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5646 
5647   // Get the cost of the scalar memory instruction and address computation.
5648   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5649 
5650   // Don't pass *I here, since it is scalar but will actually be part of a
5651   // vectorized loop where the user of it is a vectorized instruction.
5652   Cost += VF *
5653           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5654                               AS);
5655 
5656   // Get the overhead of the extractelement and insertelement instructions
5657   // we might create due to scalarization.
5658   Cost += getScalarizationOverhead(I, VF);
5659 
5660   // If we have a predicated store, it may not be executed for each vector
5661   // lane. Scale the cost by the probability of executing the predicated
5662   // block.
5663   if (isPredicatedInst(I)) {
5664     Cost /= getReciprocalPredBlockProb();
5665 
5666     if (useEmulatedMaskMemRefHack(I))
5667       // Artificially setting to a high enough value to practically disable
5668       // vectorization with such operations.
5669       Cost = 3000000;
5670   }
5671 
5672   return Cost;
5673 }
5674 
5675 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5676                                                              unsigned VF) {
5677   Type *ValTy = getMemInstValueType(I);
5678   Type *VectorTy = ToVectorTy(ValTy, VF);
5679   unsigned Alignment = getLoadStoreAlignment(I);
5680   Value *Ptr = getLoadStorePointerOperand(I);
5681   unsigned AS = getLoadStoreAddressSpace(I);
5682   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5683 
5684   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5685          "Stride should be 1 or -1 for consecutive memory access");
5686   unsigned Cost = 0;
5687   if (Legal->isMaskRequired(I))
5688     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5689   else
5690     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5691 
5692   bool Reverse = ConsecutiveStride < 0;
5693   if (Reverse)
5694     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5695   return Cost;
5696 }
5697 
5698 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5699                                                          unsigned VF) {
5700   Type *ValTy = getMemInstValueType(I);
5701   Type *VectorTy = ToVectorTy(ValTy, VF);
5702   unsigned Alignment = getLoadStoreAlignment(I);
5703   unsigned AS = getLoadStoreAddressSpace(I);
5704   if (isa<LoadInst>(I)) {
5705     return TTI.getAddressComputationCost(ValTy) +
5706            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5707            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5708   }
5709   StoreInst *SI = cast<StoreInst>(I);
5710 
5711   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5712   return TTI.getAddressComputationCost(ValTy) +
5713          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5714          (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5715                                                Instruction::ExtractElement,
5716                                                VectorTy, VF - 1));
5717 }
5718 
5719 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5720                                                           unsigned VF) {
5721   Type *ValTy = getMemInstValueType(I);
5722   Type *VectorTy = ToVectorTy(ValTy, VF);
5723   unsigned Alignment = getLoadStoreAlignment(I);
5724   Value *Ptr = getLoadStorePointerOperand(I);
5725 
5726   return TTI.getAddressComputationCost(VectorTy) +
5727          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5728                                     Legal->isMaskRequired(I), Alignment);
5729 }
5730 
5731 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5732                                                             unsigned VF) {
5733   Type *ValTy = getMemInstValueType(I);
5734   Type *VectorTy = ToVectorTy(ValTy, VF);
5735   unsigned AS = getLoadStoreAddressSpace(I);
5736 
5737   auto Group = getInterleavedAccessGroup(I);
5738   assert(Group && "Fail to get an interleaved access group.");
5739 
5740   unsigned InterleaveFactor = Group->getFactor();
5741   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5742 
5743   // Holds the indices of existing members in an interleaved load group.
5744   // An interleaved store group doesn't need this as it doesn't allow gaps.
5745   SmallVector<unsigned, 4> Indices;
5746   if (isa<LoadInst>(I)) {
5747     for (unsigned i = 0; i < InterleaveFactor; i++)
5748       if (Group->getMember(i))
5749         Indices.push_back(i);
5750   }
5751 
5752   // Calculate the cost of the whole interleaved group.
5753   bool UseMaskForGaps =
5754       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5755   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5756       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5757       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5758 
5759   if (Group->isReverse()) {
5760     // TODO: Add support for reversed masked interleaved access.
5761     assert(!Legal->isMaskRequired(I) &&
5762            "Reverse masked interleaved access not supported.");
5763     Cost += Group->getNumMembers() *
5764             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5765   }
5766   return Cost;
5767 }
5768 
5769 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5770                                                               unsigned VF) {
5771   // Calculate scalar cost only. Vectorization cost should be ready at this
5772   // moment.
5773   if (VF == 1) {
5774     Type *ValTy = getMemInstValueType(I);
5775     unsigned Alignment = getLoadStoreAlignment(I);
5776     unsigned AS = getLoadStoreAddressSpace(I);
5777 
5778     return TTI.getAddressComputationCost(ValTy) +
5779            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5780   }
5781   return getWideningCost(I, VF);
5782 }
5783 
5784 LoopVectorizationCostModel::VectorizationCostTy
5785 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5786   // If we know that this instruction will remain uniform, check the cost of
5787   // the scalar version.
5788   if (isUniformAfterVectorization(I, VF))
5789     VF = 1;
5790 
5791   if (VF > 1 && isProfitableToScalarize(I, VF))
5792     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5793 
5794   // Forced scalars do not have any scalarization overhead.
5795   auto ForcedScalar = ForcedScalars.find(VF);
5796   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5797     auto InstSet = ForcedScalar->second;
5798     if (InstSet.find(I) != InstSet.end())
5799       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5800   }
5801 
5802   Type *VectorTy;
5803   unsigned C = getInstructionCost(I, VF, VectorTy);
5804 
5805   bool TypeNotScalarized =
5806       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
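  // For example, a <4 x i32> that fits in a single 128-bit register is one
  // part, so the type is not considered scalarized at VF = 4.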
5807   return VectorizationCostTy(C, TypeNotScalarized);
5808 }
5809 
5810 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5811                                                               unsigned VF) {
5812 
5813   if (VF == 1)
5814     return 0;
5815 
5816   unsigned Cost = 0;
5817   Type *RetTy = ToVectorTy(I->getType(), VF);
5818   if (!RetTy->isVoidTy() &&
5819       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5820     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5821 
5822   // Some targets keep addresses scalar.
5823   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5824     return Cost;
5825 
5826   // Some targets support efficient element stores.
5827   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5828     return Cost;
5829 
5830   // Collect operands to consider.
5831   CallInst *CI = dyn_cast<CallInst>(I);
5832   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5833 
5834   // Skip operands that do not require extraction/scalarization and do not incur
5835   // any overhead.
5836   return Cost + TTI.getOperandsScalarizationOverhead(
5837                     filterExtractingOperands(Ops, VF), VF);
5838 }
5839 
5840 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5841   if (VF == 1)
5842     return;
5843   NumPredStores = 0;
5844   for (BasicBlock *BB : TheLoop->blocks()) {
5845     // For each instruction in the old loop.
5846     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
5848       if (!Ptr)
5849         continue;
5850 
5851       // TODO: We should generate better code and update the cost model for
5852       // predicated uniform stores. Today they are treated as any other
5853       // predicated store (see added test cases in
5854       // invariant-store-vectorization.ll).
5855       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5856         NumPredStores++;
5857 
5858       if (Legal->isUniform(Ptr) &&
5859           // Conditional loads and stores should be scalarized and predicated.
5860           // isScalarWithPredication cannot be used here since masked
5861           // gather/scatters are not considered scalar with predication.
5862           !Legal->blockNeedsPredication(I.getParent())) {
5863         // TODO: Avoid replicating loads and stores instead of
5864         // relying on instcombine to remove them.
5865         // Load: Scalar load + broadcast
5866         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5867         unsigned Cost = getUniformMemOpCost(&I, VF);
5868         setWideningDecision(&I, VF, CM_Scalarize, Cost);
5869         continue;
5870       }
5871 
5872       // We assume that widening is the best solution when possible.
5873       if (memoryInstructionCanBeWidened(&I, VF)) {
5874         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5877         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5878                "Expected consecutive stride.");
5879         InstWidening Decision =
5880             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5881         setWideningDecision(&I, VF, Decision, Cost);
5882         continue;
5883       }
5884 
5885       // Choose between Interleaving, Gather/Scatter or Scalarization.
5886       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5887       unsigned NumAccesses = 1;
5888       if (isAccessInterleaved(&I)) {
5889         auto Group = getInterleavedAccessGroup(&I);
5890         assert(Group && "Fail to get an interleaved access group.");
5891 
5892         // Make one decision for the whole group.
5893         if (getWideningDecision(&I, VF) != CM_Unknown)
5894           continue;
5895 
5896         NumAccesses = Group->getNumMembers();
5897         if (interleavedAccessCanBeWidened(&I, VF))
5898           InterleaveCost = getInterleaveGroupCost(&I, VF);
5899       }
5900 
5901       unsigned GatherScatterCost =
5902           isLegalGatherOrScatter(&I)
5903               ? getGatherScatterCost(&I, VF) * NumAccesses
5904               : std::numeric_limits<unsigned>::max();
5905 
5906       unsigned ScalarizationCost =
5907           getMemInstScalarizationCost(&I, VF) * NumAccesses;
5908 
      // Choose the better solution for the current VF, record the decision,
      // and use it during vectorization.
5911       unsigned Cost;
5912       InstWidening Decision;
5913       if (InterleaveCost <= GatherScatterCost &&
5914           InterleaveCost < ScalarizationCost) {
5915         Decision = CM_Interleave;
5916         Cost = InterleaveCost;
5917       } else if (GatherScatterCost < ScalarizationCost) {
5918         Decision = CM_GatherScatter;
5919         Cost = GatherScatterCost;
5920       } else {
5921         Decision = CM_Scalarize;
5922         Cost = ScalarizationCost;
5923       }
      // If the instruction belongs to an interleave group, the whole group
5925       // receives the same decision. The whole group receives the cost, but
5926       // the cost will actually be assigned to one instruction.
5927       if (auto Group = getInterleavedAccessGroup(&I))
5928         setWideningDecision(Group, VF, Decision, Cost);
5929       else
5930         setWideningDecision(&I, VF, Decision, Cost);
5931     }
5932   }
5933 
5934   // Make sure that any load of address and any other address computation
5935   // remains scalar unless there is gather/scatter support. This avoids
5936   // inevitable extracts into address registers, and also has the benefit of
5937   // activating LSR more, since that pass can't optimize vectorized
5938   // addresses.
5939   if (TTI.prefersVectorizedAddressing())
5940     return;
5941 
5942   // Start with all scalar pointer uses.
5943   SmallPtrSet<Instruction *, 8> AddrDefs;
5944   for (BasicBlock *BB : TheLoop->blocks())
5945     for (Instruction &I : *BB) {
5946       Instruction *PtrDef =
5947         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5948       if (PtrDef && TheLoop->contains(PtrDef) &&
5949           getWideningDecision(&I, VF) != CM_GatherScatter)
5950         AddrDefs.insert(PtrDef);
5951     }
5952 
5953   // Add all instructions used to generate the addresses.
5954   SmallVector<Instruction *, 4> Worklist;
5955   for (auto *I : AddrDefs)
5956     Worklist.push_back(I);
5957   while (!Worklist.empty()) {
5958     Instruction *I = Worklist.pop_back_val();
5959     for (auto &Op : I->operands())
5960       if (auto *InstOp = dyn_cast<Instruction>(Op))
5961         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5962             AddrDefs.insert(InstOp).second)
5963           Worklist.push_back(InstOp);
5964   }
5965 
5966   for (auto *I : AddrDefs) {
5967     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded register is involved in an address computation, it is instead
      // changed here when we know this is the case.
5972       InstWidening Decision = getWideningDecision(I, VF);
5973       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5974         // Scalarize a widened load of address.
5975         setWideningDecision(I, VF, CM_Scalarize,
5976                             (VF * getMemoryInstructionCost(I, 1)));
5977       else if (auto Group = getInterleavedAccessGroup(I)) {
5978         // Scalarize an interleave group of address loads.
5979         for (unsigned I = 0; I < Group->getFactor(); ++I) {
5980           if (Instruction *Member = Group->getMember(I))
5981             setWideningDecision(Member, VF, CM_Scalarize,
5982                                 (VF * getMemoryInstructionCost(Member, 1)));
5983         }
5984       }
5985     } else
      // Make sure I gets scalarized and is given a cost estimate without
      // scalarization overhead.
5988       ForcedScalars[VF].insert(I);
5989   }
5990 }
5991 
5992 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5993                                                         unsigned VF,
5994                                                         Type *&VectorTy) {
5995   Type *RetTy = I->getType();
5996   if (canTruncateToMinimalBitwidth(I, VF))
5997     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5998   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5999   auto SE = PSE.getSE();
6000 
6001   // TODO: We need to estimate the cost of intrinsic calls.
6002   switch (I->getOpcode()) {
6003   case Instruction::GetElementPtr:
6004     // We mark this instruction as zero-cost because the cost of GEPs in
6005     // vectorized code depends on whether the corresponding memory instruction
6006     // is scalarized or not. Therefore, we handle GEPs with the memory
6007     // instruction cost.
6008     return 0;
6009   case Instruction::Br: {
6010     // In cases of scalarized and predicated instructions, there will be VF
6011     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
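    // For example, at VF = 4 such a branch costs the extraction of 4 i1
    // compare elements plus 4 scalar branches.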
6013     bool ScalarPredicatedBB = false;
6014     BranchInst *BI = cast<BranchInst>(I);
6015     if (VF > 1 && BI->isConditional() &&
6016         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6017              PredicatedBBsAfterVectorization.end() ||
6018          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6019              PredicatedBBsAfterVectorization.end()))
6020       ScalarPredicatedBB = true;
6021 
6022     if (ScalarPredicatedBB) {
6023       // Return cost for branches around scalarized and predicated blocks.
6024       Type *Vec_i1Ty =
6025           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6026       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6027               (TTI.getCFInstrCost(Instruction::Br) * VF));
6028     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6029       // The back-edge branch will remain, as will all scalar branches.
6030       return TTI.getCFInstrCost(Instruction::Br);
6031     else
6032       // This branch will be eliminated by if-conversion.
6033       return 0;
6034     // Note: We currently assume zero cost for an unconditional branch inside
6035     // a predicated block since it will become a fall-through, although we
6036     // may decide in the future to call TTI for all branches.
6037   }
6038   case Instruction::PHI: {
6039     auto *Phi = cast<PHINode>(I);
6040 
6041     // First-order recurrences are replaced by vector shuffles inside the loop.
6042     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6043     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6044       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6045                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6046 
6047     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6048     // converted into select instructions. We require N - 1 selects per phi
6049     // node, where N is the number of incoming values.
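    // For example, a phi with three incoming values in an if-converted block
    // becomes two vector selects.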
6050     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6051       return (Phi->getNumIncomingValues() - 1) *
6052              TTI.getCmpSelInstrCost(
6053                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6054                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6055 
6056     return TTI.getCFInstrCost(Instruction::PHI);
6057   }
6058   case Instruction::UDiv:
6059   case Instruction::SDiv:
6060   case Instruction::URem:
6061   case Instruction::SRem:
6062     // If we have a predicated instruction, it may not be executed for each
6063     // vector lane. Get the scalarization cost and scale this amount by the
6064     // probability of executing the predicated block. If the instruction is not
6065     // predicated, we fall through to the next case.
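    // For example, at VF = 4 a predicated udiv is costed as 4 phi nodes plus
    // 4 scalar divisions plus scalarization overhead, all scaled down by the
    // probability of executing the predicated block.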
6066     if (VF > 1 && isScalarWithPredication(I)) {
6067       unsigned Cost = 0;
6068 
6069       // These instructions have a non-void type, so account for the phi nodes
6070       // that we will create. This cost is likely to be zero. The phi node
6071       // cost, if any, should be scaled by the block probability because it
6072       // models a copy at the end of each predicated block.
6073       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6074 
6075       // The cost of the non-predicated instruction.
6076       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6077 
6078       // The cost of insertelement and extractelement instructions needed for
6079       // scalarization.
6080       Cost += getScalarizationOverhead(I, VF);
6081 
6082       // Scale the cost by the probability of executing the predicated blocks.
6083       // This assumes the predicated block for each vector lane is equally
6084       // likely.
6085       return Cost / getReciprocalPredBlockProb();
6086     }
6087     LLVM_FALLTHROUGH;
6088   case Instruction::Add:
6089   case Instruction::FAdd:
6090   case Instruction::Sub:
6091   case Instruction::FSub:
6092   case Instruction::Mul:
6093   case Instruction::FMul:
6094   case Instruction::FDiv:
6095   case Instruction::FRem:
6096   case Instruction::Shl:
6097   case Instruction::LShr:
6098   case Instruction::AShr:
6099   case Instruction::And:
6100   case Instruction::Or:
6101   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
6103     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6104       return 0;
6105     // Certain instructions can be cheaper to vectorize if they have a constant
6106     // second vector operand. One example of this are shifts on x86.
6107     Value *Op2 = I->getOperand(1);
6108     TargetTransformInfo::OperandValueProperties Op2VP;
6109     TargetTransformInfo::OperandValueKind Op2VK =
6110         TTI.getOperandInfo(Op2, Op2VP);
6111     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6112       Op2VK = TargetTransformInfo::OK_UniformValue;
6113 
6114     SmallVector<const Value *, 4> Operands(I->operand_values());
6115     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6116     return N * TTI.getArithmeticInstrCost(
6117                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6118                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
6119   }
6120   case Instruction::FNeg: {
6121     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6122     return N * TTI.getArithmeticInstrCost(
6123                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6124                    TargetTransformInfo::OK_AnyValue,
6125                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6126                    I->getOperand(0));
6127   }
6128   case Instruction::Select: {
6129     SelectInst *SI = cast<SelectInst>(I);
6130     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6131     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6132     Type *CondTy = SI->getCondition()->getType();
6133     if (!ScalarCond)
6134       CondTy = VectorType::get(CondTy, VF);
6135 
6136     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6137   }
6138   case Instruction::ICmp:
6139   case Instruction::FCmp: {
6140     Type *ValTy = I->getOperand(0)->getType();
6141     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6142     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6143       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6144     VectorTy = ToVectorTy(ValTy, VF);
6145     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6146   }
6147   case Instruction::Store:
6148   case Instruction::Load: {
6149     unsigned Width = VF;
6150     if (Width > 1) {
6151       InstWidening Decision = getWideningDecision(I, Width);
6152       assert(Decision != CM_Unknown &&
6153              "CM decision should be taken at this point");
6154       if (Decision == CM_Scalarize)
6155         Width = 1;
6156     }
6157     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6158     return getMemoryInstructionCost(I, VF);
6159   }
6160   case Instruction::ZExt:
6161   case Instruction::SExt:
6162   case Instruction::FPToUI:
6163   case Instruction::FPToSI:
6164   case Instruction::FPExt:
6165   case Instruction::PtrToInt:
6166   case Instruction::IntToPtr:
6167   case Instruction::SIToFP:
6168   case Instruction::UIToFP:
6169   case Instruction::Trunc:
6170   case Instruction::FPTrunc:
6171   case Instruction::BitCast: {
6172     // We optimize the truncation of induction variables having constant
6173     // integer steps. The cost of these truncations is the same as the scalar
6174     // operation.
6175     if (isOptimizableIVTruncate(I, VF)) {
6176       auto *Trunc = cast<TruncInst>(I);
6177       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6178                                   Trunc->getSrcTy(), Trunc);
6179     }
6180 
6181     Type *SrcScalarTy = I->getOperand(0)->getType();
6182     Type *SrcVecTy =
6183         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6184     if (canTruncateToMinimalBitwidth(I, VF)) {
6185       // This cast is going to be shrunk. This may remove the cast or it might
6186       // turn it into slightly different cast. For example, if MinBW == 16,
6187       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6188       //
6189       // Calculate the modified src and dest types.
6190       Type *MinVecTy = VectorTy;
6191       if (I->getOpcode() == Instruction::Trunc) {
6192         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6193         VectorTy =
6194             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6195       } else if (I->getOpcode() == Instruction::ZExt ||
6196                  I->getOpcode() == Instruction::SExt) {
6197         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6198         VectorTy =
6199             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6200       }
6201     }
6202 
6203     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6204     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6205   }
6206   case Instruction::Call: {
6207     bool NeedToScalarize;
6208     CallInst *CI = cast<CallInst>(I);
6209     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6210     if (getVectorIntrinsicIDForCall(CI, TLI))
6211       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6212     return CallCost;
6213   }
6214   default:
6215     // The cost of executing VF copies of the scalar instruction. This opcode
6216     // is unknown. Assume that it is the same as 'mul'.
6217     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6218            getScalarizationOverhead(I, VF);
6219   } // end of switch.
6220 }
6221 
6222 char LoopVectorize::ID = 0;
6223 
6224 static const char lv_name[] = "Loop Vectorization";
6225 
6226 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6227 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6228 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6229 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6230 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6231 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6232 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6233 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6234 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6235 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6236 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6237 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6238 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6239 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6240 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6241 
6242 namespace llvm {
6243 
6244 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6245 
6246 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6247                               bool VectorizeOnlyWhenForced) {
6248   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6249 }
6250 
6251 } // end namespace llvm
6252 
6253 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6254   // Check if the pointer operand of a load or store instruction is
6255   // consecutive.
6256   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6257     return Legal->isConsecutivePtr(Ptr);
6258   return false;
6259 }
6260 
6261 void LoopVectorizationCostModel::collectValuesToIgnore() {
6262   // Ignore ephemeral values.
6263   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6264 
6265   // Ignore type-promoting instructions we identified during reduction
6266   // detection.
6267   for (auto &Reduction : *Legal->getReductionVars()) {
6268     RecurrenceDescriptor &RedDes = Reduction.second;
6269     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6270     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6271   }
6272   // Ignore type-casting instructions we identified during induction
6273   // detection.
6274   for (auto &Induction : *Legal->getInductionVars()) {
6275     InductionDescriptor &IndDes = Induction.second;
6276     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6277     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6278   }
6279 }
6280 
6281 // TODO: we could return a pair of values that specify the max VF and
6282 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6283 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6284 // doesn't have a cost model that can choose which plan to execute if
6285 // more than one is generated.
6286 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6287                                  LoopVectorizationCostModel &CM) {
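  // For example (illustrative): with 256-bit wide vector registers and a
  // widest scalar type of 32 bits, this returns a VF of 256 / 32 = 8.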
6288   unsigned WidestType;
6289   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6290   return WidestVectorRegBits / WidestType;
6291 }
6292 
6293 VectorizationFactor
6294 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6295   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
6297   // transformations before even evaluating whether vectorization is profitable.
6298   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6299   // the vectorization pipeline.
6300   if (!OrigLoop->empty()) {
6301     // If the user doesn't provide a vectorization factor, determine a
6302     // reasonable one.
6303     if (!UserVF) {
6304       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6305       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6306 
6307       // Make sure we have a VF > 1 for stress testing.
6308       if (VPlanBuildStressTest && VF < 2) {
6309         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6310                           << "overriding computed VF.\n");
6311         VF = 4;
6312       }
6313     }
6314     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6315     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6316     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6317                       << " to build VPlans.\n");
6318     buildVPlans(VF, VF);
6319 
6320     // For VPlan build stress testing, we bail out after VPlan construction.
6321     if (VPlanBuildStressTest)
6322       return VectorizationFactor::Disabled();
6323 
6324     return {VF, 0};
6325   }
6326 
6327   LLVM_DEBUG(
6328       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6329                 "VPlan-native path.\n");
6330   return VectorizationFactor::Disabled();
6331 }
6332 
6333 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6334   assert(OrigLoop->empty() && "Inner loop expected.");
6335   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6337     return None;
6338 
6339   // Invalidate interleave groups if all blocks of loop will be predicated.
6340   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6341       !useMaskedInterleavedAccesses(*TTI)) {
6342     LLVM_DEBUG(
6343         dbgs()
6344         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6345            "which requires masked-interleaved support.\n");
6346     CM.InterleaveInfo.reset();
6347   }
6348 
6349   if (UserVF) {
6350     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6351     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6352     // Collect the instructions (and their associated costs) that will be more
6353     // profitable to scalarize.
6354     CM.selectUserVectorizationFactor(UserVF);
6355     buildVPlansWithVPRecipes(UserVF, UserVF);
6356     LLVM_DEBUG(printPlans(dbgs()));
6357     return {{UserVF, 0}};
6358   }
6359 
6360   unsigned MaxVF = MaybeMaxVF.getValue();
6361   assert(MaxVF != 0 && "MaxVF is zero.");
6362 
6363   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6364     // Collect Uniform and Scalar instructions after vectorization with VF.
6365     CM.collectUniformsAndScalars(VF);
6366 
6367     // Collect the instructions (and their associated costs) that will be more
6368     // profitable to scalarize.
6369     if (VF > 1)
6370       CM.collectInstsToScalarize(VF);
6371   }
6372 
6373   buildVPlansWithVPRecipes(1, MaxVF);
6374   LLVM_DEBUG(printPlans(dbgs()));
6375   if (MaxVF == 1)
6376     return VectorizationFactor::Disabled();
6377 
6378   // Select the optimal vectorization factor.
6379   return CM.selectVectorizationFactor(MaxVF);
6380 }
6381 
6382 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6383   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6384                     << '\n');
6385   BestVF = VF;
6386   BestUF = UF;
6387 
6388   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6389     return !Plan->hasVF(VF);
6390   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6392 }
6393 
6394 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6395                                            DominatorTree *DT) {
6396   // Perform the actual loop transformation.
6397 
6398   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6399   VPCallbackILV CallbackILV(ILV);
6400 
6401   VPTransformState State{BestVF, BestUF,      LI,
6402                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6403                          &ILV,   CallbackILV};
6404   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6405   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6406 
6407   //===------------------------------------------------===//
6408   //
  // Notice: any optimization or new instruction that goes
6410   // into the code below should also be implemented in
6411   // the cost-model.
6412   //
6413   //===------------------------------------------------===//
6414 
6415   // 2. Copy and widen instructions from the old loop into the new loop.
6416   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6417   VPlans.front()->execute(&State);
6418 
6419   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6420   //    predication, updating analyses.
6421   ILV.fixVectorizedLoop();
6422 }
6423 
6424 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6425     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6426   BasicBlock *Latch = OrigLoop->getLoopLatch();
6427 
6428   // We create new control-flow for the vectorized loop, so the original
6429   // condition will be dead after vectorization if it's only used by the
6430   // branch.
6431   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6432   if (Cmp && Cmp->hasOneUse())
6433     DeadInstructions.insert(Cmp);
6434 
6435   // We create new "steps" for induction variable updates to which the original
6436   // induction variables map. An original update instruction will be dead if
6437   // all its users except the induction variable are dead.
6438   for (auto &Induction : *Legal->getInductionVars()) {
6439     PHINode *Ind = Induction.first;
6440     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6441     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6442           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6443                                  DeadInstructions.end();
6444         }))
6445       DeadInstructions.insert(IndUpdate);
6446 
6447     // We record as "Dead" also the type-casting instructions we had identified
6448     // during induction analysis. We don't need any handling for them in the
6449     // vectorized loop because we have proven that, under a proper runtime
6450     // test guarding the vectorized loop, the value of the phi, and the casted
6451     // value of the phi, are the same. The last instruction in this casting chain
6452     // will get its scalar/vector/widened def from the scalar/vector/widened def
6453     // of the respective phi node. Any other casts in the induction def-use chain
6454     // have no other uses outside the phi update chain, and will be ignored.
6455     InductionDescriptor &IndDes = Induction.second;
6456     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6457     DeadInstructions.insert(Casts.begin(), Casts.end());
6458   }
6459 }
6460 
6461 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6462 
6463 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6464 
6465 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6466                                         Instruction::BinaryOps BinOp) {
6467   // When unrolling and the VF is 1, we only need to add a simple scalar.
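  // E.g. (illustrative) for an i64 induction with start value %val, step %s
  // and StartIdx 3, the integer path below emits roughly:
  //   %mul = mul i64 3, %s
  //   %induction = add i64 %val, %mul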
6468   Type *Ty = Val->getType();
6469   assert(!Ty->isVectorTy() && "Val must be a scalar");
6470 
6471   if (Ty->isFloatingPointTy()) {
6472     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6473 
6474     // Floating point operations had to be 'fast' to enable the unrolling.
6475     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6476     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6477   }
6478   Constant *C = ConstantInt::get(Ty, StartIdx);
6479   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6480 }
6481 
6482 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6483   SmallVector<Metadata *, 4> MDs;
6484   // Reserve first location for self reference to the LoopID metadata node.
6485   MDs.push_back(nullptr);
6486   bool IsUnrollMetadata = false;
6487   MDNode *LoopID = L->getLoopID();
6488   if (LoopID) {
6489     // First find existing loop unrolling disable metadata.
6490     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6491       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6492       if (MD) {
6493         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6494         IsUnrollMetadata =
6495             S && S->getString().startswith("llvm.loop.unroll.disable");
6496       }
6497       MDs.push_back(LoopID->getOperand(i));
6498     }
6499   }
6500 
6501   if (!IsUnrollMetadata) {
6502     // Add runtime unroll disable metadata.
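    // The resulting loop metadata looks roughly like (illustrative):
    //   !llvm.loop !0
    //   !0 = distinct !{!0, ..., !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}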
6503     LLVMContext &Context = L->getHeader()->getContext();
6504     SmallVector<Metadata *, 1> DisableOperands;
6505     DisableOperands.push_back(
6506         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6507     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6508     MDs.push_back(DisableNode);
6509     MDNode *NewLoopID = MDNode::get(Context, MDs);
6510     // Set operand 0 to refer to the loop id itself.
6511     NewLoopID->replaceOperandWith(0, NewLoopID);
6512     L->setLoopID(NewLoopID);
6513   }
6514 }
6515 
6516 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6517     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6518   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6519   bool PredicateAtRangeStart = Predicate(Range.Start);
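  // Clamp Range.End at the first VF whose answer differs from Range.Start's.
  // Illustrative example: with Range = {2, 16} and a predicate that holds
  // only for VF <= 4, the loop below shrinks the range to {2, 8} and the
  // function returns true (the predicate's value at Range.Start).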
6520 
6521   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6522     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6523       Range.End = TmpVF;
6524       break;
6525     }
6526 
6527   return PredicateAtRangeStart;
6528 }
6529 
6530 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6531 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6532 /// of VF's starting at a given VF and extending it as much as possible. Each
6533 /// vectorization decision can potentially shorten this sub-range during
6534 /// buildVPlan().
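/// For example (illustrative): with MinVF = 1, MaxVF = 8 and a widening
/// decision that changes at VF = 4, this builds two VPlans, one covering
/// VF = {1, 2} and one covering VF = {4, 8}.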
6535 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6536   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6537     VFRange SubRange = {VF, MaxVF + 1};
6538     VPlans.push_back(buildVPlan(SubRange));
6539     VF = SubRange.End;
6540   }
6541 }
6542 
6543 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6544                                          VPlanPtr &Plan) {
6545   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6546 
6547   // Look for cached value.
6548   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6549   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6550   if (ECEntryIt != EdgeMaskCache.end())
6551     return ECEntryIt->second;
6552 
6553   VPValue *SrcMask = createBlockInMask(Src, Plan);
6554 
6555   // The terminator has to be a branch inst!
6556   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6557   assert(BI && "Unexpected terminator found");
6558 
6559   if (!BI->isConditional())
6560     return EdgeMaskCache[Edge] = SrcMask;
6561 
6562   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6563   assert(EdgeMask && "No Edge Mask found for condition");
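  // Illustrative example: for a branch on condition %c, the edge towards the
  // false successor gets mask (not %c); either edge mask is then ANDed with
  // the source block's mask unless that mask is all-one.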
6564 
6565   if (BI->getSuccessor(0) != Dst)
6566     EdgeMask = Builder.createNot(EdgeMask);
6567 
6568   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6569     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6570 
6571   return EdgeMaskCache[Edge] = EdgeMask;
6572 }
6573 
6574 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6575   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6576 
6577   // Look for cached value.
6578   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6579   if (BCEntryIt != BlockMaskCache.end())
6580     return BCEntryIt->second;
6581 
6582   // All-one mask is modelled as no-mask following the convention for masked
6583   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6584   VPValue *BlockMask = nullptr;
6585 
6586   if (OrigLoop->getHeader() == BB) {
6587     if (!CM.blockNeedsPredication(BB))
6588       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6589 
6590     // Introduce the early-exit compare IV <= BTC to form header block mask.
6591     // This is used instead of IV < TC because TC may wrap, unlike BTC.
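    // E.g. (illustrative) with a trip count of 10 and VF 4, the last vector
    // iteration compares lanes {8, 9, 10, 11} against BTC 9 and masks off the
    // two excess lanes. BTC also stays correct where the trip count would
    // wrap, e.g. an i8 trip count of 256 wraps to 0 while BTC is 255.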
6592     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6593     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6594     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6595     return BlockMaskCache[BB] = BlockMask;
6596   }
6597 
6598   // This is the block mask. We OR all incoming edges.
6599   for (auto *Predecessor : predecessors(BB)) {
6600     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6601     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6602       return BlockMaskCache[BB] = EdgeMask;
6603 
6604     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6605       BlockMask = EdgeMask;
6606       continue;
6607     }
6608 
6609     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6610   }
6611 
6612   return BlockMaskCache[BB] = BlockMask;
6613 }
6614 
6615 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6616                                                            VFRange &Range,
6617                                                            VPlanPtr &Plan) {
6618   const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6619   if (!IG)
6620     return nullptr;
6621 
6622   // Now check if IG is relevant for VF's in the given range.
6623   auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6624     return [=](unsigned VF) -> bool {
6625       return (VF >= 2 && // Query is illegal for VF == 1
6626               CM.getWideningDecision(I, VF) ==
6627                   LoopVectorizationCostModel::CM_Interleave);
6628     };
6629   };
6630   if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6631     return nullptr;
6632 
  // I is a member of an InterleaveGroup for VFs in the (possibly trimmed)
  // range. If it is the primary member of the IG, construct a
  // VPInterleaveRecipe. Otherwise it is an adjunct member of the IG; do not
  // construct any recipe for it.
6636   assert(I == IG->getInsertPos() &&
6637          "Generating a recipe for an adjunct member of an interleave group");
6638 
6639   VPValue *Mask = nullptr;
6640   if (Legal->isMaskRequired(I))
6641     Mask = createBlockInMask(I->getParent(), Plan);
6642 
6643   return new VPInterleaveRecipe(IG, Mask);
6644 }
6645 
6646 VPWidenMemoryInstructionRecipe *
6647 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6648                                   VPlanPtr &Plan) {
6649   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6650     return nullptr;
6651 
6652   auto willWiden = [&](unsigned VF) -> bool {
6653     if (VF == 1)
6654       return false;
6655     if (CM.isScalarAfterVectorization(I, VF) ||
6656         CM.isProfitableToScalarize(I, VF))
6657       return false;
6658     LoopVectorizationCostModel::InstWidening Decision =
6659         CM.getWideningDecision(I, VF);
6660     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6661            "CM decision should be taken at this point.");
6662     assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6663            "Interleave memory opportunity should be caught earlier.");
6664     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6665   };
6666 
6667   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6668     return nullptr;
6669 
6670   VPValue *Mask = nullptr;
6671   if (Legal->isMaskRequired(I))
6672     Mask = createBlockInMask(I->getParent(), Plan);
6673 
6674   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6675 }
6676 
6677 VPWidenIntOrFpInductionRecipe *
6678 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6679   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6680     // Check if this is an integer or fp induction. If so, build the recipe that
6681     // produces its scalar and vector values.
6682     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6683     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6684         II.getKind() == InductionDescriptor::IK_FpInduction)
6685       return new VPWidenIntOrFpInductionRecipe(Phi);
6686 
6687     return nullptr;
6688   }
6689 
6690   // Optimize the special case where the source is a constant integer
6691   // induction variable. Notice that we can only optimize the 'trunc' case
6692   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6693   // (c) other casts depend on pointer size.
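  // E.g. (illustrative) %t = trunc i64 %iv to i32, where %iv is a primary
  // induction with a constant step, can be widened directly as a narrower
  // i32 induction instead of widening %iv and truncating every element.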
6694 
6695   // Determine whether \p K is a truncation based on an induction variable that
6696   // can be optimized.
6697   auto isOptimizableIVTruncate =
6698       [&](Instruction *K) -> std::function<bool(unsigned)> {
6699     return
6700         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6701   };
6702 
6703   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6704                                isOptimizableIVTruncate(I), Range))
6705     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6706                                              cast<TruncInst>(I));
6707   return nullptr;
6708 }
6709 
6710 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6711   PHINode *Phi = dyn_cast<PHINode>(I);
6712   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6713     return nullptr;
6714 
6715   // We know that all PHIs in non-header blocks are converted into selects, so
6716   // we don't have to worry about the insertion order and we can just use the
6717   // builder. At this point we generate the predication tree. There may be
6718   // duplications since this is a simple recursive scan, but future
6719   // optimizations will clean it up.
6720 
6721   SmallVector<VPValue *, 2> Masks;
6722   unsigned NumIncoming = Phi->getNumIncomingValues();
6723   for (unsigned In = 0; In < NumIncoming; In++) {
6724     VPValue *EdgeMask =
6725       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6726     assert((EdgeMask || NumIncoming == 1) &&
6727            "Multiple predecessors with one having a full mask");
6728     if (EdgeMask)
6729       Masks.push_back(EdgeMask);
6730   }
6731   return new VPBlendRecipe(Phi, Masks);
6732 }
6733 
6734 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6735                                  VFRange &Range) {
6736 
6737   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6738       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6739 
6740   if (IsPredicated)
6741     return false;
6742 
6743   auto IsVectorizableOpcode = [](unsigned Opcode) {
6744     switch (Opcode) {
6745     case Instruction::Add:
6746     case Instruction::And:
6747     case Instruction::AShr:
6748     case Instruction::BitCast:
6749     case Instruction::Br:
6750     case Instruction::Call:
6751     case Instruction::FAdd:
6752     case Instruction::FCmp:
6753     case Instruction::FDiv:
6754     case Instruction::FMul:
6755     case Instruction::FNeg:
6756     case Instruction::FPExt:
6757     case Instruction::FPToSI:
6758     case Instruction::FPToUI:
6759     case Instruction::FPTrunc:
6760     case Instruction::FRem:
6761     case Instruction::FSub:
6762     case Instruction::GetElementPtr:
6763     case Instruction::ICmp:
6764     case Instruction::IntToPtr:
6765     case Instruction::Load:
6766     case Instruction::LShr:
6767     case Instruction::Mul:
6768     case Instruction::Or:
6769     case Instruction::PHI:
6770     case Instruction::PtrToInt:
6771     case Instruction::SDiv:
6772     case Instruction::Select:
6773     case Instruction::SExt:
6774     case Instruction::Shl:
6775     case Instruction::SIToFP:
6776     case Instruction::SRem:
6777     case Instruction::Store:
6778     case Instruction::Sub:
6779     case Instruction::Trunc:
6780     case Instruction::UDiv:
6781     case Instruction::UIToFP:
6782     case Instruction::URem:
6783     case Instruction::Xor:
6784     case Instruction::ZExt:
6785       return true;
6786     }
6787     return false;
6788   };
6789 
6790   if (!IsVectorizableOpcode(I->getOpcode()))
6791     return false;
6792 
6793   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6794     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6795     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6796                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6797       return false;
6798   }
6799 
6800   auto willWiden = [&](unsigned VF) -> bool {
6801     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6802                              CM.isProfitableToScalarize(I, VF)))
6803       return false;
6804     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6805       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6806       // The following case may be scalarized depending on the VF.
6807       // The flag shows whether we use Intrinsic or a usual Call for vectorized
6808       // version of the instruction.
6809       // Is it beneficial to perform intrinsic call compared to lib call?
6810       bool NeedToScalarize;
6811       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6812       bool UseVectorIntrinsic =
6813           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6814       return UseVectorIntrinsic || !NeedToScalarize;
6815     }
6816     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6817       assert(CM.getWideningDecision(I, VF) ==
6818                  LoopVectorizationCostModel::CM_Scalarize &&
6819              "Memory widening decisions should have been taken care by now");
6820       return false;
6821     }
6822     return true;
6823   };
6824 
6825   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6826     return false;
6827 
6828   // Success: widen this instruction. We optimize the common case where
6829   // consecutive instructions can be represented by a single recipe.
6830   if (!VPBB->empty()) {
6831     VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6832     if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6833       return true;
6834   }
6835 
6836   VPBB->appendRecipe(new VPWidenRecipe(I));
6837   return true;
6838 }
6839 
6840 VPBasicBlock *VPRecipeBuilder::handleReplication(
6841     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6842     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6843     VPlanPtr &Plan) {
6844   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6845       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6846       Range);
6847 
6848   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6849       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6850 
6851   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6852 
6853   // Find if I uses a predicated instruction. If so, it will use its scalar
6854   // value. Avoid hoisting the insert-element which packs the scalar value into
6855   // a vector value, as that happens iff all users use the vector value.
6856   for (auto &Op : I->operands())
6857     if (auto *PredInst = dyn_cast<Instruction>(Op))
6858       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6859         PredInst2Recipe[PredInst]->setAlsoPack(false);
6860 
6861   // Finalize the recipe for Instr, first if it is not predicated.
6862   if (!IsPredicated) {
6863     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6864     VPBB->appendRecipe(Recipe);
6865     return VPBB;
6866   }
6867   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6868   assert(VPBB->getSuccessors().empty() &&
6869          "VPBB has successors when handling predicated replication.");
6870   // Record predicated instructions for above packing optimizations.
6871   PredInst2Recipe[I] = Recipe;
6872   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6873   VPBlockUtils::insertBlockAfter(Region, VPBB);
6874   auto *RegSucc = new VPBasicBlock();
6875   VPBlockUtils::insertBlockAfter(RegSucc, Region);
6876   return RegSucc;
6877 }
6878 
6879 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6880                                                       VPRecipeBase *PredRecipe,
6881                                                       VPlanPtr &Plan) {
6882   // Instructions marked for predication are replicated and placed under an
6883   // if-then construct to prevent side-effects.
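  //
  // The region built below has a triangular shape (illustrative):
  //
  //        pred.<op>.entry   (branch on the block mask)
  //           /       \
  //   pred.<op>.if     |
  //  (replicated op)   |
  //           \       /
  //       pred.<op>.continue  (phi of the predicated value, if any)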
6884 
6885   // Generate recipes to compute the block mask for this region.
6886   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6887 
6888   // Build the triangular if-then region.
6889   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6890   assert(Instr->getParent() && "Predicated instruction not in any basic block");
6891   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6892   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6893   auto *PHIRecipe =
6894       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6895   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6896   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6897   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6898 
6899   // Note: first set Entry as region entry and then connect successors starting
6900   // from it in order, to propagate the "parent" of each VPBasicBlock.
6901   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6902   VPBlockUtils::connectBlocks(Pred, Exit);
6903 
6904   return Region;
6905 }
6906 
6907 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6908                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
6909   VPRecipeBase *Recipe = nullptr;
6910   // Check if Instr should belong to an interleave memory recipe, or already
6911   // does. In the latter case Instr is irrelevant.
6912   if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6913     VPBB->appendRecipe(Recipe);
6914     return true;
6915   }
6916 
6917   // Check if Instr is a memory operation that should be widened.
6918   if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6919     VPBB->appendRecipe(Recipe);
6920     return true;
6921   }
6922 
6923   // Check if Instr should form some PHI recipe.
6924   if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6925     VPBB->appendRecipe(Recipe);
6926     return true;
6927   }
6928   if ((Recipe = tryToBlend(Instr, Plan))) {
6929     VPBB->appendRecipe(Recipe);
6930     return true;
6931   }
6932   if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6933     VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6934     return true;
6935   }
6936 
6937   // Check if Instr is to be widened by a general VPWidenRecipe, after
6938   // having first checked for specific widening recipes that deal with
6939   // Interleave Groups, Inductions and Phi nodes.
6940   if (tryToWiden(Instr, VPBB, Range))
6941     return true;
6942 
6943   return false;
6944 }
6945 
6946 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6947                                                         unsigned MaxVF) {
6948   assert(OrigLoop->empty() && "Inner loop expected.");
6949 
6950   // Collect conditions feeding internal conditional branches; they need to be
6951   // represented in VPlan for it to model masking.
6952   SmallPtrSet<Value *, 1> NeedDef;
6953 
6954   auto *Latch = OrigLoop->getLoopLatch();
6955   for (BasicBlock *BB : OrigLoop->blocks()) {
6956     if (BB == Latch)
6957       continue;
6958     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6959     if (Branch && Branch->isConditional())
6960       NeedDef.insert(Branch->getCondition());
6961   }
6962 
6963   // If the tail is to be folded by masking, the primary induction variable
6964   // needs to be represented in VPlan for it to model early-exit masking.
6965   // Also, both the Phi and the live-out instruction of each reduction are
6966   // required in order to introduce a select between them in VPlan.
6967   if (CM.foldTailByMasking()) {
6968     NeedDef.insert(Legal->getPrimaryInduction());
6969     for (auto &Reduction : *Legal->getReductionVars()) {
6970       NeedDef.insert(Reduction.first);
6971       NeedDef.insert(Reduction.second.getLoopExitInstr());
6972     }
6973   }
6974 
6975   // Collect instructions from the original loop that will become trivially dead
6976   // in the vectorized loop. We don't need to vectorize these instructions. For
6977   // example, original induction update instructions can become dead because we
6978   // separately emit induction "steps" when generating code for the new loop.
6979   // Similarly, we create a new latch condition when setting up the structure
6980   // of the new loop, so the old one can become dead.
6981   SmallPtrSet<Instruction *, 4> DeadInstructions;
6982   collectTriviallyDeadInstructions(DeadInstructions);
6983 
6984   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6985     VFRange SubRange = {VF, MaxVF + 1};
6986     VPlans.push_back(
6987         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6988     VF = SubRange.End;
6989   }
6990 }
6991 
6992 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
6993     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
6994     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6995   // Hold a mapping from predicated instructions to their recipes, in order to
6996   // fix their AlsoPack behavior if a user is determined to replicate and use a
6997   // scalar instead of vector value.
6998   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
6999 
7000   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7001   DenseMap<Instruction *, Instruction *> SinkAfterInverse;
7002 
7003   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7004   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7005   auto Plan = std::make_unique<VPlan>(VPBB);
7006 
7007   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7008   // Represent values that will have defs inside VPlan.
7009   for (Value *V : NeedDef)
7010     Plan->addVPValue(V);
7011 
7012   // Scan the body of the loop in a topological order to visit each basic block
7013   // after having visited its predecessor basic blocks.
7014   LoopBlocksDFS DFS(OrigLoop);
7015   DFS.perform(LI);
7016 
7017   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7018     // Relevant instructions from basic block BB will be grouped into VPRecipe
7019     // ingredients and fill a new VPBasicBlock.
7020     unsigned VPBBsForBB = 0;
7021     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7022     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7023     VPBB = FirstVPBBForBB;
7024     Builder.setInsertPoint(VPBB);
7025 
7026     std::vector<Instruction *> Ingredients;
7027 
7028     // Organize the ingredients to vectorize from current basic block in the
7029     // right order.
7030     for (Instruction &I : BB->instructionsWithoutDebug()) {
7031       Instruction *Instr = &I;
7032 
7033       // First filter out irrelevant instructions, to ensure no recipes are
7034       // built for them.
7035       if (isa<BranchInst>(Instr) ||
7036           DeadInstructions.find(Instr) != DeadInstructions.end())
7037         continue;
7038 
      // Instr may be a member of an InterleaveGroup for Range.Start. If it's
      // an adjunct member of the IG, do not construct any recipe for it.
7041       const InterleaveGroup<Instruction> *IG =
7042           CM.getInterleavedAccessGroup(Instr);
7043       if (IG && Instr != IG->getInsertPos() &&
7044           Range.Start >= 2 && // Query is illegal for VF == 1
7045           CM.getWideningDecision(Instr, Range.Start) ==
7046               LoopVectorizationCostModel::CM_Interleave) {
7047         auto SinkCandidate = SinkAfterInverse.find(Instr);
7048         if (SinkCandidate != SinkAfterInverse.end())
7049           Ingredients.push_back(SinkCandidate->second);
7050         continue;
7051       }
7052 
7053       // Move instructions to handle first-order recurrences, step 1: avoid
7054       // handling this instruction until after we've handled the instruction it
7055       // should follow.
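      // Illustrative example: if SinkAfter maps %use -> %def (i.e. %use must
      // be sunk to just after %def), %use is skipped here and re-appended
      // right after %def in step 2 below.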
7056       auto SAIt = SinkAfter.find(Instr);
7057       if (SAIt != SinkAfter.end()) {
7058         LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
7059                           << *SAIt->second
7060                           << " to vectorize a 1st order recurrence.\n");
7061         SinkAfterInverse[SAIt->second] = Instr;
7062         continue;
7063       }
7064 
7065       Ingredients.push_back(Instr);
7066 
7067       // Move instructions to handle first-order recurrences, step 2: push the
7068       // instruction to be sunk at its insertion point.
7069       auto SAInvIt = SinkAfterInverse.find(Instr);
7070       if (SAInvIt != SinkAfterInverse.end())
7071         Ingredients.push_back(SAInvIt->second);
7072     }
7073 
7074     // Introduce each ingredient into VPlan.
7075     for (Instruction *Instr : Ingredients) {
7076       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7077         continue;
7078 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7081       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7082           Instr, Range, VPBB, PredInst2Recipe, Plan);
7083       if (NextVPBB != VPBB) {
7084         VPBB = NextVPBB;
7085         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7086                                     : "");
7087       }
7088     }
7089   }
7090 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7094   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7095   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7096   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7097   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7098   delete PreEntry;
7099 
7100   // Finally, if tail is folded by masking, introduce selects between the phi
7101   // and the live-out instruction of each reduction, at the end of the latch.
7102   if (CM.foldTailByMasking()) {
7103     Builder.setInsertPoint(VPBB);
7104     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7105     for (auto &Reduction : *Legal->getReductionVars()) {
7106       VPValue *Phi = Plan->getVPValue(Reduction.first);
7107       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7108       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7109     }
7110   }
7111 
7112   std::string PlanName;
7113   raw_string_ostream RSO(PlanName);
7114   unsigned VF = Range.Start;
7115   Plan->addVF(VF);
7116   RSO << "Initial VPlan for VF={" << VF;
7117   for (VF *= 2; VF < Range.End; VF *= 2) {
7118     Plan->addVF(VF);
7119     RSO << "," << VF;
7120   }
7121   RSO << "},UF>=1";
7122   RSO.flush();
7123   Plan->setName(PlanName);
7124 
7125   return Plan;
7126 }
7127 
7128 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
7130   // transformations before even evaluating whether vectorization is profitable.
7131   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7132   // the vectorization pipeline.
7133   assert(!OrigLoop->empty());
7134   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7135 
7136   // Create new empty VPlan
7137   auto Plan = std::make_unique<VPlan>();
7138 
7139   // Build hierarchical CFG
7140   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7141   HCFGBuilder.buildHierarchicalCFG();
7142 
7143   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7144     Plan->addVF(VF);
7145 
7146   if (EnableVPlanPredication) {
7147     VPlanPredicator VPP(*Plan);
7148     VPP.predicate();
7149 
7150     // Avoid running transformation to recipes until masked code generation in
7151     // VPlan-native path is in place.
7152     return Plan;
7153   }
7154 
7155   SmallPtrSet<Instruction *, 1> DeadInstructions;
7156   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7157       Plan, Legal->getInductionVars(), DeadInstructions);
7158 
7159   return Plan;
7160 }
7161 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7166 
7167 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7168   O << " +\n"
7169     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7170   IG->getInsertPos()->printAsOperand(O, false);
7171   if (User) {
7172     O << ", ";
7173     User->getOperand(0)->printAsOperand(O);
7174   }
7175   O << "\\l\"";
7176   for (unsigned i = 0; i < IG->getFactor(); ++i)
7177     if (Instruction *I = IG->getMember(i))
7178       O << " +\n"
7179         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7180 }
7181 
7182 void VPWidenRecipe::execute(VPTransformState &State) {
7183   for (auto &Instr : make_range(Begin, End))
7184     State.ILV->widenInstruction(Instr);
7185 }
7186 
7187 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7188   assert(!State.Instance && "Int or FP induction being replicated.");
7189   State.ILV->widenIntOrFpInduction(IV, Trunc);
7190 }
7191 
7192 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7193   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7194 }
7195 
7196 void VPBlendRecipe::execute(VPTransformState &State) {
7197   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7198   // We know that all PHIs in non-header blocks are converted into
7199   // selects, so we don't have to worry about the insertion order and we
7200   // can just use the builder.
7201   // At this point we generate the predication tree. There may be
7202   // duplications since this is a simple recursive scan, but future
7203   // optimizations will clean it up.
7204 
7205   unsigned NumIncoming = Phi->getNumIncomingValues();
7206 
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7209   // Generate a sequence of selects of the form:
7210   // SELECT(Mask3, In3,
7211   //      SELECT(Mask2, In2,
7212   //                   ( ...)))
7213   InnerLoopVectorizer::VectorParts Entry(State.UF);
7214   for (unsigned In = 0; In < NumIncoming; ++In) {
7215     for (unsigned Part = 0; Part < State.UF; ++Part) {
7216       // We might have single edge PHIs (blocks) - use an identity
7217       // 'select' for the first PHI operand.
7218       Value *In0 =
7219           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7220       if (In == 0)
7221         Entry[Part] = In0; // Initialize with the first incoming value.
7222       else {
7223         // Select between the current value and the previous incoming edge
7224         // based on the incoming mask.
7225         Value *Cond = State.get(User->getOperand(In), Part);
7226         Entry[Part] =
7227             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7228       }
7229     }
7230   }
7231   for (unsigned Part = 0; Part < State.UF; ++Part)
7232     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7233 }
7234 
7235 void VPInterleaveRecipe::execute(VPTransformState &State) {
7236   assert(!State.Instance && "Interleave group being replicated.");
7237   if (!User)
7238     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7239 
7240   // Last (and currently only) operand is a mask.
7241   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7242   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7243   for (unsigned Part = 0; Part < State.UF; ++Part)
7244     MaskValues[Part] = State.get(Mask, Part);
7245   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7246 }
7247 
7248 void VPReplicateRecipe::execute(VPTransformState &State) {
7249   if (State.Instance) { // Generate a single instance.
7250     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7251     // Insert scalar instance packing it into a vector.
7252     if (AlsoPack && State.VF > 1) {
7253       // If we're constructing lane 0, initialize to start from undef.
7254       if (State.Instance->Lane == 0) {
7255         Value *Undef =
7256             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7257         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7258       }
7259       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7260     }
7261     return;
7262   }
7263 
7264   // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
7267   unsigned EndLane = IsUniform ? 1 : State.VF;
7268   for (unsigned Part = 0; Part < State.UF; ++Part)
7269     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7270       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7271 }
7272 
7273 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7274   assert(State.Instance && "Branch on Mask works only on single instance.");
7275 
7276   unsigned Part = State.Instance->Part;
7277   unsigned Lane = State.Instance->Lane;
7278 
7279   Value *ConditionBit = nullptr;
7280   if (!User) // Block in mask is all-one.
7281     ConditionBit = State.Builder.getTrue();
7282   else {
7283     VPValue *BlockInMask = User->getOperand(0);
7284     ConditionBit = State.get(BlockInMask, Part);
7285     if (ConditionBit->getType()->isVectorTy())
7286       ConditionBit = State.Builder.CreateExtractElement(
7287           ConditionBit, State.Builder.getInt32(Lane));
7288   }
7289 
7290   // Replace the temporary unreachable terminator with a new conditional branch,
7291   // whose two destinations will be set later when they are created.
7292   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7293   assert(isa<UnreachableInst>(CurrentTerminator) &&
7294          "Expected to replace unreachable terminator with conditional branch.");
7295   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7296   CondBr->setSuccessor(0, nullptr);
7297   ReplaceInstWithInst(CurrentTerminator, CondBr);
7298 }
7299 
7300 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7301   assert(State.Instance && "Predicated instruction PHI works per instance.");
7302   Instruction *ScalarPredInst = cast<Instruction>(
7303       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7304   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7305   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7306   assert(PredicatingBB && "Predicated block has no single predecessor.");
7307 
7308   // By current pack/unpack logic we need to generate only a single phi node: if
7309   // a vector value for the predicated instruction exists at this point it means
7310   // the instruction has vector users only, and a phi for the vector value is
7311   // needed. In this case the recipe of the predicated instruction is marked to
7312   // also do that packing, thereby "hoisting" the insert-element sequence.
7313   // Otherwise, a phi node for the scalar value is needed.
7314   unsigned Part = State.Instance->Part;
7315   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7316     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7317     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7318     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7319     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7320     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7321     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7322   } else {
7323     Type *PredInstType = PredInst->getType();
7324     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7325     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7326     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7327     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7328   }
7329 }
7330 
7331 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7332   if (!User)
7333     return State.ILV->vectorizeMemoryInstruction(&Instr);
7334 
7335   // Last (and currently only) operand is a mask.
7336   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7337   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7338   for (unsigned Part = 0; Part < State.UF; ++Part)
7339     MaskValues[Part] = State.get(Mask, Part);
7340   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7341 }
7342 
7343 static ScalarEpilogueLowering
7344 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7345                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
7346   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7347   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7348       (F->hasOptSize() ||
7349        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
7350     SEL = CM_ScalarEpilogueNotAllowedOptSize;
7351   else if (PreferPredicateOverEpilog || Hints.getPredicate())
7352     SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7353 
7354   return SEL;
7355 }
7356 
7357 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
7359 // VPlan-to-VPlan transformations from the very beginning without modifying the
7360 // input LLVM IR.
7361 static bool processLoopInVPlanNativePath(
7362     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7363     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7364     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7365     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7366     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7367 
7368   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7369   Function *F = L->getHeader()->getParent();
7370   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7371   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7372 
7373   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7374                                 &Hints, IAI);
7375   // Use the planner for outer loop vectorization.
7376   // TODO: CM is not used at this point inside the planner. Turn CM into an
7377   // optional argument if we don't need it in the future.
7378   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7379 
7380   // Get user vectorization factor.
7381   const unsigned UserVF = Hints.getWidth();
7382 
7383   // Plan how to best vectorize, return the best VF and its cost.
7384   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7385 
7386   // If we are stress testing VPlan builds, do not attempt to generate vector
7387   // code. Masked vector code generation support will follow soon.
7388   // Also, do not attempt to vectorize if no vector code will be produced.
7389   if (VPlanBuildStressTest || EnableVPlanPredication ||
7390       VectorizationFactor::Disabled() == VF)
7391     return false;
7392 
7393   LVP.setBestPlan(VF.Width, 1);
7394 
7395   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7396                          &CM);
7397   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7398                     << L->getHeader()->getParent()->getName() << "\"\n");
7399   LVP.executePlan(LB, DT);
7400 
7401   // Mark the loop as already vectorized to avoid vectorizing again.
7402   Hints.setAlreadyVectorized();
7403 
7404   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7405   return true;
7406 }
7407 
7408 bool LoopVectorizePass::processLoop(Loop *L) {
7409   assert((EnableVPlanNativePath || L->empty()) &&
7410          "VPlan-native path is not enabled. Only process inner loops.");
7411 
7412 #ifndef NDEBUG
7413   const std::string DebugLocStr = getDebugLocString(L);
7414 #endif /* NDEBUG */
7415 
7416   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7417                     << L->getHeader()->getParent()->getName() << "\" from "
7418                     << DebugLocStr << "\n");
7419 
7420   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
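  // These hints typically originate from "#pragma clang loop ..." directives,
  // which the frontend lowers to "llvm.loop.vectorize.*" and
  // "llvm.loop.interleave.count" loop metadata.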
7421 
7422   LLVM_DEBUG(
7423       dbgs() << "LV: Loop hints:"
7424              << " force="
7425              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7426                      ? "disabled"
7427                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7428                             ? "enabled"
7429                             : "?"))
7430              << " width=" << Hints.getWidth()
7431              << " unroll=" << Hints.getInterleave() << "\n");
7432 
7433   // Function containing loop
7434   Function *F = L->getHeader()->getParent();
7435 
7436   // Looking at the diagnostic output is the only way to determine if a loop
7437   // was vectorized (other than looking at the IR or machine code), so it
7438   // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
7443 
7444   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7445     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7446     return false;
7447   }
7448 
7449   PredicatedScalarEvolution PSE(*SE, *L);
7450 
7451   // Check if it is legal to vectorize the loop.
7452   LoopVectorizationRequirements Requirements(*ORE);
7453   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7454                                 &Requirements, &Hints, DB, AC);
7455   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7456     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7457     Hints.emitRemarkWithHints();
7458     return false;
7459   }
7460 
7461   // Check the function attributes and profiles to find out if this function
7462   // should be optimized for size.
7463   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7464 
7465   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7466   // here. They may require CFG and instruction level transformations before
7467   // even evaluating whether vectorization is profitable. Since we cannot modify
7468   // the incoming IR, we need to build VPlan upfront in the vectorization
7469   // pipeline.
7470   if (!L->empty())
7471     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7472                                         ORE, BFI, PSI, Hints);
7473 
7474   assert(L->empty() && "Inner loop expected.");
7475   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7476   // count by optimizing for size, to minimize overheads.
  // Prefer a constant trip count, then profile data, then SCEV's upper-bound
  // estimate.
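  // For example, a loop known to execute only a handful of iterations gains
  // little from a wide vector body followed by a scalar remainder, so below
  // we disallow a scalar epilogue for such loops (the tail must then be
  // folded, or the loop is left scalar) unless vectorization is forced.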
7478   unsigned ExpectedTC = 0;
7479   bool HasExpectedTC = false;
7480   if (const SCEVConstant *ConstExits =
7481       dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7482     const APInt &ExitsCount = ConstExits->getAPInt();
7483     // We are interested in small values for ExpectedTC. Skip over those that
7484     // can't fit an unsigned.
7485     if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7486       ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7487       HasExpectedTC = true;
7488     }
7489   }
  // The trip count may not be a compile-time constant (e.g., it is bound by
  // a variable). In that case, check profiling information for an estimate.
7492   if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
7493     auto EstimatedTC = getLoopEstimatedTripCount(L);
7494     if (EstimatedTC) {
7495       ExpectedTC = *EstimatedTC;
7496       HasExpectedTC = true;
7497     }
7498   }
7499   if (!HasExpectedTC) {
7500     ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7501     HasExpectedTC = (ExpectedTC > 0);
7502   }
7503 
7504   if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7505     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7506                       << "This loop is worth vectorizing only if no scalar "
7507                       << "iteration overheads are incurred.");
7508     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7509       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7510     else {
7511       LLVM_DEBUG(dbgs() << "\n");
7512       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7513     }
7514   }
7515 
7516   // Check the function attributes to see if implicit floats are allowed.
7517   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7518   // an integer loop and the vector instructions selected are purely integer
7519   // vector instructions?
7520   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7521     reportVectorizationFailure(
7522         "Can't vectorize when the NoImplicitFloat attribute is used",
7523         "loop not vectorized due to NoImplicitFloat attribute",
7524         "NoImplicitFloat", ORE, L);
7525     Hints.emitRemarkWithHints();
7526     return false;
7527   }
7528 
7529   // Check if the target supports potentially unsafe FP vectorization.
7530   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7531   // for the target we're vectorizing for, to make sure none of the
7532   // additional fp-math flags can help.
7533   if (Hints.isPotentiallyUnsafe() &&
7534       TTI->isFPVectorizationPotentiallyUnsafe()) {
7535     reportVectorizationFailure(
7536         "Potentially unsafe FP op prevents vectorization",
7537         "loop not vectorized due to unsafe FP support.",
7538         "UnsafeFP", ORE, L);
7539     Hints.emitRemarkWithHints();
7540     return false;
7541   }
7542 
7543   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7544   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7545 
7546   // If an override option has been passed in for interleaved accesses, use it.
7547   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7548     UseInterleaved = EnableInterleavedMemAccesses;
7549 
7550   // Analyze interleaved memory accesses.
7551   if (UseInterleaved) {
7552     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7553   }
7554 
7555   // Use the cost model.
7556   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7557                                 F, &Hints, IAI);
7558   CM.collectValuesToIgnore();
7559 
7560   // Use the planner for vectorization.
7561   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7562 
7563   // Get user vectorization factor.
7564   unsigned UserVF = Hints.getWidth();
7565 
7566   // Plan how to best vectorize, return the best VF and its cost.
7567   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7568 
7569   VectorizationFactor VF = VectorizationFactor::Disabled();
7570   unsigned IC = 1;
7571   unsigned UserIC = Hints.getInterleave();
7572 
7573   if (MaybeVF) {
7574     VF = *MaybeVF;
7575     // Select the interleave count.
7576     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7577   }
7578 
7579   // Identify the diagnostic messages that should be produced.
7580   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7581   bool VectorizeLoop = true, InterleaveLoop = true;
7582   if (Requirements.doesNotMeet(F, L, Hints)) {
7583     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7584                          "requirements.\n");
7585     Hints.emitRemarkWithHints();
7586     return false;
7587   }
7588 
7589   if (VF.Width == 1) {
7590     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7591     VecDiagMsg = std::make_pair(
7592         "VectorizationNotBeneficial",
7593         "the cost-model indicates that vectorization is not beneficial");
7594     VectorizeLoop = false;
7595   }
7596 
7597   if (!MaybeVF && UserIC > 1) {
7598     // Tell the user interleaving was avoided up-front, despite being explicitly
7599     // requested.
7600     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7601                          "interleaving should be avoided up front\n");
7602     IntDiagMsg = std::make_pair(
7603         "InterleavingAvoided",
7604         "Ignoring UserIC, because interleaving was avoided up front");
7605     InterleaveLoop = false;
7606   } else if (IC == 1 && UserIC <= 1) {
7607     // Tell the user interleaving is not beneficial.
7608     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7609     IntDiagMsg = std::make_pair(
7610         "InterleavingNotBeneficial",
7611         "the cost-model indicates that interleaving is not beneficial");
7612     InterleaveLoop = false;
7613     if (UserIC == 1) {
7614       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7615       IntDiagMsg.second +=
7616           " and is explicitly disabled or interleave count is set to 1";
7617     }
7618   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but is explicitly disabled.
7620     LLVM_DEBUG(
7621         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7622     IntDiagMsg = std::make_pair(
7623         "InterleavingBeneficialButDisabled",
7624         "the cost-model indicates that interleaving is beneficial "
7625         "but is explicitly disabled or interleave count is set to 1");
7626     InterleaveLoop = false;
7627   }
7628 
7629   // Override IC if user provided an interleave count.
7630   IC = UserIC > 0 ? UserIC : IC;
7631 
7632   // Emit diagnostic messages, if any.
7633   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7634   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7636     ORE->emit([&]() {
7637       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7638                                       L->getStartLoc(), L->getHeader())
7639              << VecDiagMsg.second;
7640     });
7641     ORE->emit([&]() {
7642       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7643                                       L->getStartLoc(), L->getHeader())
7644              << IntDiagMsg.second;
7645     });
7646     return false;
7647   } else if (!VectorizeLoop && InterleaveLoop) {
7648     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7649     ORE->emit([&]() {
7650       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7651                                         L->getStartLoc(), L->getHeader())
7652              << VecDiagMsg.second;
7653     });
7654   } else if (VectorizeLoop && !InterleaveLoop) {
7655     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7656                       << ") in " << DebugLocStr << '\n');
7657     ORE->emit([&]() {
7658       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7659                                         L->getStartLoc(), L->getHeader())
7660              << IntDiagMsg.second;
7661     });
7662   } else if (VectorizeLoop && InterleaveLoop) {
7663     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7664                       << ") in " << DebugLocStr << '\n');
7665     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7666   }
7667 
7668   LVP.setBestPlan(VF.Width, IC);
7669 
7670   using namespace ore;
7671   bool DisableRuntimeUnroll = false;
7672   MDNode *OrigLoopID = L->getLoopID();
7673 
7674   if (!VectorizeLoop) {
7675     assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided not to vectorize the loop (the cost model selected a
    // vector width of one), then interleave it instead.
7678     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7679                                &CM);
7680     LVP.executePlan(Unroller, DT);
7681 
7682     ORE->emit([&]() {
7683       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7684                                 L->getHeader())
7685              << "interleaved loop (interleaved count: "
7686              << NV("InterleaveCount", IC) << ")";
7687     });
7688   } else {
    // If we decided to vectorize the loop, then do it.
7690     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7691                            &LVL, &CM);
7692     LVP.executePlan(LB, DT);
7693     ++LoopsVectorized;
7694 
7695     // Add metadata to disable runtime unrolling a scalar loop when there are
7696     // no runtime checks about strides and memory. A scalar loop that is
7697     // rarely used is not worth unrolling.
7698     if (!LB.areSafetyChecksAdded())
7699       DisableRuntimeUnroll = true;
7700 
7701     // Report the vectorization decision.
7702     ORE->emit([&]() {
7703       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7704                                 L->getHeader())
7705              << "vectorized loop (vectorization width: "
7706              << NV("VectorizationFactor", VF.Width)
7707              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7708     });
7709   }
7710 
7711   Optional<MDNode *> RemainderLoopID =
7712       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7713                                       LLVMLoopVectorizeFollowupEpilogue});
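  // If the original loop carried "llvm.loop.vectorize.followup_epilogue" (or
  // "followup_all") metadata, apply the requested attributes to the remaining
  // scalar loop; otherwise fall back to the default annotations below.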
7714   if (RemainderLoopID.hasValue()) {
7715     L->setLoopID(RemainderLoopID.getValue());
7716   } else {
7717     if (DisableRuntimeUnroll)
7718       AddRuntimeUnrollDisableMetaData(L);
7719 
7720     // Mark the loop as already vectorized to avoid vectorizing again.
7721     Hints.setAlreadyVectorized();
7722   }
7723 
7724   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7725   return true;
7726 }
7727 
7728 bool LoopVectorizePass::runImpl(
7729     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7730     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7731     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7732     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7733     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7734   SE = &SE_;
7735   LI = &LI_;
7736   TTI = &TTI_;
7737   DT = &DT_;
7738   BFI = &BFI_;
7739   TLI = TLI_;
7740   AA = &AA_;
7741   AC = &AC_;
7742   GetLAA = &GetLAA_;
7743   DB = &DB_;
7744   ORE = &ORE_;
7745   PSI = PSI_;
7746 
7747   // Don't attempt if
7748   // 1. the target claims to have no vector registers, and
7749   // 2. interleaving won't help ILP.
7750   //
7751   // The second condition is necessary because, even if the target has no
7752   // vector registers, loop vectorization may still enable scalar
7753   // interleaving.
7754   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7755     return false;
7756 
7757   bool Changed = false;
7758 
7759   // The vectorizer requires loops to be in simplified form.
7760   // Since simplification may add new inner loops, it has to run before the
7761   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7763   // vectorized.
7764   for (auto &L : *LI)
7765     Changed |=
7766         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7767 
7768   // Build up a worklist of inner-loops to vectorize. This is necessary as
7769   // the act of vectorizing or partially unrolling a loop creates new loops
7770   // and can invalidate iterators across the loops.
7771   SmallVector<Loop *, 8> Worklist;
7772 
7773   for (Loop *L : *LI)
7774     collectSupportedLoops(*L, LI, ORE, Worklist);
7775 
7776   LoopsAnalyzed += Worklist.size();
7777 
7778   // Now walk the identified inner loops.
7779   while (!Worklist.empty()) {
7780     Loop *L = Worklist.pop_back_val();
7781 
7782     // For the inner loops we actually process, form LCSSA to simplify the
7783     // transform.
7784     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7785 
7786     Changed |= processLoop(L);
7787   }
7788 
7789   // Process each loop nest in the function.
7790   return Changed;
7791 }
7792 
7793 PreservedAnalyses LoopVectorizePass::run(Function &F,
7794                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
7836 }
7837