1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
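//
// As an illustrative sketch (simplified, not the literal IR this pass emits),
// a scalar loop such as:
//
//   for (i = 0; i < n; i += 1)
//     A[i] = B[i] + 42;
//
// is conceptually rewritten for VF = 4 as one 'wide' loop plus a remainder:
//
//   for (i = 0; i < (n - n % 4); i += 4)
//     A[i:i+3] = B[i:i+3] + <42, 42, 42, 42>;  // one wide iteration
//   // the remaining iterations run in a scalar epilogue loop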
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
// Indicates that an epilogue is undesired; predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185     cl::desc("Indicate that an epilogue is undesired, predication should be "
186              "used instead."));
187 
188 static cl::opt<bool> MaximizeBandwidth(
189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190     cl::desc("Maximize bandwidth when selecting vectorization factor which "
191              "will be determined by the smallest type in loop."));
192 
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in "
             "a loop"));
202 
203 /// We don't interleave loops with a known constant trip count below this
204 /// number.
205 static const unsigned TinyTripCountInterleaveThreshold = 128;
206 
207 static cl::opt<unsigned> ForceTargetNumScalarRegs(
208     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
209     cl::desc("A flag that overrides the target's number of scalar registers."));
210 
211 static cl::opt<unsigned> ForceTargetNumVectorRegs(
212     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
213     cl::desc("A flag that overrides the target's number of vector registers."));
214 
215 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
216     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
217     cl::desc("A flag that overrides the target's max interleave factor for "
218              "scalar loops."));
219 
220 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
221     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
222     cl::desc("A flag that overrides the target's max interleave factor for "
223              "vectorized loops."));
224 
225 static cl::opt<unsigned> ForceTargetInstructionCost(
226     "force-target-instruction-cost", cl::init(0), cl::Hidden,
227     cl::desc("A flag that overrides the target's expected cost for "
228              "an instruction to a single constant value. Mostly "
229              "useful for getting consistent testing."));
230 
231 static cl::opt<unsigned> SmallLoopCost(
232     "small-loop-cost", cl::init(20), cl::Hidden,
233     cl::desc(
234         "The cost of a loop that is considered 'small' by the interleaver."));
235 
236 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
237     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
238     cl::desc("Enable the use of the block frequency analysis to access PGO "
239              "heuristics minimizing code growth in cold regions and being more "
240              "aggressive in hot regions."));
241 
242 // Runtime interleave loops for load/store throughput.
243 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
244     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
245     cl::desc(
246         "Enable runtime interleaving until load/store ports are saturated"));
247 
248 /// The number of stores in a loop that are allowed to need predication.
249 static cl::opt<unsigned> NumberOfStoresToPredicate(
250     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
251     cl::desc("Max number of stores to be predicated behind an if."));
252 
253 static cl::opt<bool> EnableIndVarRegisterHeur(
254     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
255     cl::desc("Count the induction variable only once when interleaving"));
256 
257 static cl::opt<bool> EnableCondStoresVectorization(
258     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
259     cl::desc("Enable if predication of stores during vectorization."));
260 
261 static cl::opt<unsigned> MaxNestedScalarReductionIC(
262     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
263     cl::desc("The maximum interleave count to use when interleaving a scalar "
264              "reduction in a nested loop."));
265 
266 cl::opt<bool> EnableVPlanNativePath(
267     "enable-vplan-native-path", cl::init(false), cl::Hidden,
268     cl::desc("Enable VPlan-native vectorization path with "
269              "support for outer loop vectorization."));
270 
271 // FIXME: Remove this switch once we have divergence analysis. Currently we
272 // assume divergent non-backedge branches when this switch is true.
273 cl::opt<bool> EnableVPlanPredication(
274     "enable-vplan-predication", cl::init(false), cl::Hidden,
275     cl::desc("Enable VPlan-native vectorization path predicator with "
276              "support for outer loop vectorization."));
277 
278 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
280 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
281 // verification of the H-CFGs built.
282 static cl::opt<bool> VPlanBuildStressTest(
283     "vplan-build-stress-test", cl::init(false), cl::Hidden,
284     cl::desc(
285         "Build VPlan for every supported loop nest in the function and bail "
286         "out right after the build (stress test the VPlan H-CFG construction "
287         "in the VPlan-native vectorization path)."));
288 
289 cl::opt<bool> llvm::EnableLoopInterleaving(
290     "interleave-loops", cl::init(true), cl::Hidden,
291     cl::desc("Enable loop interleaving in Loop vectorization passes"));
292 cl::opt<bool> llvm::EnableLoopVectorization(
293     "vectorize-loops", cl::init(true), cl::Hidden,
294     cl::desc("Run the Loop vectorization passes"));
295 
296 /// A helper function for converting Scalar types to vector types.
297 /// If the incoming type is void, we return void. If the VF is 1, we return
298 /// the scalar type.
299 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
300   if (Scalar->isVoidTy() || VF == 1)
301     return Scalar;
302   return VectorType::get(Scalar, VF);
303 }
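
// For example (illustrative only), for the helper above: ToVectorTy(i32, 4)
// yields the LLVM type <4 x i32>, while ToVectorTy(i32, 1) and
// ToVectorTy(void, 4) return the scalar or void type unchanged.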
304 
305 /// A helper function that returns the type of loaded or stored value.
306 static Type *getMemInstValueType(Value *I) {
307   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
308          "Expected Load or Store instruction");
309   if (auto *LI = dyn_cast<LoadInst>(I))
310     return LI->getType();
311   return cast<StoreInst>(I)->getValueOperand()->getType();
312 }
313 
314 /// A helper function that returns true if the given type is irregular. The
315 /// type is irregular if its allocated size doesn't equal the store size of an
316 /// element of the corresponding vector type at the given vectorization factor.
317 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
318   // Determine if an array of VF elements of type Ty is "bitcast compatible"
319   // with a <VF x Ty> vector.
320   if (VF > 1) {
321     auto *VectorTy = VectorType::get(Ty, VF);
322     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
323   }
324 
325   // If the vectorization factor is one, we just check if an array of type Ty
326   // requires padding between elements.
327   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
328 }
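
// Example (illustrative, assuming a typical data layout): i1 has an alloc size
// of 1 byte, so an array of 8 x i1 occupies 8 bytes, while <8 x i1> has a
// store size of only 1 byte. The sizes differ, so i1 is "irregular" at VF = 8
// and a widened access cannot simply be bitcast from the scalar layout.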
329 
330 /// A helper function that returns the reciprocal of the block probability of
331 /// predicated blocks. If we return X, we are assuming the predicated block
332 /// will execute once for every X iterations of the loop header.
333 ///
334 /// TODO: We should use actual block probability here, if available. Currently,
335 ///       we always assume predicated blocks have a 50% chance of executing.
336 static unsigned getReciprocalPredBlockProb() { return 2; }
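
// For example, if an instruction in a predicated block has scalarization cost
// C, the cost model divides C by the value returned here (currently 2) to
// reflect the assumption that the block executes on only half the iterations.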
337 
338 /// A helper function that adds a 'fast' flag to floating-point operations.
339 static Value *addFastMathFlag(Value *V) {
340   if (isa<FPMathOperator>(V))
341     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
342   return V;
343 }
344 
345 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
346   if (isa<FPMathOperator>(V))
347     cast<Instruction>(V)->setFastMathFlags(FMF);
348   return V;
349 }
350 
351 /// A helper function that returns an integer or floating-point constant with
352 /// value C.
353 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
354   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
355                            : ConstantFP::get(Ty, C);
356 }
357 
358 namespace llvm {
359 
360 /// InnerLoopVectorizer vectorizes loops which contain only one basic
361 /// block to a specified vectorization factor (VF).
362 /// This class performs the widening of scalars into vectors, or multiple
363 /// scalars. This class also implements the following features:
364 /// * It inserts an epilogue loop for handling loops that don't have iteration
365 ///   counts that are known to be a multiple of the vectorization factor.
366 /// * It handles the code generation for reduction variables.
367 /// * Scalarization (implementation using scalars) of un-vectorizable
368 ///   instructions.
369 /// InnerLoopVectorizer does not perform any vectorization-legality
370 /// checks, and relies on the caller to check for the different legality
371 /// aspects. The InnerLoopVectorizer relies on the
372 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
374 class InnerLoopVectorizer {
375 public:
376   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
377                       LoopInfo *LI, DominatorTree *DT,
378                       const TargetLibraryInfo *TLI,
379                       const TargetTransformInfo *TTI, AssumptionCache *AC,
380                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
381                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
382                       LoopVectorizationCostModel *CM)
383       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
384         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
385         Builder(PSE.getSE()->getContext()),
386         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
387   virtual ~InnerLoopVectorizer() = default;
388 
389   /// Create a new empty loop. Unlink the old loop and connect the new one.
390   /// Return the pre-header block of the new loop.
391   BasicBlock *createVectorizedLoopSkeleton();
392 
393   /// Widen a single instruction within the innermost loop.
394   void widenInstruction(Instruction &I);
395 
396   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
397   void fixVectorizedLoop();
398 
399   // Return true if any runtime check is added.
400   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
401 
402   /// A type for vectorized values in the new loop. Each value from the
403   /// original loop, when vectorized, is represented by UF vector values in the
404   /// new unrolled loop, where UF is the unroll factor.
405   using VectorParts = SmallVector<Value *, 2>;
406 
407   /// Vectorize a single PHINode in a block. This method handles the induction
408   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
409   /// arbitrary length vectors.
410   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
411 
412   /// A helper function to scalarize a single Instruction in the innermost loop.
413   /// Generates a sequence of scalar instances for each lane between \p MinLane
414   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
416   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
417                             bool IfPredicateInstr);
418 
419   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
420   /// is provided, the integer induction variable will first be truncated to
421   /// the corresponding type.
422   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
423 
424   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
425   /// vector or scalar value on-demand if one is not yet available. When
426   /// vectorizing a loop, we visit the definition of an instruction before its
427   /// uses. When visiting the definition, we either vectorize or scalarize the
428   /// instruction, creating an entry for it in the corresponding map. (In some
429   /// cases, such as induction variables, we will create both vector and scalar
430   /// entries.) Then, as we encounter uses of the definition, we derive values
431   /// for each scalar or vector use unless such a value is already available.
432   /// For example, if we scalarize a definition and one of its uses is vector,
433   /// we build the required vector on-demand with an insertelement sequence
434   /// when visiting the use. Otherwise, if the use is scalar, we can use the
435   /// existing scalar definition.
436   ///
437   /// Return a value in the new loop corresponding to \p V from the original
438   /// loop at unroll index \p Part. If the value has already been vectorized,
439   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
440   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
441   /// a new vector value on-demand by inserting the scalar values into a vector
442   /// with an insertelement sequence. If the value has been neither vectorized
443   /// nor scalarized, it must be loop invariant, so we simply broadcast the
444   /// value into a vector.
445   Value *getOrCreateVectorValue(Value *V, unsigned Part);
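
  // As an illustrative sketch (simplified): if a definition %d was scalarized
  // into lanes %d.0 ... %d.3 and a vector use is later encountered at VF = 4,
  // the required vector is built on demand with an insertelement sequence,
  // roughly:
  //
  //   %v.0 = insertelement <4 x i32> undef, i32 %d.0, i32 0
  //   %v.1 = insertelement <4 x i32> %v.0,  i32 %d.1, i32 1
  //   %v.2 = insertelement <4 x i32> %v.1,  i32 %d.2, i32 2
  //   %v.3 = insertelement <4 x i32> %v.2,  i32 %d.3, i32 3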
446 
447   /// Return a value in the new loop corresponding to \p V from the original
448   /// loop at unroll and vector indices \p Instance. If the value has been
449   /// vectorized but not scalarized, the necessary extractelement instruction
450   /// will be generated.
451   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
452 
453   /// Construct the vector value of a scalarized value \p V one lane at a time.
454   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
455 
456   /// Try to vectorize the interleaved access group that \p Instr belongs to,
457   /// optionally masking the vector operations if \p BlockInMask is non-null.
458   void vectorizeInterleaveGroup(Instruction *Instr,
459                                 VectorParts *BlockInMask = nullptr);
460 
461   /// Vectorize Load and Store instructions, optionally masking the vector
462   /// operations if \p BlockInMask is non-null.
463   void vectorizeMemoryInstruction(Instruction *Instr,
464                                   VectorParts *BlockInMask = nullptr);
465 
466   /// Set the debug location in the builder using the debug location in
467   /// the instruction.
468   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
469 
470   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
472 
473 protected:
474   friend class LoopVectorizationPlanner;
475 
476   /// A small list of PHINodes.
477   using PhiVector = SmallVector<PHINode *, 4>;
478 
479   /// A type for scalarized values in the new loop. Each value from the
480   /// original loop, when scalarized, is represented by UF x VF scalar values
481   /// in the new unrolled loop, where UF is the unroll factor and VF is the
482   /// vectorization factor.
483   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
484 
485   /// Set up the values of the IVs correctly when exiting the vector loop.
486   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
487                     Value *CountRoundDown, Value *EndValue,
488                     BasicBlock *MiddleBlock);
489 
490   /// Create a new induction variable inside L.
491   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
492                                    Value *Step, Instruction *DL);
493 
494   /// Handle all cross-iteration phis in the header.
495   void fixCrossIterationPHIs();
496 
497   /// Fix a first-order recurrence. This is the second phase of vectorizing
498   /// this phi node.
499   void fixFirstOrderRecurrence(PHINode *Phi);
500 
501   /// Fix a reduction cross-iteration phi. This is the second phase of
502   /// vectorizing this phi node.
503   void fixReduction(PHINode *Phi);
504 
  /// The loop exit block may have single-value PHI nodes with some incoming
  /// value. While vectorizing, we only handled real values that were defined
  /// inside the loop, and we should have one value for each predecessor of
  /// its parent basic block. See PR14725.
509   void fixLCSSAPHIs();
510 
511   /// Iteratively sink the scalarized operands of a predicated instruction into
512   /// the block that was created for it.
513   void sinkScalarOperands(Instruction *PredInst);
514 
515   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
516   /// represented as.
517   void truncateToMinimalBitwidths();
518 
519   /// Insert the new loop to the loop hierarchy and pass manager
520   /// and update the analysis passes.
521   void updateAnalysis();
522 
523   /// Create a broadcast instruction. This method generates a broadcast
524   /// instruction (shuffle) for loop invariant values and for the induction
525   /// value. If this is the induction variable then we extend it to N, N+1, ...
526   /// this is needed because each iteration in the loop corresponds to a SIMD
527   /// element.
528   virtual Value *getBroadcastInstrs(Value *V);
529 
530   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
531   /// to each vector element of Val. The sequence starts at StartIndex.
532   /// \p Opcode is relevant for FP induction variable.
533   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
534                                Instruction::BinaryOps Opcode =
535                                Instruction::BinaryOpsEnd);
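
  // Illustrative sketch: for an integer IV with Val = splat(%iv), StartIdx = 0
  // and Step = 1 at VF = 4, the result is roughly
  //   add <4 x i32> %splat.iv, <i32 0, i32 1, i32 2, i32 3>
  // i.e. the lanes hold %iv, %iv+1, %iv+2, %iv+3.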
536 
537   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
538   /// variable on which to base the steps, \p Step is the size of the step, and
539   /// \p EntryVal is the value from the original loop that maps to the steps.
540   /// Note that \p EntryVal doesn't have to be an induction variable - it
541   /// can also be a truncate instruction.
542   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
543                         const InductionDescriptor &ID);
544 
545   /// Create a vector induction phi node based on an existing scalar one. \p
546   /// EntryVal is the value from the original loop that maps to the vector phi
547   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
548   /// truncate instruction, instead of widening the original IV, we widen a
549   /// version of the IV truncated to \p EntryVal's type.
550   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
551                                        Value *Step, Instruction *EntryVal);
552 
553   /// Returns true if an instruction \p I should be scalarized instead of
554   /// vectorized for the chosen vectorization factor.
555   bool shouldScalarizeInstruction(Instruction *I) const;
556 
557   /// Returns true if we should generate a scalar version of \p IV.
558   bool needsScalarInduction(Instruction *IV) const;
559 
560   /// If there is a cast involved in the induction variable \p ID, which should
561   /// be ignored in the vectorized loop body, this function records the
562   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
563   /// cast. We had already proved that the casted Phi is equal to the uncasted
564   /// Phi in the vectorized loop (under a runtime guard), and therefore
565   /// there is no need to vectorize the cast - the same value can be used in the
566   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
568   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
569   ///
570   /// \p EntryVal is the value from the original loop that maps to the vector
571   /// phi node and is used to distinguish what is the IV currently being
572   /// processed - original one (if \p EntryVal is a phi corresponding to the
573   /// original IV) or the "newly-created" one based on the proof mentioned above
574   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
575   /// latter case \p EntryVal is a TruncInst and we must not record anything for
576   /// that IV, but it's error-prone to expect callers of this routine to care
577   /// about that, hence this explicit parameter.
578   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
579                                              const Instruction *EntryVal,
580                                              Value *VectorLoopValue,
581                                              unsigned Part,
582                                              unsigned Lane = UINT_MAX);
583 
584   /// Generate a shuffle sequence that will reverse the vector Vec.
585   virtual Value *reverseVector(Value *Vec);
586 
587   /// Returns (and creates if needed) the original loop trip count.
588   Value *getOrCreateTripCount(Loop *NewLoop);
589 
590   /// Returns (and creates if needed) the trip count of the widened loop.
591   Value *getOrCreateVectorTripCount(Loop *NewLoop);
592 
593   /// Returns a bitcasted value to the requested vector type.
594   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
595   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
596                                 const DataLayout &DL);
597 
598   /// Emit a bypass check to see if the vector trip count is zero, including if
599   /// it overflows.
600   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
601 
602   /// Emit a bypass check to see if all of the SCEV assumptions we've
603   /// had to make are correct.
604   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
605 
606   /// Emit bypass checks to check any memory assumptions we may have made.
607   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
608 
609   /// Compute the transformed value of Index at offset StartValue using step
610   /// StepValue.
611   /// For integer induction, returns StartValue + Index * StepValue.
612   /// For pointer induction, returns StartValue[Index * StepValue].
613   /// FIXME: The newly created binary instructions should contain nsw/nuw
614   /// flags, which can be found from the original scalar operations.
615   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
616                               const DataLayout &DL,
617                               const InductionDescriptor &ID) const;
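
  // For example (illustrative only): for an integer IV with StartValue = 7 and
  // StepValue = 3, Index = 4 is transformed to 7 + 4 * 3 = 19; for a pointer
  // IV, the same Index yields a GEP of 4 * 3 = 12 elements past StartValue.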
618 
619   /// Add additional metadata to \p To that was not present on \p Orig.
620   ///
621   /// Currently this is used to add the noalias annotations based on the
622   /// inserted memchecks.  Use this for instructions that are *cloned* into the
623   /// vector loop.
624   void addNewMetadata(Instruction *To, const Instruction *Orig);
625 
626   /// Add metadata from one instruction to another.
627   ///
628   /// This includes both the original MDs from \p From and additional ones (\see
629   /// addNewMetadata).  Use this for *newly created* instructions in the vector
630   /// loop.
631   void addMetadata(Instruction *To, Instruction *From);
632 
633   /// Similar to the previous function but it adds the metadata to a
634   /// vector of instructions.
635   void addMetadata(ArrayRef<Value *> To, Instruction *From);
636 
637   /// The original loop.
638   Loop *OrigLoop;
639 
640   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
641   /// dynamic knowledge to simplify SCEV expressions and converts them to a
642   /// more usable form.
643   PredicatedScalarEvolution &PSE;
644 
645   /// Loop Info.
646   LoopInfo *LI;
647 
648   /// Dominator Tree.
649   DominatorTree *DT;
650 
651   /// Alias Analysis.
652   AliasAnalysis *AA;
653 
654   /// Target Library Info.
655   const TargetLibraryInfo *TLI;
656 
657   /// Target Transform Info.
658   const TargetTransformInfo *TTI;
659 
660   /// Assumption Cache.
661   AssumptionCache *AC;
662 
663   /// Interface to emit optimization remarks.
664   OptimizationRemarkEmitter *ORE;
665 
666   /// LoopVersioning.  It's only set up (non-null) if memchecks were
667   /// used.
668   ///
669   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
671   std::unique_ptr<LoopVersioning> LVer;
672 
673   /// The vectorization SIMD factor to use. Each vector will have this many
674   /// vector elements.
675   unsigned VF;
676 
677   /// The vectorization unroll factor to use. Each scalar is vectorized to this
678   /// many different vector instructions.
679   unsigned UF;
680 
681   /// The builder that we use
682   IRBuilder<> Builder;
683 
684   // --- Vectorization state ---
685 
686   /// The vector-loop preheader.
687   BasicBlock *LoopVectorPreHeader;
688 
689   /// The scalar-loop preheader.
690   BasicBlock *LoopScalarPreHeader;
691 
692   /// Middle Block between the vector and the scalar.
693   BasicBlock *LoopMiddleBlock;
694 
695   /// The ExitBlock of the scalar loop.
696   BasicBlock *LoopExitBlock;
697 
698   /// The vector loop body.
699   BasicBlock *LoopVectorBody;
700 
701   /// The scalar loop body.
702   BasicBlock *LoopScalarBody;
703 
704   /// A list of all bypass blocks. The first block is the entry of the loop.
705   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
706 
707   /// The new Induction variable which was added to the new block.
708   PHINode *Induction = nullptr;
709 
710   /// The induction variable of the old basic block.
711   PHINode *OldInduction = nullptr;
712 
713   /// Maps values from the original loop to their corresponding values in the
714   /// vectorized loop. A key value can map to either vector values, scalar
715   /// values or both kinds of values, depending on whether the key was
716   /// vectorized and scalarized.
717   VectorizerValueMap VectorLoopValueMap;
718 
719   /// Store instructions that were predicated.
720   SmallVector<Instruction *, 4> PredicatedInstructions;
721 
722   /// Trip count of the original loop.
723   Value *TripCount = nullptr;
724 
725   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
726   Value *VectorTripCount = nullptr;
727 
728   /// The legality analysis.
729   LoopVectorizationLegality *Legal;
730 
  /// The profitability analysis.
732   LoopVectorizationCostModel *Cost;
733 
734   // Record whether runtime checks are added.
735   bool AddedSafetyChecks = false;
736 
737   // Holds the end values for each induction variable. We save the end values
738   // so we can later fix-up the external users of the induction variables.
739   DenseMap<PHINode *, Value *> IVEndValues;
740 
741   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
742   // fixed up at the end of vector code generation.
743   SmallVector<PHINode *, 8> OrigPHIsToFix;
744 };
745 
746 class InnerLoopUnroller : public InnerLoopVectorizer {
747 public:
748   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
749                     LoopInfo *LI, DominatorTree *DT,
750                     const TargetLibraryInfo *TLI,
751                     const TargetTransformInfo *TTI, AssumptionCache *AC,
752                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
753                     LoopVectorizationLegality *LVL,
754                     LoopVectorizationCostModel *CM)
755       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
756                             UnrollFactor, LVL, CM) {}
757 
758 private:
759   Value *getBroadcastInstrs(Value *V) override;
760   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
761                        Instruction::BinaryOps Opcode =
762                        Instruction::BinaryOpsEnd) override;
763   Value *reverseVector(Value *Vec) override;
764 };
765 
766 } // end namespace llvm
767 
/// Look for a meaningful debug location on the instruction or its
769 /// operands.
770 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
771   if (!I)
772     return I;
773 
774   DebugLoc Empty;
775   if (I->getDebugLoc() != Empty)
776     return I;
777 
778   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
779     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
780       if (OpInst->getDebugLoc() != Empty)
781         return OpInst;
782   }
783 
784   return I;
785 }
786 
787 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
788   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
789     const DILocation *DIL = Inst->getDebugLoc();
790     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
791         !isa<DbgInfoIntrinsic>(Inst)) {
792       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
793       if (NewDIL)
794         B.SetCurrentDebugLocation(NewDIL.getValue());
795       else
796         LLVM_DEBUG(dbgs()
797                    << "Failed to create new discriminator: "
798                    << DIL->getFilename() << " Line: " << DIL->getLine());
799     }
800     else
801       B.SetCurrentDebugLocation(DIL);
802   } else
803     B.SetCurrentDebugLocation(DebugLoc());
804 }
805 
806 /// Write a record \p DebugMsg about vectorization failure to the debug
807 /// output stream. If \p I is passed, it is an instruction that prevents
808 /// vectorization.
809 #ifndef NDEBUG
810 static void debugVectorizationFailure(const StringRef DebugMsg,
811     Instruction *I) {
812   dbgs() << "LV: Not vectorizing: " << DebugMsg;
813   if (I != nullptr)
814     dbgs() << " " << *I;
815   else
816     dbgs() << '.';
817   dbgs() << '\n';
818 }
819 #endif
820 
821 /// Create an analysis remark that explains why vectorization failed
822 ///
823 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
/// RemarkName is the identifier for the remark.  If \p I is passed, it is an
825 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
826 /// the location of the remark.  \return the remark object that can be
827 /// streamed to.
828 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
829     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
830   Value *CodeRegion = TheLoop->getHeader();
831   DebugLoc DL = TheLoop->getStartLoc();
832 
833   if (I) {
834     CodeRegion = I->getParent();
835     // If there is no debug location attached to the instruction, revert back to
836     // using the loop's.
837     if (I->getDebugLoc())
838       DL = I->getDebugLoc();
839   }
840 
841   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
842   R << "loop not vectorized: ";
843   return R;
844 }
845 
846 namespace llvm {
847 
848 void reportVectorizationFailure(const StringRef DebugMsg,
849     const StringRef OREMsg, const StringRef ORETag,
850     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
851   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
852   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
853   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
854                 ORETag, TheLoop, I) << OREMsg);
855 }
856 
857 } // end namespace llvm
858 
859 #ifndef NDEBUG
860 /// \return string containing a file name and a line # for the given loop.
861 static std::string getDebugLocString(const Loop *L) {
862   std::string Result;
863   if (L) {
864     raw_string_ostream OS(Result);
865     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
866       LoopDbgLoc.print(OS);
867     else
868       // Just print the module name.
869       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
870     OS.flush();
871   }
872   return Result;
873 }
874 #endif
875 
876 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
877                                          const Instruction *Orig) {
878   // If the loop was versioned with memchecks, add the corresponding no-alias
879   // metadata.
880   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
881     LVer->annotateInstWithNoAlias(To, Orig);
882 }
883 
884 void InnerLoopVectorizer::addMetadata(Instruction *To,
885                                       Instruction *From) {
886   propagateMetadata(To, From);
887   addNewMetadata(To, From);
888 }
889 
890 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
891                                       Instruction *From) {
892   for (Value *V : To) {
893     if (Instruction *I = dyn_cast<Instruction>(V))
894       addMetadata(I, From);
895   }
896 }
897 
898 namespace llvm {
899 
// Hints from the loop vectorization cost model on how the scalar epilogue
// loop should be lowered.
902 enum ScalarEpilogueLowering {
903 
904   // The default: allowing scalar epilogues.
905   CM_ScalarEpilogueAllowed,
906 
907   // Vectorization with OptForSize: don't allow epilogues.
908   CM_ScalarEpilogueNotAllowedOptSize,
909 
  // A special case of vectorization with OptForSize: loops with a very small
911   // trip count are considered for vectorization under OptForSize, thereby
912   // making sure the cost of their loop body is dominant, free of runtime
913   // guards and scalar iteration overheads.
914   CM_ScalarEpilogueNotAllowedLowTripLoop,
915 
916   // Loop hint predicate indicating an epilogue is undesired.
917   CM_ScalarEpilogueNotNeededUsePredicate
918 };
919 
920 /// LoopVectorizationCostModel - estimates the expected speedups due to
921 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
924 /// expected speedup/slowdowns due to the supported instruction set. We use the
925 /// TargetTransformInfo to query the different backends for the cost of
926 /// different operations.
927 class LoopVectorizationCostModel {
928 public:
929   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
930                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
931                              LoopVectorizationLegality *Legal,
932                              const TargetTransformInfo &TTI,
933                              const TargetLibraryInfo *TLI, DemandedBits *DB,
934                              AssumptionCache *AC,
935                              OptimizationRemarkEmitter *ORE, const Function *F,
936                              const LoopVectorizeHints *Hints,
937                              InterleavedAccessInfo &IAI)
938       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
939         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
940         Hints(Hints), InterleaveInfo(IAI) {}
941 
942   /// \return An upper bound for the vectorization factor, or None if
943   /// vectorization and interleaving should be avoided up front.
944   Optional<unsigned> computeMaxVF();
945 
946   /// \return True if runtime checks are required for vectorization, and false
947   /// otherwise.
948   bool runtimeChecksRequired();
949 
950   /// \return The most profitable vectorization factor and the cost of that VF.
951   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
952   /// then this vectorization factor will be selected if vectorization is
953   /// possible.
954   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
955 
956   /// Setup cost-based decisions for user vectorization factor.
957   void selectUserVectorizationFactor(unsigned UserVF) {
958     collectUniformsAndScalars(UserVF);
959     collectInstsToScalarize(UserVF);
960   }
961 
962   /// \return The size (in bits) of the smallest and widest types in the code
963   /// that needs to be vectorized. We ignore values that remain scalar such as
964   /// 64 bit loop indices.
965   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
966 
967   /// \return The desired interleave count.
968   /// If interleave count has been specified by metadata it will be returned.
969   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
970   /// are the selected vectorization factor and the cost of the selected VF.
971   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
972 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
975   /// This function takes cost-based decisions for Load/Store instructions
976   /// and collects them in a map. This decisions map is used for building
977   /// the lists of loop-uniform and loop-scalar instructions.
978   /// The calculated cost is saved with widening decision in order to
979   /// avoid redundant calculations.
980   void setCostBasedWideningDecision(unsigned VF);
981 
982   /// A struct that represents some properties of the register usage
983   /// of a loop.
984   struct RegisterUsage {
985     /// Holds the number of loop invariant values that are used in the loop.
986     unsigned LoopInvariantRegs;
987 
988     /// Holds the maximum number of concurrent live intervals in the loop.
989     unsigned MaxLocalUsers;
990   };
991 
992   /// \return Returns information about the register usages of the loop for the
993   /// given vectorization factors.
994   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
995 
996   /// Collect values we want to ignore in the cost model.
997   void collectValuesToIgnore();
998 
999   /// \returns The smallest bitwidth each instruction can be represented with.
1000   /// The vector equivalents of these instructions should be truncated to this
1001   /// type.
1002   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1003     return MinBWs;
1004   }
1005 
1006   /// \returns True if it is more profitable to scalarize instruction \p I for
1007   /// vectorization factor \p VF.
1008   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1009     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1010 
1011     // Cost model is not run in the VPlan-native path - return conservative
1012     // result until this changes.
1013     if (EnableVPlanNativePath)
1014       return false;
1015 
1016     auto Scalars = InstsToScalarize.find(VF);
1017     assert(Scalars != InstsToScalarize.end() &&
1018            "VF not yet analyzed for scalarization profitability");
1019     return Scalars->second.find(I) != Scalars->second.end();
1020   }
1021 
1022   /// Returns true if \p I is known to be uniform after vectorization.
1023   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1024     if (VF == 1)
1025       return true;
1026 
1027     // Cost model is not run in the VPlan-native path - return conservative
1028     // result until this changes.
1029     if (EnableVPlanNativePath)
1030       return false;
1031 
1032     auto UniformsPerVF = Uniforms.find(VF);
1033     assert(UniformsPerVF != Uniforms.end() &&
1034            "VF not yet analyzed for uniformity");
1035     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1036   }
1037 
1038   /// Returns true if \p I is known to be scalar after vectorization.
1039   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1040     if (VF == 1)
1041       return true;
1042 
1043     // Cost model is not run in the VPlan-native path - return conservative
1044     // result until this changes.
1045     if (EnableVPlanNativePath)
1046       return false;
1047 
1048     auto ScalarsPerVF = Scalars.find(VF);
1049     assert(ScalarsPerVF != Scalars.end() &&
1050            "Scalar values are not calculated for VF");
1051     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1052   }
1053 
1054   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1055   /// for vectorization factor \p VF.
1056   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1057     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1058            !isProfitableToScalarize(I, VF) &&
1059            !isScalarAfterVectorization(I, VF);
1060   }
1061 
1062   /// Decision that was taken during cost calculation for memory instruction.
1063   enum InstWidening {
1064     CM_Unknown,
1065     CM_Widen,         // For consecutive accesses with stride +1.
1066     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1067     CM_Interleave,
1068     CM_GatherScatter,
1069     CM_Scalarize
1070   };
1071 
1072   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1073   /// instruction \p I and vector width \p VF.
1074   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1075                            unsigned Cost) {
1076     assert(VF >= 2 && "Expected VF >=2");
1077     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1078   }
1079 
1080   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1081   /// interleaving group \p Grp and vector width \p VF.
1082   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1083                            InstWidening W, unsigned Cost) {
1084     assert(VF >= 2 && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1086     /// But the cost will be assigned to one instruction only.
1087     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1088       if (auto *I = Grp->getMember(i)) {
1089         if (Grp->getInsertPos() == I)
1090           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1091         else
1092           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1093       }
1094     }
1095   }
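
  // For example, for an interleave group of factor 2 whose insert position is
  // %a with second member %b, the decision W is recorded for both, but the
  // whole-group cost is attached to %a while %b is recorded with cost 0.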
1096 
1097   /// Return the cost model decision for the given instruction \p I and vector
1098   /// width \p VF. Return CM_Unknown if this instruction did not pass
1099   /// through the cost modeling.
1100   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1101     assert(VF >= 2 && "Expected VF >=2");
1102 
1103     // Cost model is not run in the VPlan-native path - return conservative
1104     // result until this changes.
1105     if (EnableVPlanNativePath)
1106       return CM_GatherScatter;
1107 
1108     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1109     auto Itr = WideningDecisions.find(InstOnVF);
1110     if (Itr == WideningDecisions.end())
1111       return CM_Unknown;
1112     return Itr->second.first;
1113   }
1114 
1115   /// Return the vectorization cost for the given instruction \p I and vector
1116   /// width \p VF.
1117   unsigned getWideningCost(Instruction *I, unsigned VF) {
1118     assert(VF >= 2 && "Expected VF >=2");
1119     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1120     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1121            "The cost is not calculated");
1122     return WideningDecisions[InstOnVF].second;
1123   }
1124 
1125   /// Return True if instruction \p I is an optimizable truncate whose operand
1126   /// is an induction variable. Such a truncate will be removed by adding a new
1127   /// induction variable with the destination type.
1128   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1129     // If the instruction is not a truncate, return false.
1130     auto *Trunc = dyn_cast<TruncInst>(I);
1131     if (!Trunc)
1132       return false;
1133 
1134     // Get the source and destination types of the truncate.
1135     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1136     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1137 
1138     // If the truncate is free for the given types, return false. Replacing a
1139     // free truncate with an induction variable would add an induction variable
1140     // update instruction to each iteration of the loop. We exclude from this
1141     // check the primary induction variable since it will need an update
1142     // instruction regardless.
1143     Value *Op = Trunc->getOperand(0);
1144     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1145       return false;
1146 
1147     // If the truncated value is not an induction variable, return false.
1148     return Legal->isInductionPhi(Op);
1149   }
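
  // Illustrative sketch: given a non-free truncate of an induction phi, e.g.
  //   %t = trunc i64 %iv to i32
  // the truncate can be removed by creating a new i32 induction variable that
  // produces the truncated sequence directly (the primary induction variable
  // is accepted even if the truncate would be free, since it needs an update
  // instruction regardless).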
1150 
1151   /// Collects the instructions to scalarize for each predicated instruction in
1152   /// the loop.
1153   void collectInstsToScalarize(unsigned VF);
1154 
1155   /// Collect Uniform and Scalar values for the given \p VF.
1156   /// The sets depend on CM decision for Load/Store instructions
1157   /// that may be vectorized as interleave, gather-scatter or scalarized.
1158   void collectUniformsAndScalars(unsigned VF) {
1159     // Do the analysis once.
1160     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1161       return;
1162     setCostBasedWideningDecision(VF);
1163     collectLoopUniforms(VF);
1164     collectLoopScalars(VF);
1165   }
1166 
1167   /// Returns true if the target machine supports masked store operation
1168   /// for the given \p DataType and kind of access to \p Ptr.
1169   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1170     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1171   }
1172 
1173   /// Returns true if the target machine supports masked load operation
1174   /// for the given \p DataType and kind of access to \p Ptr.
1175   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1176     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1177   }
1178 
1179   /// Returns true if the target machine supports masked scatter operation
1180   /// for the given \p DataType.
1181   bool isLegalMaskedScatter(Type *DataType) {
1182     return TTI.isLegalMaskedScatter(DataType);
1183   }
1184 
1185   /// Returns true if the target machine supports masked gather operation
1186   /// for the given \p DataType.
1187   bool isLegalMaskedGather(Type *DataType) {
1188     return TTI.isLegalMaskedGather(DataType);
1189   }
1190 
1191   /// Returns true if the target machine can represent \p V as a masked gather
1192   /// or scatter operation.
1193   bool isLegalGatherOrScatter(Value *V) {
1194     bool LI = isa<LoadInst>(V);
1195     bool SI = isa<StoreInst>(V);
1196     if (!LI && !SI)
1197       return false;
1198     auto *Ty = getMemInstValueType(V);
1199     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1200   }
1201 
1202   /// Returns true if \p I is an instruction that will be scalarized with
1203   /// predication. Such instructions include conditional stores and
1204   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1207   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
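
  // For example, a udiv that appears in a conditionally executed block must be
  // scalarized and predicated, since executing it speculatively on lanes whose
  // condition is false could divide by zero.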
1208 
1209   // Returns true if \p I is an instruction that will be predicated either
1210   // through scalar predication or masked load/store or masked gather/scatter.
1211   // Superset of instructions that return true for isScalarWithPredication.
1212   bool isPredicatedInst(Instruction *I) {
1213     if (!blockNeedsPredication(I->getParent()))
1214       return false;
1215     // Loads and stores that need some form of masked operation are predicated
1216     // instructions.
1217     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1218       return Legal->isMaskRequired(I);
1219     return isScalarWithPredication(I);
1220   }
1221 
1222   /// Returns true if \p I is a memory instruction with consecutive memory
1223   /// access that can be widened.
1224   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1225 
1226   /// Returns true if \p I is a memory instruction in an interleaved-group
1227   /// of memory accesses that can be vectorized with wide vector loads/stores
1228   /// and shuffles.
1229   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1230 
1231   /// Check if \p Instr belongs to any interleaved access group.
1232   bool isAccessInterleaved(Instruction *Instr) {
1233     return InterleaveInfo.isInterleaved(Instr);
1234   }
1235 
1236   /// Get the interleaved access group that \p Instr belongs to.
1237   const InterleaveGroup<Instruction> *
1238   getInterleavedAccessGroup(Instruction *Instr) {
1239     return InterleaveInfo.getInterleaveGroup(Instr);
1240   }
1241 
1242   /// Returns true if an interleaved group requires a scalar iteration
1243   /// to handle accesses with gaps, and there is nothing preventing us from
1244   /// creating a scalar epilogue.
1245   bool requiresScalarEpilogue() const {
1246     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1247   }
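
  // For illustration: an interleaved load group with a gap, e.g.
  //   for (i = 0; i < n; i += 2)
  //     x += A[i];          // member of index 0; A[i+1] is never read (gap)
  // is widened into loads that also cover the unread A[i+1] slots, so the last
  // vector iteration could read past the end of A. Keeping a scalar epilogue
  // iteration (or masking the gaps) avoids that out-of-bounds access.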
1248 
  /// Returns true if a scalar epilogue is allowed, i.e. it is not disallowed
  /// due to optsize or a loop hint annotation.
1251   bool isScalarEpilogueAllowed() const {
1252     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1253   }
1254 
  /// Returns true if all loop blocks should be masked in order to fold the
  /// loop tail.
1256   bool foldTailByMasking() const { return FoldTailByMasking; }
1257 
1258   bool blockNeedsPredication(BasicBlock *BB) {
1259     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1260   }
1261 
1262   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1263   /// with factor VF.  Return the cost of the instruction, including
1264   /// scalarization overhead if it's needed.
1265   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1266 
  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1272   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1273 
1274 private:
1275   unsigned NumPredStores = 0;
1276 
1277   /// \return An upper bound for the vectorization factor, larger than zero.
1278   /// One is returned if vectorization should best be avoided due to cost.
1279   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1280 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1288   using VectorizationCostTy = std::pair<unsigned, bool>;
1289 
1290   /// Returns the expected execution cost. The unit of the cost does
1291   /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
1294   VectorizationCostTy expectedCost(unsigned VF);
1295 
1296   /// Returns the execution time cost of an instruction for a given vector
1297   /// width. Vector width of one means scalar.
1298   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1299 
1300   /// The cost-computation logic from getInstructionCost which provides
1301   /// the vector type as an output parameter.
1302   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1303 
1304   /// Calculate vectorization cost of memory instruction \p I.
1305   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1306 
1307   /// The cost computation for scalarized memory instruction.
1308   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1309 
1310   /// The cost computation for interleaving group of memory instructions.
1311   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1312 
1313   /// The cost computation for Gather/Scatter instruction.
1314   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1315 
1316   /// The cost computation for widening instruction \p I with consecutive
1317   /// memory access.
1318   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1319 
1320   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1321   /// Load: scalar load + broadcast.
1322   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1323   /// element)
1324   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1325 
1326   /// Estimate the overhead of scalarizing an instruction. This is a
1327   /// convenience wrapper for the type-based getScalarizationOverhead API.
1328   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1329 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1332   bool isConsecutiveLoadOrStore(Instruction *I);
1333 
1334   /// Returns true if an artificially high cost for emulated masked memrefs
1335   /// should be used.
1336   bool useEmulatedMaskMemRefHack(Instruction *I);
1337 
1338   /// Map of scalar integer values to the smallest bitwidth they can be legally
1339   /// represented as. The vector equivalents of these values should be truncated
1340   /// to this type.
1341   MapVector<Instruction *, uint64_t> MinBWs;
1342 
1343   /// A type representing the costs for instructions if they were to be
1344   /// scalarized rather than vectorized. The entries are Instruction-Cost
1345   /// pairs.
1346   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1347 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1350   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1351 
1352   /// Records whether it is allowed to have the original scalar loop execute at
1353   /// least once. This may be needed as a fallback loop in case runtime
1354   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not a multiple of the VF,
1356   /// or as a peel-loop to handle gaps in interleave-groups.
1357   /// Under optsize and when the trip count is very small we don't allow any
1358   /// iterations to execute in the scalar loop.
1359   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1360 
  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
1362   bool FoldTailByMasking = false;
1363 
1364   /// A map holding scalar costs for different vectorization factors. The
1365   /// presence of a cost for an instruction in the mapping indicates that the
1366   /// instruction will be scalarized when vectorizing with the associated
1367   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1368   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1369 
1370   /// Holds the instructions known to be uniform after vectorization.
1371   /// The data is collected per VF.
1372   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1373 
1374   /// Holds the instructions known to be scalar after vectorization.
1375   /// The data is collected per VF.
1376   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1377 
1378   /// Holds the instructions (address computations) that are forced to be
1379   /// scalarized.
1380   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1381 
1382   /// Returns the expected difference in cost from scalarizing the expression
1383   /// feeding a predicated instruction \p PredInst. The instructions to
1384   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1385   /// non-negative return value implies the expression will be scalarized.
1386   /// Currently, only single-use chains are considered for scalarization.
1387   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1388                               unsigned VF);
1389 
1390   /// Collect the instructions that are uniform after vectorization. An
1391   /// instruction is uniform if we represent it with a single scalar value in
1392   /// the vectorized loop corresponding to each vector iteration. Examples of
1393   /// uniform instructions include pointer operands of consecutive or
1394   /// interleaved memory accesses. Note that although uniformity implies an
1395   /// instruction will be scalar, the reverse is not true. In general, a
1396   /// scalarized instruction will be represented by VF scalar values in the
1397   /// vectorized loop, each corresponding to an iteration of the original
1398   /// scalar loop.
1399   void collectLoopUniforms(unsigned VF);
1400 
1401   /// Collect the instructions that are scalar after vectorization. An
1402   /// instruction is scalar if it is known to be uniform or will be scalarized
1403   /// during vectorization. Non-uniform scalarized instructions will be
1404   /// represented by VF values in the vectorized loop, each corresponding to an
1405   /// iteration of the original scalar loop.
1406   void collectLoopScalars(unsigned VF);
1407 
1408   /// Keeps cost model vectorization decision and cost for instructions.
1409   /// Right now it is used for memory instructions only.
1410   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1411                                 std::pair<InstWidening, unsigned>>;
1412 
1413   DecisionList WideningDecisions;
1414 
1415   /// Returns true if \p V is expected to be vectorized and it needs to be
1416   /// extracted.
1417   bool needsExtract(Value *V, unsigned VF) const {
1418     Instruction *I = dyn_cast<Instruction>(V);
1419     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1420       return false;
1421 
1422     // Assume we can vectorize V (and hence we need extraction) if the
1423     // scalars are not computed yet. This can happen, because it is called
1424     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1425     // the scalars are collected. That should be a safe assumption in most
1426     // cases, because we check if the operands have vectorizable types
1427     // beforehand in LoopVectorizationLegality.
1428     return Scalars.find(VF) == Scalars.end() ||
1429            !isScalarAfterVectorization(I, VF);
1430   };
1431 
1432   /// Returns a range containing only operands needing to be extracted.
1433   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1434                                                    unsigned VF) {
1435     return SmallVector<Value *, 4>(make_filter_range(
1436         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1437   }
1438 
1439 public:
1440   /// The loop that we evaluate.
1441   Loop *TheLoop;
1442 
1443   /// Predicated scalar evolution analysis.
1444   PredicatedScalarEvolution &PSE;
1445 
1446   /// Loop Info analysis.
1447   LoopInfo *LI;
1448 
1449   /// Vectorization legality.
1450   LoopVectorizationLegality *Legal;
1451 
1452   /// Vector target information.
1453   const TargetTransformInfo &TTI;
1454 
1455   /// Target Library Info.
1456   const TargetLibraryInfo *TLI;
1457 
1458   /// Demanded bits analysis.
1459   DemandedBits *DB;
1460 
1461   /// Assumption cache.
1462   AssumptionCache *AC;
1463 
1464   /// Interface to emit optimization remarks.
1465   OptimizationRemarkEmitter *ORE;
1466 
1467   const Function *TheFunction;
1468 
1469   /// Loop Vectorize Hint.
1470   const LoopVectorizeHints *Hints;
1471 
  /// The interleave access information contains groups of interleaved accesses
  /// that have the same stride and are close to each other.
1474   InterleavedAccessInfo &InterleaveInfo;
1475 
1476   /// Values to ignore in the cost model.
1477   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1478 
1479   /// Values to ignore in the cost model when VF > 1.
1480   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1481 };
1482 
1483 } // end namespace llvm
1484 
1485 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1486 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the semantics of pragma
// 'clang loop vectorize'. This pragma provides *auto-vectorization hints*
1493 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1494 // provides *explicit vectorization hints* (LV can bypass legal checks and
1495 // assume that vectorization is legal). However, both hints are implemented
1496 // using the same metadata (llvm.loop.vectorize, processed by
1497 // LoopVectorizeHints). This will be fixed in the future when the native IR
1498 // representation for pragma 'omp simd' is introduced.
1499 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1500                                    OptimizationRemarkEmitter *ORE) {
1501   assert(!OuterLp->empty() && "This is not an outer loop");
1502   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1503 
1504   // Only outer loops with an explicit vectorization hint are supported.
1505   // Unannotated outer loops are ignored.
1506   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1507     return false;
1508 
1509   Function *Fn = OuterLp->getHeader()->getParent();
1510   if (!Hints.allowVectorization(Fn, OuterLp,
1511                                 true /*VectorizeOnlyWhenForced*/)) {
1512     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1513     return false;
1514   }
1515 
1516   if (Hints.getInterleave() > 1) {
1517     // TODO: Interleave support is future work.
1518     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1519                          "outer loops.\n");
1520     Hints.emitRemarkWithHints();
1521     return false;
1522   }
1523 
1524   return true;
1525 }
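
// For illustration, an outer loop that isExplicitVecOuterLoop would accept
// (assuming the VPlan-native path is enabled) looks like:
//   #pragma omp simd simdlen(4)
//   for (i = 0; i < n; ++i)     // explicitly annotated outer loop
//     for (j = 0; j < m; ++j)   // inner loop
//       A[i][j] += B[i][j];
// Per the comment above, the vector length (simdlen / vectorize_width) must be
// provided for the hint to be treated as explicit.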
1526 
1527 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1528                                   OptimizationRemarkEmitter *ORE,
1529                                   SmallVectorImpl<Loop *> &V) {
1530   // Collect inner loops and outer loops without irreducible control flow. For
1531   // now, only collect outer loops that have explicit vectorization hints. If we
1532   // are stress testing the VPlan H-CFG construction, we collect the outermost
1533   // loop of every loop nest.
1534   if (L.empty() || VPlanBuildStressTest ||
1535       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1536     LoopBlocksRPO RPOT(&L);
1537     RPOT.perform(LI);
1538     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1539       V.push_back(&L);
1540       // TODO: Collect inner loops inside marked outer loops in case
1541       // vectorization fails for the outer loop. Do not invoke
1542       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1543       // already known to be reducible. We can use an inherited attribute for
1544       // that.
1545       return;
1546     }
1547   }
1548   for (Loop *InnerL : L)
1549     collectSupportedLoops(*InnerL, LI, ORE, V);
1550 }
1551 
1552 namespace {
1553 
1554 /// The LoopVectorize Pass.
1555 struct LoopVectorize : public FunctionPass {
1556   /// Pass identification, replacement for typeid
1557   static char ID;
1558 
1559   LoopVectorizePass Impl;
1560 
1561   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1562                          bool VectorizeOnlyWhenForced = false)
1563       : FunctionPass(ID) {
1564     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1565     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1566     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1567   }
1568 
1569   bool runOnFunction(Function &F) override {
1570     if (skipFunction(F))
1571       return false;
1572 
1573     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1574     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1575     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1576     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1577     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1578     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1579     auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1580     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1581     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1582     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1583     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1584     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1585     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1586 
1587     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1588         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1589 
1590     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1591                         GetLAA, *ORE, PSI);
1592   }
1593 
1594   void getAnalysisUsage(AnalysisUsage &AU) const override {
1595     AU.addRequired<AssumptionCacheTracker>();
1596     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1597     AU.addRequired<DominatorTreeWrapperPass>();
1598     AU.addRequired<LoopInfoWrapperPass>();
1599     AU.addRequired<ScalarEvolutionWrapperPass>();
1600     AU.addRequired<TargetTransformInfoWrapperPass>();
1601     AU.addRequired<AAResultsWrapperPass>();
1602     AU.addRequired<LoopAccessLegacyAnalysis>();
1603     AU.addRequired<DemandedBitsWrapperPass>();
1604     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1605 
1606     // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
1609     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1610     if (!EnableVPlanNativePath) {
1611       AU.addPreserved<LoopInfoWrapperPass>();
1612       AU.addPreserved<DominatorTreeWrapperPass>();
1613     }
1614 
1615     AU.addPreserved<BasicAAWrapperPass>();
1616     AU.addPreserved<GlobalsAAWrapperPass>();
1617     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1618   }
1619 };
1620 
1621 } // end anonymous namespace
1622 
1623 //===----------------------------------------------------------------------===//
1624 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1625 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1626 //===----------------------------------------------------------------------===//
1627 
1628 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1632   Instruction *Instr = dyn_cast<Instruction>(V);
1633   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1634                      (!Instr ||
1635                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1636   // Place the code for broadcasting invariant variables in the new preheader.
1637   IRBuilder<>::InsertPointGuard Guard(Builder);
1638   if (SafeToHoist)
1639     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1640 
1641   // Broadcast the scalar into all locations in the vector.
1642   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1643 
1644   return Shuf;
1645 }
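
// For illustration, assuming a 32-bit scalar %v and VF = 4, the splat created
// above is emitted as the usual insertelement/shufflevector idiom, roughly:
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %v, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer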
1646 
1647 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1648     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1649   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1650          "Expected either an induction phi-node or a truncate of it!");
1651   Value *Start = II.getStartValue();
1652 
1653   // Construct the initial value of the vector IV in the vector loop preheader
1654   auto CurrIP = Builder.saveIP();
1655   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1656   if (isa<TruncInst>(EntryVal)) {
1657     assert(Start->getType()->isIntegerTy() &&
1658            "Truncation requires an integer type");
1659     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1660     Step = Builder.CreateTrunc(Step, TruncType);
1661     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1662   }
1663   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1664   Value *SteppedStart =
1665       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1666 
1667   // We create vector phi nodes for both integer and floating-point induction
1668   // variables. Here, we determine the kind of arithmetic we will perform.
1669   Instruction::BinaryOps AddOp;
1670   Instruction::BinaryOps MulOp;
1671   if (Step->getType()->isIntegerTy()) {
1672     AddOp = Instruction::Add;
1673     MulOp = Instruction::Mul;
1674   } else {
1675     AddOp = II.getInductionOpcode();
1676     MulOp = Instruction::FMul;
1677   }
1678 
1679   // Multiply the vectorization factor by the step using integer or
1680   // floating-point arithmetic as appropriate.
1681   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1682   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1683 
1684   // Create a vector splat to use in the induction update.
1685   //
1686   // FIXME: If the step is non-constant, we create the vector splat with
1687   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1688   //        handle a constant vector splat.
1689   Value *SplatVF = isa<Constant>(Mul)
1690                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1691                        : Builder.CreateVectorSplat(VF, Mul);
1692   Builder.restoreIP(CurrIP);
1693 
1694   // We may need to add the step a number of times, depending on the unroll
1695   // factor. The last of those goes into the PHI.
1696   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1697                                     &*LoopVectorBody->getFirstInsertionPt());
1698   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1699   Instruction *LastInduction = VecInd;
1700   for (unsigned Part = 0; Part < UF; ++Part) {
1701     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1702 
1703     if (isa<TruncInst>(EntryVal))
1704       addMetadata(LastInduction, EntryVal);
1705     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1706 
1707     LastInduction = cast<Instruction>(addFastMathFlag(
1708         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1709     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1710   }
1711 
1712   // Move the last step to the end of the latch block. This ensures consistent
1713   // placement of all induction updates.
1714   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1715   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1716   auto *ICmp = cast<Instruction>(Br->getCondition());
1717   LastInduction->moveBefore(ICmp);
1718   LastInduction->setName("vec.ind.next");
1719 
1720   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1721   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1722 }
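
// For illustration, for an i32 induction starting at 0 with step 1, VF = 4 and
// UF = 2, the code above produces roughly (a sketch):
//   vector.body:
//     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//     ...
//     %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
// where %vec.ind and %step.add are the two unrolled parts of the widened IV.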
1723 
1724 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1725   return Cost->isScalarAfterVectorization(I, VF) ||
1726          Cost->isProfitableToScalarize(I, VF);
1727 }
1728 
1729 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1730   if (shouldScalarizeInstruction(IV))
1731     return true;
1732   auto isScalarInst = [&](User *U) -> bool {
1733     auto *I = cast<Instruction>(U);
1734     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1735   };
1736   return llvm::any_of(IV->users(), isScalarInst);
1737 }
1738 
1739 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1740     const InductionDescriptor &ID, const Instruction *EntryVal,
1741     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1742   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1743          "Expected either an induction phi-node or a truncate of it!");
1744 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1751   if (isa<TruncInst>(EntryVal))
1752     return;
1753 
1754   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1755   if (Casts.empty())
1756     return;
1757   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1759   // induction update chain itself.
1760   Instruction *CastInst = *Casts.begin();
1761   if (Lane < UINT_MAX)
1762     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1763   else
1764     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1765 }
1766 
1767 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1768   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1769          "Primary induction variable must have an integer type");
1770 
1771   auto II = Legal->getInductionVars()->find(IV);
1772   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1773 
1774   auto ID = II->second;
1775   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1776 
1777   // The scalar value to broadcast. This will be derived from the canonical
1778   // induction variable.
1779   Value *ScalarIV = nullptr;
1780 
1781   // The value from the original loop to which we are mapping the new induction
1782   // variable.
1783   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1784 
1785   // True if we have vectorized the induction variable.
1786   auto VectorizedIV = false;
1787 
1788   // Determine if we want a scalar version of the induction variable. This is
1789   // true if the induction variable itself is not widened, or if it has at
1790   // least one user in the loop that is not widened.
1791   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1792 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1795   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1796          "Induction step should be loop invariant");
1797   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1798   Value *Step = nullptr;
1799   if (PSE.getSE()->isSCEVable(IV->getType())) {
1800     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1801     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1802                              LoopVectorPreHeader->getTerminator());
1803   } else {
1804     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1805   }
1806 
1807   // Try to create a new independent vector induction variable. If we can't
1808   // create the phi node, we will splat the scalar induction variable in each
1809   // loop iteration.
1810   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1811     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1812     VectorizedIV = true;
1813   }
1814 
1815   // If we haven't yet vectorized the induction variable, or if we will create
1816   // a scalar one, we need to define the scalar induction variable and step
1817   // values. If we were given a truncation type, truncate the canonical
1818   // induction variable and step. Otherwise, derive these values from the
1819   // induction descriptor.
1820   if (!VectorizedIV || NeedsScalarIV) {
1821     ScalarIV = Induction;
1822     if (IV != OldInduction) {
1823       ScalarIV = IV->getType()->isIntegerTy()
1824                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1825                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1826                                           IV->getType());
1827       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1828       ScalarIV->setName("offset.idx");
1829     }
1830     if (Trunc) {
1831       auto *TruncType = cast<IntegerType>(Trunc->getType());
1832       assert(Step->getType()->isIntegerTy() &&
1833              "Truncation requires an integer step");
1834       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1835       Step = Builder.CreateTrunc(Step, TruncType);
1836     }
1837   }
1838 
1839   // If we haven't yet vectorized the induction variable, splat the scalar
1840   // induction variable, and build the necessary step vectors.
1841   // TODO: Don't do it unless the vectorized IV is really required.
1842   if (!VectorizedIV) {
1843     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1844     for (unsigned Part = 0; Part < UF; ++Part) {
1845       Value *EntryPart =
1846           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1847       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1848       if (Trunc)
1849         addMetadata(EntryPart, Trunc);
1850       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1851     }
1852   }
1853 
1854   // If an induction variable is only used for counting loop iterations or
1855   // calculating addresses, it doesn't need to be widened. Create scalar steps
1856   // that can be used by instructions we will later scalarize. Note that the
1857   // addition of the scalar steps will not increase the number of instructions
1858   // in the loop in the common case prior to InstCombine. We will be trading
1859   // one vector extract for each scalar step.
1860   if (NeedsScalarIV)
1861     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1862 }
1863 
1864 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1865                                           Instruction::BinaryOps BinOp) {
1866   // Create and check the types.
1867   assert(Val->getType()->isVectorTy() && "Must be a vector");
1868   int VLen = Val->getType()->getVectorNumElements();
1869 
1870   Type *STy = Val->getType()->getScalarType();
1871   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1872          "Induction Step must be an integer or FP");
1873   assert(Step->getType() == STy && "Step has wrong type");
1874 
1875   SmallVector<Constant *, 8> Indices;
1876 
1877   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
1879     for (int i = 0; i < VLen; ++i)
1880       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1881 
1882     // Add the consecutive indices to the vector value.
1883     Constant *Cv = ConstantVector::get(Indices);
1884     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1885     Step = Builder.CreateVectorSplat(VLen, Step);
1886     assert(Step->getType() == Val->getType() && "Invalid step vec");
1887     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1888     // which can be found from the original scalar operations.
1889     Step = Builder.CreateMul(Cv, Step);
1890     return Builder.CreateAdd(Val, Step, "induction");
1891   }
1892 
1893   // Floating point induction.
1894   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1895          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
1897   for (int i = 0; i < VLen; ++i)
1898     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1899 
1900   // Add the consecutive indices to the vector value.
1901   Constant *Cv = ConstantVector::get(Indices);
1902 
1903   Step = Builder.CreateVectorSplat(VLen, Step);
1904 
1905   // Floating point operations had to be 'fast' to enable the induction.
1906   FastMathFlags Flags;
1907   Flags.setFast();
1908 
1909   Value *MulOp = Builder.CreateFMul(Cv, Step);
1910   if (isa<Instruction>(MulOp))
    // We have to check because MulOp may have been folded to a constant.
1912     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1913 
1914   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1915   if (isa<Instruction>(BOp))
1916     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1917   return BOp;
1918 }
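
// For illustration, for an integer induction with Val = splat(%base),
// StartIdx = 0, Step = %s and VF = 4, the code above yields roughly:
//   %step.splat = splat of %s into <4 x i32>
//   %offsets    = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %step.splat
//   %induction  = add <4 x i32> %val, %offsets
// i.e. lane L of the result holds %base + (StartIdx + L) * %s (the
// intermediate names are placeholders).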
1919 
1920 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1921                                            Instruction *EntryVal,
1922                                            const InductionDescriptor &ID) {
1923   // We shouldn't have to build scalar steps if we aren't vectorizing.
1924   assert(VF > 1 && "VF should be greater than one");
1925 
  // Get the value type and ensure it and the step have the same type.
1927   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1928   assert(ScalarIVTy == Step->getType() &&
1929          "Val and Step should have the same type");
1930 
1931   // We build scalar steps for both integer and floating-point induction
1932   // variables. Here, we determine the kind of arithmetic we will perform.
1933   Instruction::BinaryOps AddOp;
1934   Instruction::BinaryOps MulOp;
1935   if (ScalarIVTy->isIntegerTy()) {
1936     AddOp = Instruction::Add;
1937     MulOp = Instruction::Mul;
1938   } else {
1939     AddOp = ID.getInductionOpcode();
1940     MulOp = Instruction::FMul;
1941   }
1942 
1943   // Determine the number of scalars we need to generate for each unroll
1944   // iteration. If EntryVal is uniform, we only need to generate the first
1945   // lane. Otherwise, we generate all VF values.
1946   unsigned Lanes =
1947       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1948                                                                          : VF;
1949   // Compute the scalar steps and save the results in VectorLoopValueMap.
1950   for (unsigned Part = 0; Part < UF; ++Part) {
1951     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1952       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1953       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1954       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1955       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1956       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1957     }
1958   }
1959 }
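
// For illustration, for a scalar-used integer IV with step %s, VF = 4 and
// UF = 2, the loop above emits the eight scalar values
//   %scalar.iv + 0 * %s, %scalar.iv + 1 * %s, ..., %scalar.iv + 7 * %s
// or, if EntryVal is uniform after vectorization, only the two lane-zero
// values %scalar.iv + 0 * %s and %scalar.iv + 4 * %s (the names are
// illustrative).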
1960 
1961 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1962   assert(V != Induction && "The new induction variable should not be used.");
1963   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1964   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1965 
1966   // If we have a stride that is replaced by one, do it here. Defer this for
1967   // the VPlan-native path until we start running Legal checks in that path.
1968   if (!EnableVPlanNativePath && Legal->hasStride(V))
1969     V = ConstantInt::get(V->getType(), 1);
1970 
1971   // If we have a vector mapped to this value, return it.
1972   if (VectorLoopValueMap.hasVectorValue(V, Part))
1973     return VectorLoopValueMap.getVectorValue(V, Part);
1974 
1975   // If the value has not been vectorized, check if it has been scalarized
1976   // instead. If it has been scalarized, and we actually need the value in
1977   // vector form, we will construct the vector values on demand.
1978   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1979     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1980 
1981     // If we've scalarized a value, that value should be an instruction.
1982     auto *I = cast<Instruction>(V);
1983 
1984     // If we aren't vectorizing, we can just copy the scalar map values over to
1985     // the vector map.
1986     if (VF == 1) {
1987       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1988       return ScalarValue;
1989     }
1990 
1991     // Get the last scalar instruction we generated for V and Part. If the value
1992     // is known to be uniform after vectorization, this corresponds to lane zero
1993     // of the Part unroll iteration. Otherwise, the last instruction is the one
1994     // we created for the last vector lane of the Part unroll iteration.
1995     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1996     auto *LastInst = cast<Instruction>(
1997         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1998 
1999     // Set the insert point after the last scalarized instruction. This ensures
2000     // the insertelement sequence will directly follow the scalar definitions.
2001     auto OldIP = Builder.saveIP();
2002     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2003     Builder.SetInsertPoint(&*NewIP);
2004 
2005     // However, if we are vectorizing, we need to construct the vector values.
2006     // If the value is known to be uniform after vectorization, we can just
2007     // broadcast the scalar value corresponding to lane zero for each unroll
2008     // iteration. Otherwise, we construct the vector values using insertelement
2009     // instructions. Since the resulting vectors are stored in
2010     // VectorLoopValueMap, we will only generate the insertelements once.
2011     Value *VectorValue = nullptr;
2012     if (Cost->isUniformAfterVectorization(I, VF)) {
2013       VectorValue = getBroadcastInstrs(ScalarValue);
2014       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2015     } else {
2016       // Initialize packing with insertelements to start from undef.
2017       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2018       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2019       for (unsigned Lane = 0; Lane < VF; ++Lane)
2020         packScalarIntoVectorValue(V, {Part, Lane});
2021       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2022     }
2023     Builder.restoreIP(OldIP);
2024     return VectorValue;
2025   }
2026 
2027   // If this scalar is unknown, assume that it is a constant or that it is
2028   // loop invariant. Broadcast V and save the value for future uses.
2029   Value *B = getBroadcastInstrs(V);
2030   VectorLoopValueMap.setVectorValue(V, Part, B);
2031   return B;
2032 }
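
// For illustration, packing the scalarized values of a non-uniform %x for one
// unroll part with VF = 4 produces an insertelement chain, roughly:
//   %p0 = insertelement <4 x i32> undef, i32 %x.lane0, i32 0
//   %p1 = insertelement <4 x i32> %p0, i32 %x.lane1, i32 1
//   %p2 = insertelement <4 x i32> %p1, i32 %x.lane2, i32 2
//   %p3 = insertelement <4 x i32> %p2, i32 %x.lane3, i32 3
// The %x.laneN and %pN names are placeholders for the per-lane clones and the
// intermediate packed values.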
2033 
2034 Value *
2035 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2036                                             const VPIteration &Instance) {
2037   // If the value is not an instruction contained in the loop, it should
2038   // already be scalar.
2039   if (OrigLoop->isLoopInvariant(V))
2040     return V;
2041 
2042   assert(Instance.Lane > 0
2043              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2044              : true && "Uniform values only have lane zero");
2045 
2046   // If the value from the original loop has not been vectorized, it is
2047   // represented by UF x VF scalar values in the new loop. Return the requested
2048   // scalar value.
2049   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2050     return VectorLoopValueMap.getScalarValue(V, Instance);
2051 
2052   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2053   // for the given unroll part. If this entry is not a vector type (i.e., the
2054   // vectorization factor is one), there is no need to generate an
2055   // extractelement instruction.
2056   auto *U = getOrCreateVectorValue(V, Instance.Part);
2057   if (!U->getType()->isVectorTy()) {
2058     assert(VF == 1 && "Value not scalarized has non-vector type");
2059     return U;
2060   }
2061 
2062   // Otherwise, the value from the original loop has been vectorized and is
2063   // represented by UF vector values. Extract and return the requested scalar
2064   // value from the appropriate vector lane.
2065   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2066 }
2067 
2068 void InnerLoopVectorizer::packScalarIntoVectorValue(
2069     Value *V, const VPIteration &Instance) {
2070   assert(V != Induction && "The new induction variable should not be used.");
2071   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2072   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2073 
2074   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2075   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2076   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2077                                             Builder.getInt32(Instance.Lane));
2078   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2079 }
2080 
2081 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2082   assert(Vec->getType()->isVectorTy() && "Invalid type");
2083   SmallVector<Constant *, 8> ShuffleMask;
2084   for (unsigned i = 0; i < VF; ++i)
2085     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2086 
2087   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2088                                      ConstantVector::get(ShuffleMask),
2089                                      "reverse");
2090 }
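
// For illustration, with VF = 4 the reverse shuffle built above is
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// (element type shown as i32 for concreteness).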
2091 
2092 // Return whether we allow using masked interleave-groups (for dealing with
2093 // strided loads/stores that reside in predicated blocks, or for dealing
2094 // with gaps).
2095 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2096   // If an override option has been passed in for interleaved accesses, use it.
2097   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2098     return EnableMaskedInterleavedMemAccesses;
2099 
2100   return TTI.enableMaskedInterleavedAccessVectorization();
2101 }
2102 
2103 // Try to vectorize the interleave group that \p Instr belongs to.
2104 //
2105 // E.g. Translate following interleaved load group (factor = 3):
2106 //   for (i = 0; i < N; i+=3) {
2107 //     R = Pic[i];             // Member of index 0
2108 //     G = Pic[i+1];           // Member of index 1
2109 //     B = Pic[i+2];           // Member of index 2
2110 //     ... // do something to R, G, B
2111 //   }
2112 // To:
2113 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2114 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2115 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2116 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2117 //
2118 // Or translate following interleaved store group (factor = 3):
2119 //   for (i = 0; i < N; i+=3) {
2120 //     ... do something to R, G, B
2121 //     Pic[i]   = R;           // Member of index 0
2122 //     Pic[i+1] = G;           // Member of index 1
2123 //     Pic[i+2] = B;           // Member of index 2
2124 //   }
2125 // To:
2126 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2127 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2128 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2129 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2130 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2131 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2132                                                    VectorParts *BlockInMask) {
2133   const InterleaveGroup<Instruction> *Group =
2134       Cost->getInterleavedAccessGroup(Instr);
2135   assert(Group && "Fail to get an interleaved access group.");
2136 
  // Skip if the current instruction is not the insert position of the group.
2138   if (Instr != Group->getInsertPos())
2139     return;
2140 
2141   const DataLayout &DL = Instr->getModule()->getDataLayout();
2142   Value *Ptr = getLoadStorePointerOperand(Instr);
2143 
2144   // Prepare for the vector type of the interleaved load/store.
2145   Type *ScalarTy = getMemInstValueType(Instr);
2146   unsigned InterleaveFactor = Group->getFactor();
2147   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2148   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2149 
2150   // Prepare for the new pointers.
2151   setDebugLocFromInst(Builder, Ptr);
2152   SmallVector<Value *, 2> NewPtrs;
2153   unsigned Index = Group->getIndex(Instr);
2154 
2155   VectorParts Mask;
2156   bool IsMaskForCondRequired = BlockInMask;
2157   if (IsMaskForCondRequired) {
2158     Mask = *BlockInMask;
2159     // TODO: extend the masked interleaved-group support to reversed access.
2160     assert(!Group->isReverse() && "Reversed masked interleave-group "
2161                                   "not supported.");
2162   }
2163 
2164   // If the group is reverse, adjust the index to refer to the last vector lane
2165   // instead of the first. We adjust the index from the first vector lane,
2166   // rather than directly getting the pointer for lane VF - 1, because the
2167   // pointer operand of the interleaved access is supposed to be uniform. For
2168   // uniform instructions, we're only required to generate a value for the
2169   // first vector lane in each unroll iteration.
2170   if (Group->isReverse())
2171     Index += (VF - 1) * Group->getFactor();
2172 
2173   bool InBounds = false;
2174   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2175     InBounds = gep->isInBounds();
2176 
2177   for (unsigned Part = 0; Part < UF; Part++) {
2178     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2179 
    // Note that the current instruction could have any index in the group.
    // We need to adjust the address to that of the member with index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2191     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2192     if (InBounds)
2193       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2194 
2195     // Cast to the vector pointer type.
2196     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2197   }
2198 
2199   setDebugLocFromInst(Builder, Instr);
2200   Value *UndefVec = UndefValue::get(VecTy);
2201 
2202   Value *MaskForGaps = nullptr;
2203   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2204     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2205     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2206   }
2207 
2208   // Vectorize the interleaved load group.
2209   if (isa<LoadInst>(Instr)) {
2210     // For each unroll part, create a wide load for the group.
2211     SmallVector<Value *, 2> NewLoads;
2212     for (unsigned Part = 0; Part < UF; Part++) {
2213       Instruction *NewLoad;
2214       if (IsMaskForCondRequired || MaskForGaps) {
2215         assert(useMaskedInterleavedAccesses(*TTI) &&
2216                "masked interleaved groups are not allowed.");
2217         Value *GroupMask = MaskForGaps;
2218         if (IsMaskForCondRequired) {
2219           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2220           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2221           Value *ShuffledMask = Builder.CreateShuffleVector(
2222               Mask[Part], Undefs, RepMask, "interleaved.mask");
2223           GroupMask = MaskForGaps
2224                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2225                                                 MaskForGaps)
2226                           : ShuffledMask;
2227         }
2228         NewLoad =
2229             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2230                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2233         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2234                                             Group->getAlignment(), "wide.vec");
2235       Group->addMetadata(NewLoad);
2236       NewLoads.push_back(NewLoad);
2237     }
2238 
2239     // For each member in the group, shuffle out the appropriate data from the
2240     // wide loads.
2241     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2242       Instruction *Member = Group->getMember(I);
2243 
2244       // Skip the gaps in the group.
2245       if (!Member)
2246         continue;
2247 
2248       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2249       for (unsigned Part = 0; Part < UF; Part++) {
2250         Value *StridedVec = Builder.CreateShuffleVector(
2251             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2252 
        // If this member has a different type, cast the result to that type.
2254         if (Member->getType() != ScalarTy) {
2255           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2256           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2257         }
2258 
2259         if (Group->isReverse())
2260           StridedVec = reverseVector(StridedVec);
2261 
2262         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2263       }
2264     }
2265     return;
2266   }
2267 
2268   // The sub vector type for current instruction.
2269   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2270 
2271   // Vectorize the interleaved store group.
2272   for (unsigned Part = 0; Part < UF; Part++) {
2273     // Collect the stored vector from each member.
2274     SmallVector<Value *, 4> StoredVecs;
2275     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2277       Instruction *Member = Group->getMember(i);
2278       assert(Member && "Fail to get a member from an interleaved store group");
2279 
2280       Value *StoredVec = getOrCreateVectorValue(
2281           cast<StoreInst>(Member)->getValueOperand(), Part);
2282       if (Group->isReverse())
2283         StoredVec = reverseVector(StoredVec);
2284 
      // If this member has a different type, cast it to a unified type.
2287       if (StoredVec->getType() != SubVT)
2288         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2289 
2290       StoredVecs.push_back(StoredVec);
2291     }
2292 
2293     // Concatenate all vectors into a wide vector.
2294     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2295 
2296     // Interleave the elements in the wide vector.
2297     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2298     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2299                                               "interleaved.vec");
2300 
2301     Instruction *NewStoreInstr;
2302     if (IsMaskForCondRequired) {
2303       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2304       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2305       Value *ShuffledMask = Builder.CreateShuffleVector(
2306           Mask[Part], Undefs, RepMask, "interleaved.mask");
2307       NewStoreInstr = Builder.CreateMaskedStore(
2308           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
    } else
2311       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2312         Group->getAlignment());
2313 
2314     Group->addMetadata(NewStoreInstr);
2315   }
2316 }
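
// For illustration, when the group is predicated with a block mask
// <m0, m1, m2, m3> (VF = 4, interleave factor = 3), the replicated mask built
// above for the wide masked load/store is
//   <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>
// so every member of one original iteration shares that iteration's predicate;
// for loads it may additionally be AND'ed with the mask for gaps.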
2317 
2318 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2319                                                      VectorParts *BlockInMask) {
2320   // Attempt to issue a wide load.
2321   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2322   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2323 
2324   assert((LI || SI) && "Invalid Load/Store instruction");
2325 
2326   LoopVectorizationCostModel::InstWidening Decision =
2327       Cost->getWideningDecision(Instr, VF);
2328   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2329          "CM decision should be taken at this point");
2330   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2331     return vectorizeInterleaveGroup(Instr);
2332 
2333   Type *ScalarDataTy = getMemInstValueType(Instr);
2334   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2335   Value *Ptr = getLoadStorePointerOperand(Instr);
2336   unsigned Alignment = getLoadStoreAlignment(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2339   const DataLayout &DL = Instr->getModule()->getDataLayout();
2340   if (!Alignment)
2341     Alignment = DL.getABITypeAlignment(ScalarDataTy);
2342   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2343 
2344   // Determine if the pointer operand of the access is either consecutive or
2345   // reverse consecutive.
2346   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2347   bool ConsecutiveStride =
2348       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2349   bool CreateGatherScatter =
2350       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2351 
2352   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2353   // gather/scatter. Otherwise Decision should have been to Scalarize.
2354   assert((ConsecutiveStride || CreateGatherScatter) &&
2355          "The instruction should be scalarized");
2356 
2357   // Handle consecutive loads/stores.
2358   if (ConsecutiveStride)
2359     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2360 
2361   VectorParts Mask;
2362   bool isMaskRequired = BlockInMask;
2363   if (isMaskRequired)
2364     Mask = *BlockInMask;
2365 
2366   bool InBounds = false;
2367   if (auto *gep = dyn_cast<GetElementPtrInst>(
2368           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2369     InBounds = gep->isInBounds();
2370 
2371   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2372     // Calculate the pointer for the specific unroll-part.
2373     GetElementPtrInst *PartPtr = nullptr;
2374 
2375     if (Reverse) {
2376       // If the address is consecutive but reversed, then the
2377       // wide store needs to start at the last vector element.
2378       PartPtr = cast<GetElementPtrInst>(
2379           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2380       PartPtr->setIsInBounds(InBounds);
2381       PartPtr = cast<GetElementPtrInst>(
2382           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2383       PartPtr->setIsInBounds(InBounds);
2384       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2385         Mask[Part] = reverseVector(Mask[Part]);
2386     } else {
2387       PartPtr = cast<GetElementPtrInst>(
2388           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2389       PartPtr->setIsInBounds(InBounds);
2390     }
2391 
2392     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2393   };
2394 
2395   // Handle Stores:
2396   if (SI) {
2397     setDebugLocFromInst(Builder, SI);
2398 
2399     for (unsigned Part = 0; Part < UF; ++Part) {
2400       Instruction *NewSI = nullptr;
2401       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2402       if (CreateGatherScatter) {
2403         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2404         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2405         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2406                                             MaskPart);
2407       } else {
2408         if (Reverse) {
2409           // If we store to reverse consecutive memory locations, then we need
2410           // to reverse the order of elements in the stored value.
2411           StoredVal = reverseVector(StoredVal);
2412           // We don't want to update the value in the map as it might be used in
2413           // another expression. So don't call resetVectorValue(StoredVal).
2414         }
2415         auto *VecPtr = CreateVecPtr(Part, Ptr);
2416         if (isMaskRequired)
2417           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2418                                             Mask[Part]);
2419         else
2420           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2421       }
2422       addMetadata(NewSI, SI);
2423     }
2424     return;
2425   }
2426 
2427   // Handle loads.
2428   assert(LI && "Must have a load instruction");
2429   setDebugLocFromInst(Builder, LI);
2430   for (unsigned Part = 0; Part < UF; ++Part) {
2431     Value *NewLI;
2432     if (CreateGatherScatter) {
2433       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2434       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2435       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2436                                          nullptr, "wide.masked.gather");
2437       addMetadata(NewLI, LI);
2438     } else {
2439       auto *VecPtr = CreateVecPtr(Part, Ptr);
2440       if (isMaskRequired)
2441         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2442                                          UndefValue::get(DataTy),
2443                                          "wide.masked.load");
2444       else
2445         NewLI =
2446             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2447 
2448       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2449       addMetadata(NewLI, LI);
2450       if (Reverse)
2451         NewLI = reverseVector(NewLI);
2452     }
2453     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2454   }
2455 }
2456 
2457 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2458                                                const VPIteration &Instance,
2459                                                bool IfPredicateInstr) {
2460   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2461 
2462   setDebugLocFromInst(Builder, Instr);
2463 
  // Does this instruction return a value?
2465   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2466 
2467   Instruction *Cloned = Instr->clone();
2468   if (!IsVoidRetTy)
2469     Cloned->setName(Instr->getName() + ".cloned");
2470 
2471   // Replace the operands of the cloned instructions with their scalar
2472   // equivalents in the new loop.
2473   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2474     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2475     Cloned->setOperand(op, NewOp);
2476   }
2477   addNewMetadata(Cloned, Instr);
2478 
2479   // Place the cloned scalar in the new loop.
2480   Builder.Insert(Cloned);
2481 
2482   // Add the cloned scalar to the scalar map entry.
2483   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2484 
2485   // If we just cloned a new assumption, add it the assumption cache.
2486   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2487     if (II->getIntrinsicID() == Intrinsic::assume)
2488       AC->registerAssumption(II);
2489 
2490   // End if-block.
2491   if (IfPredicateInstr)
2492     PredicatedInstructions.push_back(Cloned);
2493 }
2494 
2495 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2496                                                       Value *End, Value *Step,
2497                                                       Instruction *DL) {
2498   BasicBlock *Header = L->getHeader();
2499   BasicBlock *Latch = L->getLoopLatch();
2500   // As we're just creating this loop, it's possible no latch exists
2501   // yet. If so, use the header as this will be a single block loop.
2502   if (!Latch)
2503     Latch = Header;
2504 
2505   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2506   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2507   setDebugLocFromInst(Builder, OldInst);
2508   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2509 
2510   Builder.SetInsertPoint(Latch->getTerminator());
2511   setDebugLocFromInst(Builder, OldInst);
2512 
2513   // Create i+1 and fill the PHINode.
2514   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2515   Induction->addIncoming(Start, L->getLoopPreheader());
2516   Induction->addIncoming(Next, Latch);
2517   // Create the compare.
2518   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2519   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2520 
2521   // Now we have two terminators. Remove the old one from the block.
2522   Latch->getTerminator()->eraseFromParent();
2523 
2524   return Induction;
2525 }
2526 
2527 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2528   if (TripCount)
2529     return TripCount;
2530 
2531   assert(L && "Create Trip Count for null loop.");
2532   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2533   // Find the loop boundaries.
2534   ScalarEvolution *SE = PSE.getSE();
2535   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2536   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2537          "Invalid loop count");
2538 
2539   Type *IdxTy = Legal->getWidestInductionType();
2540   assert(IdxTy && "No type for induction");
2541 
2542   // The exit count might have the type of i64 while the phi is i32. This can
2543   // happen if we have an induction variable that is sign extended before the
2544   // compare. The only way that we get a backedge taken count is that the
2545   // induction variable was signed and as such will not overflow. In such a case
2546   // truncation is legal.
2547   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2548       IdxTy->getPrimitiveSizeInBits())
2549     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2550   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2551 
2552   // Get the total trip count from the count by adding 1.
2553   const SCEV *ExitCount = SE->getAddExpr(
2554       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2555 
2556   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2557 
2558   // Expand the trip count and place the new instructions in the preheader.
2559   // Notice that the pre-header does not change, only the loop body.
2560   SCEVExpander Exp(*SE, DL, "induction");
2561 
2562   // Count holds the overall loop count (N).
2563   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2564                                 L->getLoopPreheader()->getTerminator());
2565 
2566   if (TripCount->getType()->isPointerTy())
2567     TripCount =
2568         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2569                                     L->getLoopPreheader()->getTerminator());
2570 
2571   return TripCount;
2572 }
2573 
2574 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2575   if (VectorTripCount)
2576     return VectorTripCount;
2577 
2578   Value *TC = getOrCreateTripCount(L);
2579   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2580 
2581   Type *Ty = TC->getType();
2582   Constant *Step = ConstantInt::get(Ty, VF * UF);
2583 
2584   // If the tail is to be folded by masking, round the number of iterations N
2585   // up to a multiple of Step instead of rounding down. This is done by first
2586   // adding Step-1 and then rounding down. Note that it's ok if this addition
2587   // overflows: the vector induction variable will eventually wrap to zero given
2588   // that it starts at zero and its Step is a power of two; the loop will then
2589   // exit, with the last early-exit vector comparison also producing all-true.
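  // As a purely illustrative example: with VF * UF = 8 and an original trip
  // count N = 13, the rounded-up count is 13 + 7 = 20, and the computation
  // below yields a vector trip count of 16, so the masked vector loop covers
  // all 13 iterations in two vector iterations.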
2590   if (Cost->foldTailByMasking()) {
2591     assert(isPowerOf2_32(VF * UF) &&
2592            "VF*UF must be a power of 2 when folding tail by masking");
2593     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2594   }
2595 
2596   // Now we need to generate the expression for the part of the loop that the
2597   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2598   // iterations are not required for correctness, or N - Step, otherwise. Step
2599   // is equal to the vectorization factor (number of SIMD elements) times the
2600   // unroll factor (number of SIMD instructions).
2601   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2602 
2603   // If there is a non-reversed interleaved group that may speculatively access
2604   // memory out-of-bounds, we need to ensure that there will be at least one
2605   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2606   // the trip count, we set the remainder to be equal to the step. If the step
2607   // does not evenly divide the trip count, no adjustment is necessary since
2608   // there will already be scalar iterations. Note that the minimum iterations
2609   // check ensures that N >= Step.
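  // As a purely illustrative example: with Step = 8 and N = 16 the remainder
  // would be 0, so it is forced to 8 and the vector trip count becomes 8,
  // leaving 8 iterations for the scalar epilogue; with N = 13 the remainder is
  // already 5, so no adjustment is needed.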
2610   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2611     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2612     R = Builder.CreateSelect(IsZero, Step, R);
2613   }
2614 
2615   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2616 
2617   return VectorTripCount;
2618 }
2619 
2620 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2621                                                    const DataLayout &DL) {
2622   // Verify that V is a vector type with same number of elements as DstVTy.
2623   unsigned VF = DstVTy->getNumElements();
2624   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2625   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2626   Type *SrcElemTy = SrcVecTy->getElementType();
2627   Type *DstElemTy = DstVTy->getElementType();
2628   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2629          "Vector elements must have same size");
2630 
2631   // Do a direct cast if element types are castable.
2632   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2633     return Builder.CreateBitOrPointerCast(V, DstVTy);
2634   }
  // V cannot be directly cast to the desired vector type.
2636   // May happen when V is a floating point vector but DstVTy is a vector of
2637   // pointers or vice-versa. Handle this using a two-step bitcast using an
2638   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
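  // For example (illustrative, assuming 32-bit pointers): a <4 x float> value
  // destined for a <4 x i8*> type is first bitcast to <4 x i32> and then
  // converted to the pointer vector.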
2639   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2640          "Only one type should be a pointer type");
2641   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2642          "Only one type should be a floating point type");
2643   Type *IntTy =
2644       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2645   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2646   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2647   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2648 }
2649 
2650 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2651                                                          BasicBlock *Bypass) {
2652   Value *Count = getOrCreateTripCount(L);
2653   BasicBlock *BB = L->getLoopPreheader();
2654   IRBuilder<> Builder(BB->getTerminator());
2655 
2656   // Generate code to check if the loop's trip count is less than VF * UF, or
2657   // equal to it in case a scalar epilogue is required; this implies that the
2658   // vector trip count is zero. This check also covers the case where adding one
2659   // to the backedge-taken count overflowed leading to an incorrect trip count
2660   // of zero. In this case we will also jump to the scalar loop.
2661   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2662                                           : ICmpInst::ICMP_ULT;
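  // As a purely illustrative example: with VF * UF = 8, a trip count of 5
  // satisfies the ULT check and we branch straight to the scalar loop; if a
  // scalar epilogue is required, a trip count of exactly 8 also bypasses the
  // vector loop (ULE), since it would leave no scalar iterations.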
2663 
2664   // If tail is to be folded, vector loop takes care of all iterations.
2665   Value *CheckMinIters = Builder.getFalse();
2666   if (!Cost->foldTailByMasking())
2667     CheckMinIters = Builder.CreateICmp(
2668         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2669         "min.iters.check");
2670 
2671   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2672   // Update dominator tree immediately if the generated block is a
2673   // LoopBypassBlock because SCEV expansions to generate loop bypass
2674   // checks may query it before the current function is finished.
2675   DT->addNewBlock(NewBB, BB);
2676   if (L->getParentLoop())
2677     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2678   ReplaceInstWithInst(BB->getTerminator(),
2679                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2680   LoopBypassBlocks.push_back(BB);
2681 }
2682 
2683 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2684   BasicBlock *BB = L->getLoopPreheader();
2685 
  // Generate the code to check that the SCEV assumptions we made hold.
2687   // We want the new basic block to start at the first instruction in a
2688   // sequence of instructions that form a check.
2689   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2690                    "scev.check");
2691   Value *SCEVCheck =
2692       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2693 
2694   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2695     if (C->isZero())
2696       return;
2697 
2698   assert(!Cost->foldTailByMasking() &&
2699          "Cannot SCEV check stride or overflow when folding tail");
2700   // Create a new block containing the stride check.
2701   BB->setName("vector.scevcheck");
2702   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2703   // Update dominator tree immediately if the generated block is a
2704   // LoopBypassBlock because SCEV expansions to generate loop bypass
2705   // checks may query it before the current function is finished.
2706   DT->addNewBlock(NewBB, BB);
2707   if (L->getParentLoop())
2708     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2709   ReplaceInstWithInst(BB->getTerminator(),
2710                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2711   LoopBypassBlocks.push_back(BB);
2712   AddedSafetyChecks = true;
2713 }
2714 
2715 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2716   // VPlan-native path does not do any analysis for runtime checks currently.
2717   if (EnableVPlanNativePath)
2718     return;
2719 
2720   BasicBlock *BB = L->getLoopPreheader();
2721 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2725   Instruction *FirstCheckInst;
2726   Instruction *MemRuntimeCheck;
2727   std::tie(FirstCheckInst, MemRuntimeCheck) =
2728       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2729   if (!MemRuntimeCheck)
2730     return;
2731 
2732   assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2733   // Create a new block containing the memory check.
2734   BB->setName("vector.memcheck");
2735   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2736   // Update dominator tree immediately if the generated block is a
2737   // LoopBypassBlock because SCEV expansions to generate loop bypass
2738   // checks may query it before the current function is finished.
2739   DT->addNewBlock(NewBB, BB);
2740   if (L->getParentLoop())
2741     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2742   ReplaceInstWithInst(BB->getTerminator(),
2743                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2744   LoopBypassBlocks.push_back(BB);
2745   AddedSafetyChecks = true;
2746 
2747   // We currently don't use LoopVersioning for the actual loop cloning but we
2748   // still use it to add the noalias metadata.
2749   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2750                                            PSE.getSE());
2751   LVer->prepareNoAliasMetadata();
2752 }
2753 
2754 Value *InnerLoopVectorizer::emitTransformedIndex(
2755     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2756     const InductionDescriptor &ID) const {
2757 
2758   SCEVExpander Exp(*SE, DL, "induction");
2759   auto Step = ID.getStep();
2760   auto StartValue = ID.getStartValue();
2761   assert(Index->getType() == Step->getType() &&
2762          "Index type does not match StepValue type");
2763 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
2770   auto CreateAdd = [&B](Value *X, Value *Y) {
2771     assert(X->getType() == Y->getType() && "Types don't match!");
2772     if (auto *CX = dyn_cast<ConstantInt>(X))
2773       if (CX->isZero())
2774         return Y;
2775     if (auto *CY = dyn_cast<ConstantInt>(Y))
2776       if (CY->isZero())
2777         return X;
2778     return B.CreateAdd(X, Y);
2779   };
2780 
2781   auto CreateMul = [&B](Value *X, Value *Y) {
2782     assert(X->getType() == Y->getType() && "Types don't match!");
2783     if (auto *CX = dyn_cast<ConstantInt>(X))
2784       if (CX->isOne())
2785         return Y;
2786     if (auto *CY = dyn_cast<ConstantInt>(Y))
2787       if (CY->isOne())
2788         return X;
2789     return B.CreateMul(X, Y);
2790   };
2791 
2792   switch (ID.getKind()) {
2793   case InductionDescriptor::IK_IntInduction: {
2794     assert(Index->getType() == StartValue->getType() &&
2795            "Index type does not match StartValue type");
2796     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2797       return B.CreateSub(StartValue, Index);
2798     auto *Offset = CreateMul(
2799         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2800     return CreateAdd(StartValue, Offset);
2801   }
2802   case InductionDescriptor::IK_PtrInduction: {
2803     assert(isa<SCEVConstant>(Step) &&
2804            "Expected constant step for pointer induction");
2805     return B.CreateGEP(
2806         StartValue->getType()->getPointerElementType(), StartValue,
2807         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2808                                            &*B.GetInsertPoint())));
2809   }
2810   case InductionDescriptor::IK_FpInduction: {
2811     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2812     auto InductionBinOp = ID.getInductionBinOp();
2813     assert(InductionBinOp &&
2814            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2815             InductionBinOp->getOpcode() == Instruction::FSub) &&
2816            "Original bin op should be defined for FP induction");
2817 
2818     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2819 
2820     // Floating point operations had to be 'fast' to enable the induction.
2821     FastMathFlags Flags;
2822     Flags.setFast();
2823 
2824     Value *MulExp = B.CreateFMul(StepValue, Index);
2825     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may have been folded to a constant.
2827       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2828 
2829     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2830                                "induction");
2831     if (isa<Instruction>(BOp))
2832       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2833 
2834     return BOp;
2835   }
2836   case InductionDescriptor::IK_NoInduction:
2837     return nullptr;
2838   }
2839   llvm_unreachable("invalid enum");
2840 }
2841 
2842 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2843   /*
2844    In this function we generate a new loop. The new loop will contain
2845    the vectorized instructions while the old loop will continue to run the
2846    scalar remainder.
2847 
2848        [ ] <-- loop iteration number check.
2849     /   |
2850    /    v
2851   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2852   |  /  |
2853   | /   v
2854   ||   [ ]     <-- vector pre header.
2855   |/    |
2856   |     v
2857   |    [  ] \
2858   |    [  ]_|   <-- vector loop.
2859   |     |
2860   |     v
2861   |   -[ ]   <--- middle-block.
2862   |  /  |
2863   | /   v
2864   -|- >[ ]     <--- new preheader.
2865    |    |
2866    |    v
2867    |   [ ] \
2868    |   [ ]_|   <-- old scalar loop to handle remainder.
2869     \   |
2870      \  v
2871       >[ ]     <-- exit block.
2872    ...
2873    */
2874 
2875   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2876   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2877   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2878   MDNode *OrigLoopID = OrigLoop->getLoopID();
2879   assert(VectorPH && "Invalid loop structure");
2880   assert(ExitBlock && "Must have an exit block");
2881 
2882   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
2884   // induction variables. In the code below we also support a case where we
2885   // don't have a single induction variable.
2886   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
2889   //   - is an integer
2890   //   - counts from zero, stepping by one
2891   //   - is the size of the widest induction variable type
2892   // then we create a new one.
2893   OldInduction = Legal->getPrimaryInduction();
2894   Type *IdxTy = Legal->getWidestInductionType();
2895 
2896   // Split the single block loop into the two loop structure described above.
2897   BasicBlock *VecBody =
2898       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2899   BasicBlock *MiddleBlock =
2900       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2901   BasicBlock *ScalarPH =
2902       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2903 
2904   // Create and register the new vector loop.
2905   Loop *Lp = LI->AllocateLoop();
2906   Loop *ParentLoop = OrigLoop->getParentLoop();
2907 
2908   // Insert the new loop into the loop nest and register the new basic blocks
2909   // before calling any utilities such as SCEV that require valid LoopInfo.
2910   if (ParentLoop) {
2911     ParentLoop->addChildLoop(Lp);
2912     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2913     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2914   } else {
2915     LI->addTopLevelLoop(Lp);
2916   }
2917   Lp->addBasicBlockToLoop(VecBody, *LI);
2918 
2919   // Find the loop boundaries.
2920   Value *Count = getOrCreateTripCount(Lp);
2921 
2922   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2923 
2924   // Now, compare the new count to zero. If it is zero skip the vector loop and
2925   // jump to the scalar loop. This check also covers the case where the
2926   // backedge-taken count is uint##_max: adding one to it will overflow leading
2927   // to an incorrect trip count of zero. In this (rare) case we will also jump
2928   // to the scalar loop.
2929   emitMinimumIterationCountCheck(Lp, ScalarPH);
2930 
2931   // Generate the code to check any assumptions that we've made for SCEV
2932   // expressions.
2933   emitSCEVChecks(Lp, ScalarPH);
2934 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2938   emitMemRuntimeChecks(Lp, ScalarPH);
2939 
2940   // Generate the induction variable.
2941   // The loop step is equal to the vectorization factor (num of SIMD elements)
2942   // times the unroll factor (num of SIMD instructions).
2943   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2944   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2945   Induction =
2946       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2947                               getDebugLocFromInstOrOperands(OldInduction));
2948 
2949   // We are going to resume the execution of the scalar loop.
2950   // Go over all of the induction variables that we found and fix the
2951   // PHIs that are left in the scalar version of the loop.
2952   // The starting values of PHI nodes depend on the counter of the last
2953   // iteration in the vectorized loop.
2954   // If we come from a bypass edge then we need to start from the original
2955   // start value.
2956 
2957   // This variable saves the new starting index for the scalar loop. It is used
2958   // to test if there are any tail iterations left once the vector loop has
2959   // completed.
2960   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2961   for (auto &InductionEntry : *List) {
2962     PHINode *OrigPhi = InductionEntry.first;
2963     InductionDescriptor II = InductionEntry.second;
2964 
    // Create phi nodes to merge from the backedge-taken check block.
2966     PHINode *BCResumeVal = PHINode::Create(
2967         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2968     // Copy original phi DL over to the new one.
2969     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2970     Value *&EndValue = IVEndValues[OrigPhi];
2971     if (OrigPhi == OldInduction) {
2972       // We know what the end value is.
2973       EndValue = CountRoundDown;
2974     } else {
2975       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2976       Type *StepType = II.getStep()->getType();
2977       Instruction::CastOps CastOp =
2978         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2979       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2980       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2981       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2982       EndValue->setName("ind.end");
2983     }
2984 
2985     // The new PHI merges the original incoming value, in case of a bypass,
2986     // or the value at the end of the vectorized loop.
2987     BCResumeVal->addIncoming(EndValue, MiddleBlock);
2988 
2989     // Fix the scalar body counter (PHI node).
2990     // The old induction's phi node in the scalar body needs the truncated
2991     // value.
2992     for (BasicBlock *BB : LoopBypassBlocks)
2993       BCResumeVal->addIncoming(II.getStartValue(), BB);
2994     OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
2995   }
2996 
2997   // We need the OrigLoop (scalar loop part) latch terminator to help
2998   // produce correct debug info for the middle block BB instructions.
2999   // The legality check stage guarantees that the loop will have a single
3000   // latch.
3001   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3002          "Scalar loop latch terminator isn't a branch");
3003   BranchInst *ScalarLatchBr =
3004       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3005 
3006   // Add a check in the middle block to see if we have completed
3007   // all of the iterations in the first vector loop.
3008   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3009   // If tail is to be folded, we know we don't need to run the remainder.
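  // As a purely illustrative example: with VF * UF = 8, N = 16 gives
  // CountRoundDown = 16 == N, so we branch straight to the exit block, while
  // N = 13 gives CountRoundDown = 8 != 13 and the scalar loop handles the
  // remaining 5 iterations.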
3010   Value *CmpN = Builder.getTrue();
3011   if (!Cost->foldTailByMasking()) {
3012     CmpN =
3013         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3014                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3015 
3016     // Here we use the same DebugLoc as the scalar loop latch branch instead
3017     // of the corresponding compare because they may have ended up with
3018     // different line numbers and we want to avoid awkward line stepping while
3019     // debugging. Eg. if the compare has got a line number inside the loop.
3020     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3021   }
3022 
3023   BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3024   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3025   ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3026 
3027   // Get ready to start creating new instructions into the vectorized body.
3028   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3029 
3030   // Save the state.
3031   LoopVectorPreHeader = Lp->getLoopPreheader();
3032   LoopScalarPreHeader = ScalarPH;
3033   LoopMiddleBlock = MiddleBlock;
3034   LoopExitBlock = ExitBlock;
3035   LoopVectorBody = VecBody;
3036   LoopScalarBody = OldBasicBlock;
3037 
3038   Optional<MDNode *> VectorizedLoopID =
3039       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3040                                       LLVMLoopVectorizeFollowupVectorized});
3041   if (VectorizedLoopID.hasValue()) {
3042     Lp->setLoopID(VectorizedLoopID.getValue());
3043 
3044     // Do not setAlreadyVectorized if loop attributes have been defined
3045     // explicitly.
3046     return LoopVectorPreHeader;
3047   }
3048 
3049   // Keep all loop hints from the original loop on the vector loop (we'll
3050   // replace the vectorizer-specific hints below).
3051   if (MDNode *LID = OrigLoop->getLoopID())
3052     Lp->setLoopID(LID);
3053 
3054   LoopVectorizeHints Hints(Lp, true, *ORE);
3055   Hints.setAlreadyVectorized();
3056 
3057   return LoopVectorPreHeader;
3058 }
3059 
3060 // Fix up external users of the induction variable. At this point, we are
3061 // in LCSSA form, with all external PHIs that use the IV having one input value,
3062 // coming from the remainder loop. We need those PHIs to also have a correct
3063 // value for the IV when arriving directly from the middle block.
3064 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3065                                        const InductionDescriptor &II,
3066                                        Value *CountRoundDown, Value *EndValue,
3067                                        BasicBlock *MiddleBlock) {
3068   // There are two kinds of external IV usages - those that use the value
3069   // computed in the last iteration (the PHI) and those that use the penultimate
3070   // value (the value that feeds into the phi from the loop latch).
3071   // We allow both, but they, obviously, have different values.
3072 
3073   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3074 
3075   DenseMap<Value *, Value *> MissingVals;
3076 
3077   // An external user of the last iteration's value should see the value that
3078   // the remainder loop uses to initialize its own IV.
3079   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3080   for (User *U : PostInc->users()) {
3081     Instruction *UI = cast<Instruction>(U);
3082     if (!OrigLoop->contains(UI)) {
3083       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3084       MissingVals[UI] = EndValue;
3085     }
3086   }
3087 
  // An external user of the penultimate value needs to see EndValue - Step.
3089   // The simplest way to get this is to recompute it from the constituent SCEVs,
3090   // that is Start + (Step * (CRD - 1)).
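  // As a purely illustrative example: an integer IV with Start = 0, Step = 2
  // and a vector trip count CRD = 8 gives such a user 0 + 2 * (8 - 1) = 14,
  // the value the IV held on the last vector iteration before its final
  // increment.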
3091   for (User *U : OrigPhi->users()) {
3092     auto *UI = cast<Instruction>(U);
3093     if (!OrigLoop->contains(UI)) {
3094       const DataLayout &DL =
3095           OrigLoop->getHeader()->getModule()->getDataLayout();
3096       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3097 
3098       IRBuilder<> B(MiddleBlock->getTerminator());
3099       Value *CountMinusOne = B.CreateSub(
3100           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3101       Value *CMO =
3102           !II.getStep()->getType()->isIntegerTy()
3103               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3104                              II.getStep()->getType())
3105               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3106       CMO->setName("cast.cmo");
3107       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3108       Escape->setName("ind.escape");
3109       MissingVals[UI] = Escape;
3110     }
3111   }
3112 
3113   for (auto &I : MissingVals) {
3114     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3116     // that is %IV2 = phi [...], [ %IV1, %latch ]
3117     // In this case, if IV1 has an external use, we need to avoid adding both
3118     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3119     // don't already have an incoming value for the middle block.
3120     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3121       PHI->addIncoming(I.second, MiddleBlock);
3122   }
3123 }
3124 
3125 namespace {
3126 
3127 struct CSEDenseMapInfo {
3128   static bool canHandle(const Instruction *I) {
3129     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3130            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3131   }
3132 
3133   static inline Instruction *getEmptyKey() {
3134     return DenseMapInfo<Instruction *>::getEmptyKey();
3135   }
3136 
3137   static inline Instruction *getTombstoneKey() {
3138     return DenseMapInfo<Instruction *>::getTombstoneKey();
3139   }
3140 
3141   static unsigned getHashValue(const Instruction *I) {
3142     assert(canHandle(I) && "Unknown instruction!");
3143     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3144                                                            I->value_op_end()));
3145   }
3146 
3147   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3148     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3149         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3150       return LHS == RHS;
3151     return LHS->isIdenticalTo(RHS);
3152   }
3153 };
3154 
3155 } // end anonymous namespace
3156 
/// Perform CSE of induction variable instructions.
3158 static void cse(BasicBlock *BB) {
3159   // Perform simple cse.
3160   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3161   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3162     Instruction *In = &*I++;
3163 
3164     if (!CSEDenseMapInfo::canHandle(In))
3165       continue;
3166 
3167     // Check if we can replace this instruction with any of the
3168     // visited instructions.
3169     if (Instruction *V = CSEMap.lookup(In)) {
3170       In->replaceAllUsesWith(V);
3171       In->eraseFromParent();
3172       continue;
3173     }
3174 
3175     CSEMap[In] = In;
3176   }
3177 }
3178 
3179 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3180                                                        unsigned VF,
3181                                                        bool &NeedToScalarize) {
3182   Function *F = CI->getCalledFunction();
3183   StringRef FnName = CI->getCalledFunction()->getName();
3184   Type *ScalarRetTy = CI->getType();
3185   SmallVector<Type *, 4> Tys, ScalarTys;
3186   for (auto &ArgOp : CI->arg_operands())
3187     ScalarTys.push_back(ArgOp->getType());
3188 
3189   // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from them,
3191   // execute VF scalar calls, and then gather the result into the vector return
3192   // value.
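  // As a rough, purely illustrative example: if a scalar call costs 10, VF is
  // 4 and the insert/extract overhead is 6, the scalarized estimate below is
  // 10 * 4 + 6 = 46; a cheaper target-provided vector variant, if available,
  // wins in the comparison at the end of this function.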
3193   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3194   if (VF == 1)
3195     return ScalarCallCost;
3196 
3197   // Compute corresponding vector type for return value and arguments.
3198   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3199   for (Type *ScalarTy : ScalarTys)
3200     Tys.push_back(ToVectorTy(ScalarTy, VF));
3201 
3202   // Compute costs of unpacking argument values for the scalar calls and
3203   // packing the return values to a vector.
3204   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3205 
3206   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3207 
3208   // If we can't emit a vector call for this function, then the currently found
3209   // cost is the cost we need to return.
3210   NeedToScalarize = true;
3211   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3212     return Cost;
3213 
3214   // If the corresponding vector cost is cheaper, return its cost.
3215   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3216   if (VectorCallCost < Cost) {
3217     NeedToScalarize = false;
3218     return VectorCallCost;
3219   }
3220   return Cost;
3221 }
3222 
3223 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3224                                                             unsigned VF) {
3225   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3226   assert(ID && "Expected intrinsic call!");
3227 
3228   FastMathFlags FMF;
3229   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3230     FMF = FPMO->getFastMathFlags();
3231 
3232   SmallVector<Value *, 4> Operands(CI->arg_operands());
3233   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3234 }
3235 
3236 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3237   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3238   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3239   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3240 }
3241 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3242   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3243   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3244   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3245 }
3246 
3247 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3248   // For every instruction `I` in MinBWs, truncate the operands, create a
3249   // truncated version of `I` and reextend its result. InstCombine runs
3250   // later and will remove any ext/trunc pairs.
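  // Shorthand illustration (assuming a value whose MinBWs entry is 8 bits and
  // VF = 4): a <4 x i32> add becomes
  //   %a.tr = trunc <4 x i32> %a to <4 x i8>
  //   %b.tr = trunc <4 x i32> %b to <4 x i8>
  //   %add  = add <4 x i8> %a.tr, %b.tr
  //   %res  = zext <4 x i8> %add to <4 x i32>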
3251   SmallPtrSet<Value *, 4> Erased;
3252   for (const auto &KV : Cost->getMinimalBitwidths()) {
3253     // If the value wasn't vectorized, we must maintain the original scalar
3254     // type. The absence of the value from VectorLoopValueMap indicates that it
3255     // wasn't vectorized.
3256     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3257       continue;
3258     for (unsigned Part = 0; Part < UF; ++Part) {
3259       Value *I = getOrCreateVectorValue(KV.first, Part);
3260       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3261           !isa<Instruction>(I))
3262         continue;
3263       Type *OriginalTy = I->getType();
3264       Type *ScalarTruncatedTy =
3265           IntegerType::get(OriginalTy->getContext(), KV.second);
3266       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3267                                           OriginalTy->getVectorNumElements());
3268       if (TruncatedTy == OriginalTy)
3269         continue;
3270 
3271       IRBuilder<> B(cast<Instruction>(I));
3272       auto ShrinkOperand = [&](Value *V) -> Value * {
3273         if (auto *ZI = dyn_cast<ZExtInst>(V))
3274           if (ZI->getSrcTy() == TruncatedTy)
3275             return ZI->getOperand(0);
3276         return B.CreateZExtOrTrunc(V, TruncatedTy);
3277       };
3278 
3279       // The actual instruction modification depends on the instruction type,
3280       // unfortunately.
3281       Value *NewI = nullptr;
3282       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3283         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3284                              ShrinkOperand(BO->getOperand(1)));
3285 
3286         // Any wrapping introduced by shrinking this operation shouldn't be
3287         // considered undefined behavior. So, we can't unconditionally copy
3288         // arithmetic wrapping flags to NewI.
3289         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3290       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3291         NewI =
3292             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3293                          ShrinkOperand(CI->getOperand(1)));
3294       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3295         NewI = B.CreateSelect(SI->getCondition(),
3296                               ShrinkOperand(SI->getTrueValue()),
3297                               ShrinkOperand(SI->getFalseValue()));
3298       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3299         switch (CI->getOpcode()) {
3300         default:
3301           llvm_unreachable("Unhandled cast!");
3302         case Instruction::Trunc:
3303           NewI = ShrinkOperand(CI->getOperand(0));
3304           break;
3305         case Instruction::SExt:
3306           NewI = B.CreateSExtOrTrunc(
3307               CI->getOperand(0),
3308               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3309           break;
3310         case Instruction::ZExt:
3311           NewI = B.CreateZExtOrTrunc(
3312               CI->getOperand(0),
3313               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3314           break;
3315         }
3316       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3317         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3318         auto *O0 = B.CreateZExtOrTrunc(
3319             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3320         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3321         auto *O1 = B.CreateZExtOrTrunc(
3322             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3323 
3324         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3325       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3326         // Don't do anything with the operands, just extend the result.
3327         continue;
3328       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3329         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3330         auto *O0 = B.CreateZExtOrTrunc(
3331             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3332         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3333         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3334       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3335         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3336         auto *O0 = B.CreateZExtOrTrunc(
3337             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3338         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3339       } else {
3340         // If we don't know what to do, be conservative and don't do anything.
3341         continue;
3342       }
3343 
3344       // Lastly, extend the result.
3345       NewI->takeName(cast<Instruction>(I));
3346       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3347       I->replaceAllUsesWith(Res);
3348       cast<Instruction>(I)->eraseFromParent();
3349       Erased.insert(I);
3350       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3351     }
3352   }
3353 
3354   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3355   for (const auto &KV : Cost->getMinimalBitwidths()) {
3356     // If the value wasn't vectorized, we must maintain the original scalar
3357     // type. The absence of the value from VectorLoopValueMap indicates that it
3358     // wasn't vectorized.
3359     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3360       continue;
3361     for (unsigned Part = 0; Part < UF; ++Part) {
3362       Value *I = getOrCreateVectorValue(KV.first, Part);
3363       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3364       if (Inst && Inst->use_empty()) {
3365         Value *NewI = Inst->getOperand(0);
3366         Inst->eraseFromParent();
3367         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3368       }
3369     }
3370   }
3371 }
3372 
3373 void InnerLoopVectorizer::fixVectorizedLoop() {
3374   // Insert truncates and extends for any truncated instructions as hints to
3375   // InstCombine.
3376   if (VF > 1)
3377     truncateToMinimalBitwidths();
3378 
3379   // Fix widened non-induction PHIs by setting up the PHI operands.
3380   if (OrigPHIsToFix.size()) {
3381     assert(EnableVPlanNativePath &&
3382            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3383     fixNonInductionPHIs();
3384   }
3385 
3386   // At this point every instruction in the original loop is widened to a
3387   // vector form. Now we need to fix the recurrences in the loop. These PHI
3388   // nodes are currently empty because we did not want to introduce cycles.
3389   // This is the second stage of vectorizing recurrences.
3390   fixCrossIterationPHIs();
3391 
3392   // Update the dominator tree.
3393   //
3394   // FIXME: After creating the structure of the new loop, the dominator tree is
3395   //        no longer up-to-date, and it remains that way until we update it
3396   //        here. An out-of-date dominator tree is problematic for SCEV,
3397   //        because SCEVExpander uses it to guide code generation. The
3398   //        vectorizer use SCEVExpanders in several places. Instead, we should
3399   //        keep the dominator tree up-to-date as we go.
3400   updateAnalysis();
3401 
3402   // Fix-up external users of the induction variables.
3403   for (auto &Entry : *Legal->getInductionVars())
3404     fixupIVUsers(Entry.first, Entry.second,
3405                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3406                  IVEndValues[Entry.first], LoopMiddleBlock);
3407 
3408   fixLCSSAPHIs();
3409   for (Instruction *PI : PredicatedInstructions)
3410     sinkScalarOperands(&*PI);
3411 
3412   // Remove redundant induction instructions.
3413   cse(LoopVectorBody);
3414 }
3415 
3416 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3417   // In order to support recurrences we need to be able to vectorize Phi nodes.
3418   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3419   // stage #2: We now need to fix the recurrences by adding incoming edges to
3420   // the currently empty PHI nodes. At this point every instruction in the
3421   // original loop is widened to a vector form so we can use them to construct
3422   // the incoming edges.
3423   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3424     // Handle first-order recurrences and reductions that need to be fixed.
3425     if (Legal->isFirstOrderRecurrence(&Phi))
3426       fixFirstOrderRecurrence(&Phi);
3427     else if (Legal->isReductionVariable(&Phi))
3428       fixReduction(&Phi);
3429   }
3430 }
3431 
3432 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3433   // This is the second phase of vectorizing first-order recurrences. An
3434   // overview of the transformation is described below. Suppose we have the
3435   // following loop.
3436   //
3437   //   for (int i = 0; i < n; ++i)
3438   //     b[i] = a[i] - a[i - 1];
3439   //
3440   // There is a first-order recurrence on "a". For this loop, the shorthand
3441   // scalar IR looks like:
3442   //
3443   //   scalar.ph:
3444   //     s_init = a[-1]
3445   //     br scalar.body
3446   //
3447   //   scalar.body:
3448   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3449   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3450   //     s2 = a[i]
3451   //     b[i] = s2 - s1
3452   //     br cond, scalar.body, ...
3453   //
  // In this example, s1 is a recurrence because its value depends on the
3455   // previous iteration. In the first phase of vectorization, we created a
3456   // temporary value for s1. We now complete the vectorization and produce the
3457   // shorthand vector IR shown below (for VF = 4, UF = 1).
3458   //
3459   //   vector.ph:
3460   //     v_init = vector(..., ..., ..., a[-1])
3461   //     br vector.body
3462   //
3463   //   vector.body
3464   //     i = phi [0, vector.ph], [i+4, vector.body]
3465   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3466   //     v2 = a[i, i+1, i+2, i+3];
3467   //     v3 = vector(v1(3), v2(0, 1, 2))
3468   //     b[i, i+1, i+2, i+3] = v2 - v3
3469   //     br cond, vector.body, middle.block
3470   //
3471   //   middle.block:
3472   //     x = v2(3)
3473   //     br scalar.ph
3474   //
3475   //   scalar.ph:
3476   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3477   //     br scalar.body
3478   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3481 
3482   // Get the original loop preheader and single loop latch.
3483   auto *Preheader = OrigLoop->getLoopPreheader();
3484   auto *Latch = OrigLoop->getLoopLatch();
3485 
3486   // Get the initial and previous values of the scalar recurrence.
3487   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3488   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3489 
3490   // Create a vector from the initial value.
3491   auto *VectorInit = ScalarInit;
3492   if (VF > 1) {
3493     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3494     VectorInit = Builder.CreateInsertElement(
3495         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3496         Builder.getInt32(VF - 1), "vector.recur.init");
3497   }
3498 
3499   // We constructed a temporary phi node in the first phase of vectorization.
3500   // This phi node will eventually be deleted.
3501   Builder.SetInsertPoint(
3502       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3503 
3504   // Create a phi node for the new recurrence. The current value will either be
3505   // the initial value inserted into a vector or loop-varying vector value.
3506   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3507   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3508 
3509   // Get the vectorized previous value of the last part UF - 1. It appears last
3510   // among all unrolled iterations, due to the order of their construction.
3511   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3512 
3513   // Set the insertion point after the previous value if it is an instruction.
3514   // Note that the previous value may have been constant-folded so it is not
3515   // guaranteed to be an instruction in the vector loop. Also, if the previous
3516   // value is a phi node, we should insert after all the phi nodes to avoid
3517   // breaking basic block verification.
3518   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3519       isa<PHINode>(PreviousLastPart))
3520     Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3521   else
3522     Builder.SetInsertPoint(
3523         &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3524 
3525   // We will construct a vector for the recurrence by combining the values for
3526   // the current and previous iterations. This is the required shuffle mask.
3527   SmallVector<Constant *, 8> ShuffleMask(VF);
3528   ShuffleMask[0] = Builder.getInt32(VF - 1);
3529   for (unsigned I = 1; I < VF; ++I)
3530     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
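  // For example, with VF = 4 (as in the shorthand above) the mask is
  // <3, 4, 5, 6>: lane 3 of the incoming vector followed by lanes 0-2 of the
  // current part, i.e. v3 = vector(v1(3), v2(0, 1, 2)).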
3531 
3532   // The vector from which to take the initial value for the current iteration
3533   // (actual or unrolled). Initially, this is the vector phi node.
3534   Value *Incoming = VecPhi;
3535 
3536   // Shuffle the current and previous vector and update the vector parts.
3537   for (unsigned Part = 0; Part < UF; ++Part) {
3538     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3539     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3540     auto *Shuffle =
3541         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3542                                              ConstantVector::get(ShuffleMask))
3543                : Incoming;
3544     PhiPart->replaceAllUsesWith(Shuffle);
3545     cast<Instruction>(PhiPart)->eraseFromParent();
3546     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3547     Incoming = PreviousPart;
3548   }
3549 
3550   // Fix the latch value of the new recurrence in the vector loop.
3551   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3552 
3553   // Extract the last vector element in the middle block. This will be the
3554   // initial value for the recurrence when jumping to the scalar loop.
3555   auto *ExtractForScalar = Incoming;
3556   if (VF > 1) {
3557     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3558     ExtractForScalar = Builder.CreateExtractElement(
3559         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3560   }
  // Extract the second-to-last element in the middle block if the
3562   // Phi is used outside the loop. We need to extract the phi itself
3563   // and not the last element (the phi update in the current iteration). This
3564   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3565   // when the scalar loop is not run at all.
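  // For example, with VF = 4 (as in the shorthand above) this extracts lane 2
  // of Incoming, i.e. v2(2), the value the scalar phi would hold in the last
  // iteration covered by the vector loop.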
3566   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3567   if (VF > 1)
3568     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3569         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorization, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second-to-last element when VF > 1.
3574   else if (UF > 1)
3575     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3576 
3577   // Fix the initial value of the original recurrence in the scalar loop.
3578   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3579   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3580   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3581     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3582     Start->addIncoming(Incoming, BB);
3583   }
3584 
3585   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3586   Phi->setName("scalar.recur");
3587 
3588   // Finally, fix users of the recurrence outside the loop. The users will need
3589   // either the last value of the scalar recurrence or the last value of the
3590   // vector recurrence we extracted in the middle block. Since the loop is in
3591   // LCSSA form, we just need to find all the phi nodes for the original scalar
3592   // recurrence in the exit block, and then add an edge for the middle block.
3593   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3594     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3595       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3596     }
3597   }
3598 }
3599 
3600 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3601   Constant *Zero = Builder.getInt32(0);
3602 
  // Get its reduction variable descriptor.
3604   assert(Legal->isReductionVariable(Phi) &&
3605          "Unable to find the reduction variable");
3606   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3607 
3608   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3609   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3610   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3611   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3612     RdxDesc.getMinMaxRecurrenceKind();
3613   setDebugLocFromInst(Builder, ReductionStartValue);
3614 
3615   // We need to generate a reduction vector from the incoming scalar.
3616   // To do so, we need to generate the 'identity' vector and override
3617   // one of the elements with the incoming scalar reduction. We need
3618   // to do it in the vector-loop preheader.
3619   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3620 
3621   // This is the vector-clone of the value that leaves the loop.
3622   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3623 
  // Find the reduction identity value: zero for addition, or and xor;
  // one for multiplication; -1 (all ones) for and.
3626   Value *Identity;
3627   Value *VectorStart;
3628   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3629       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3631     if (VF == 1) {
3632       VectorStart = Identity = ReductionStartValue;
3633     } else {
3634       VectorStart = Identity =
3635         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3636     }
3637   } else {
3638     // Handle other reduction kinds:
3639     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3640         RK, VecTy->getScalarType());
3641     if (VF == 1) {
3642       Identity = Iden;
3643       // When VF == 1 there is no vector to build; the scalar start value
3644       // is used directly.
3645       VectorStart = ReductionStartValue;
3646     } else {
3647       Identity = ConstantVector::getSplat(VF, Iden);
3648 
3649       // This vector is the Identity vector where the first element is the
3650       // incoming scalar reduction.
3651       VectorStart =
3652         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3653     }
3654   }
3655 
3656   // Fix the vector-loop phi.
3657 
3658   // Reductions do not have to start at zero. They can start with
3659   // any loop invariant values.
3660   BasicBlock *Latch = OrigLoop->getLoopLatch();
3661   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3662   for (unsigned Part = 0; Part < UF; ++Part) {
3663     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3664     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3665     // Make sure to add the reduction start value only to the
3666     // first unroll part.
3667     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3668     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3669     cast<PHINode>(VecRdxPhi)
3670       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3671   }
3672 
3673   // Before each round, move the insertion point right between
3674   // the PHIs and the values we are going to write.
3675   // This allows us to write both PHINodes and the extractelement
3676   // instructions.
3677   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3678 
3679   setDebugLocFromInst(Builder, LoopExitInst);
3680 
3681   // If the vector reduction can be performed in a smaller type, we truncate
3682   // then extend the loop exit value to enable InstCombine to evaluate the
3683   // entire expression in the smaller type.
3684   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3685     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3686     Builder.SetInsertPoint(
3687         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3688     VectorParts RdxParts(UF);
3689     for (unsigned Part = 0; Part < UF; ++Part) {
3690       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3691       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3692       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3693                                         : Builder.CreateZExt(Trunc, VecTy);
3694       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3695            UI != RdxParts[Part]->user_end();)
3696         if (*UI != Trunc) {
3697           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3698           RdxParts[Part] = Extnd;
3699         } else {
3700           ++UI;
3701         }
3702     }
3703     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3704     for (unsigned Part = 0; Part < UF; ++Part) {
3705       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3706       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3707     }
3708   }
3709 
3710   // Reduce all of the unrolled parts into a single vector.
3711   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3712   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3713 
3714   // The middle block terminator has already been assigned a DebugLoc here (the
3715   // OrigLoop's single latch terminator). We want the whole middle block to
3716   // appear to execute on this line because: (a) it is all compiler generated,
3717   // (b) these instructions are always executed after evaluating the latch
3718   // conditional branch, and (c) other passes may add new predecessors which
3719   // terminate on this line. This is the easiest way to ensure we don't
3720   // accidentally cause an extra step back into the loop while debugging.
3721   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3722   for (unsigned Part = 1; Part < UF; ++Part) {
3723     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3724     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3725       // Floating point operations had to be 'fast' to enable the reduction.
3726       ReducedPartRdx = addFastMathFlag(
3727           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3728                               ReducedPartRdx, "bin.rdx"),
3729           RdxDesc.getFastMathFlags());
3730     else
3731       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3732                                       RdxPart);
3733   }
3734 
3735   if (VF > 1) {
3736     bool NoNaN = Legal->hasFunNoNaNAttr();
3737     ReducedPartRdx =
3738         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3739     // If the reduction can be performed in a smaller type, we need to extend
3740     // the reduction to the wider type before we branch to the original loop.
3741     if (Phi->getType() != RdxDesc.getRecurrenceType())
3742       ReducedPartRdx =
3743         RdxDesc.isSigned()
3744         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3745         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3746   }
3747 
3748   // Create a phi node that merges control-flow from the backedge-taken check
3749   // block and the middle block.
3750   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3751                                         LoopScalarPreHeader->getTerminator());
3752   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3753     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3754   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3755 
3756   // Now, we need to fix the users of the reduction variable
3757   // inside and outside of the scalar remainder loop.
3758   // We know that the loop is in LCSSA form. We need to update the
3759   // PHI nodes in the exit blocks.
3760   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3761     // All PHINodes need to have a single entry edge, or two if
3762     // we already fixed them.
3763     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3764 
3765     // We found a reduction value exit-PHI. Update it with the
3766     // incoming bypass edge.
3767     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3768       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3769   } // end of the LCSSA phi scan.
3770 
3771   // Fix the scalar loop reduction variable with the incoming reduction sum
3772   // from the vector body and from the backedge value.
3773   int IncomingEdgeBlockIdx =
3774     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3775   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3776   // Pick the other block.
3777   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3778   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3779   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3780 }
3781 
3782 void InnerLoopVectorizer::fixLCSSAPHIs() {
3783   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3784     if (LCSSAPhi.getNumIncomingValues() == 1) {
3785       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3786       // Non-instruction incoming values will have only one value.
3787       unsigned LastLane = 0;
3788       if (isa<Instruction>(IncomingValue))
3789           LastLane = Cost->isUniformAfterVectorization(
3790                          cast<Instruction>(IncomingValue), VF)
3791                          ? 0
3792                          : VF - 1;
3793       // Can be a loop invariant incoming value or the last scalar value to be
3794       // extracted from the vectorized loop.
3795       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3796       Value *lastIncomingValue =
3797           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3798       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3799     }
3800   }
3801 }
3802 
3803 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3804   // The basic block and loop containing the predicated instruction.
3805   auto *PredBB = PredInst->getParent();
3806   auto *VectorLoop = LI->getLoopFor(PredBB);
3807 
3808   // Initialize a worklist with the operands of the predicated instruction.
3809   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3810 
3811   // Holds instructions that we need to analyze again. An instruction may be
3812   // reanalyzed if we don't yet know if we can sink it or not.
3813   SmallVector<Instruction *, 8> InstsToReanalyze;
3814 
3815   // Returns true if a given use occurs in the predicated block. Phi nodes use
3816   // their operands in their corresponding predecessor blocks.
3817   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3818     auto *I = cast<Instruction>(U.getUser());
3819     BasicBlock *BB = I->getParent();
3820     if (auto *Phi = dyn_cast<PHINode>(I))
3821       BB = Phi->getIncomingBlock(
3822           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3823     return BB == PredBB;
3824   };
3825 
3826   // Iteratively sink the scalarized operands of the predicated instruction
3827   // into the block we created for it. When an instruction is sunk, its
3828   // operands are then added to the worklist. The algorithm ends after a full
3829   // pass through the worklist sinks no instructions.
3830   bool Changed;
3831   do {
3832     // Add the instructions that need to be reanalyzed to the worklist, and
3833     // reset the changed indicator.
3834     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3835     InstsToReanalyze.clear();
3836     Changed = false;
3837 
3838     while (!Worklist.empty()) {
3839       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3840 
3841       // We can't sink an instruction if it is a phi node, is already in the
3842       // predicated block, is not in the loop, or may have side effects.
3843       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3844           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3845         continue;
3846 
3847       // It's legal to sink the instruction if all its uses occur in the
3848       // predicated block. Otherwise, there's nothing to do yet, and we may
3849       // need to reanalyze the instruction.
3850       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3851         InstsToReanalyze.push_back(I);
3852         continue;
3853       }
3854 
3855       // Move the instruction to the beginning of the predicated block, and add
3856       // its operands to the worklist.
3857       I->moveBefore(&*PredBB->getFirstInsertionPt());
3858       Worklist.insert(I->op_begin(), I->op_end());
3859 
3860       // The sinking may have enabled other instructions to be sunk, so we will
3861       // need to iterate.
3862       Changed = true;
3863     }
3864   } while (Changed);
3865 }
3866 
3867 void InnerLoopVectorizer::fixNonInductionPHIs() {
3868   for (PHINode *OrigPhi : OrigPHIsToFix) {
3869     PHINode *NewPhi =
3870         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3871     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3872 
3873     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3874         predecessors(OrigPhi->getParent()));
3875     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3876         predecessors(NewPhi->getParent()));
3877     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3878            "Scalar and Vector BB should have the same number of predecessors");
3879 
3880     // The insertion point in Builder may be invalidated by the time we get
3881     // here. Force the Builder insertion point to something valid so that we do
3882     // not run into issues during insertion point restore in
3883     // getOrCreateVectorValue calls below.
3884     Builder.SetInsertPoint(NewPhi);
3885 
3886     // The predecessor order is preserved and we can rely on mapping between
3887     // scalar and vector block predecessors.
3888     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3889       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3890 
3891       // When looking up the new scalar/vector values to fix up, use incoming
3892       // values from original phi.
3893       Value *ScIncV =
3894           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3895 
3896       // Scalar incoming value may need a broadcast
3897       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3898       NewPhi->addIncoming(NewIncV, NewPredBB);
3899     }
3900   }
3901 }
3902 
3903 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3904                                               unsigned VF) {
3905   PHINode *P = cast<PHINode>(PN);
3906   if (EnableVPlanNativePath) {
3907     // Currently we enter here in the VPlan-native path for non-induction
3908     // PHIs where all control flow is uniform. We simply widen these PHIs.
3909     // Create a vector phi with no operands - the vector phi operands will be
3910     // set at the end of vector code generation.
3911     Type *VecTy =
3912         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3913     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3914     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3915     OrigPHIsToFix.push_back(P);
3916 
3917     return;
3918   }
3919 
3920   assert(PN->getParent() == OrigLoop->getHeader() &&
3921          "Non-header phis should have been handled elsewhere");
3922 
3923   // In order to support recurrences we need to be able to vectorize Phi nodes.
3924   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3925   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3926   // this value when we vectorize all of the instructions that use the PHI.
3927   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3928     for (unsigned Part = 0; Part < UF; ++Part) {
3929       // This is phase one of vectorizing PHIs.
3930       Type *VecTy =
3931           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3932       Value *EntryPart = PHINode::Create(
3933           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3934       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3935     }
3936     return;
3937   }
3938 
3939   setDebugLocFromInst(Builder, P);
3940 
3941   // This PHINode must be an induction variable.
3942   // Make sure that we know about it.
3943   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3944 
3945   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3946   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3947 
3948   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3949   // which can be found from the original scalar operations.
3950   switch (II.getKind()) {
3951   case InductionDescriptor::IK_NoInduction:
3952     llvm_unreachable("Unknown induction");
3953   case InductionDescriptor::IK_IntInduction:
3954   case InductionDescriptor::IK_FpInduction:
3955     llvm_unreachable("Integer/fp induction is handled elsewhere.");
3956   case InductionDescriptor::IK_PtrInduction: {
3957     // Handle the pointer induction variable case.
3958     assert(P->getType()->isPointerTy() && "Unexpected type.");
3959     // This is the normalized GEP that starts counting at zero.
3960     Value *PtrInd = Induction;
3961     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3962     // Determine the number of scalars we need to generate for each unroll
3963     // iteration. If the instruction is uniform, we only need to generate the
3964     // first lane. Otherwise, we generate all VF values.
3965     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3966     // These are the scalar results. Notice that we don't generate vector GEPs
3967     // because scalar GEPs result in better code.
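    // For example (illustrative), with VF == 4 and UF == 2 a non-uniform
    // pointer induction gets eight scalar GEPs, one per (Part, Lane) pair at
    // offset Part * VF + Lane, while a uniform one only gets the lane-zero
    // GEP of each part.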
3968     for (unsigned Part = 0; Part < UF; ++Part) {
3969       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3970         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3971         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3972         Value *SclrGep =
3973             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3974         SclrGep->setName("next.gep");
3975         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3976       }
3977     }
3978     return;
3979   }
3980   }
3981 }
3982 
3983 /// A helper function for checking whether an integer division-related
3984 /// instruction may divide by zero (in which case it must be predicated if
3985 /// executed conditionally in the scalar code).
3986 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3987 /// Non-zero divisors that are not compile-time constants will not be
3988 /// converted into multiplication, so we will still end up scalarizing
3989 /// the division, but can do so without predication.
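/// For example (illustrative), 'udiv %a, 7' can never divide by zero and
/// returns false here, whereas 'udiv %a, %b' (non-constant divisor) or
/// 'udiv %a, 0' returns true.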
3990 static bool mayDivideByZero(Instruction &I) {
3991   assert((I.getOpcode() == Instruction::UDiv ||
3992           I.getOpcode() == Instruction::SDiv ||
3993           I.getOpcode() == Instruction::URem ||
3994           I.getOpcode() == Instruction::SRem) &&
3995          "Unexpected instruction");
3996   Value *Divisor = I.getOperand(1);
3997   auto *CInt = dyn_cast<ConstantInt>(Divisor);
3998   return !CInt || CInt->isZero();
3999 }
4000 
4001 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4002   switch (I.getOpcode()) {
4003   case Instruction::Br:
4004   case Instruction::PHI:
4005     llvm_unreachable("This instruction is handled by a different recipe.");
4006   case Instruction::GetElementPtr: {
4007     // Construct a vector GEP by widening the operands of the scalar GEP as
4008     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4009     // results in a vector of pointers when at least one operand of the GEP
4010     // is vector-typed. Thus, to keep the representation compact, we only use
4011     // vector-typed operands for loop-varying values.
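    // For example (illustrative), 'getelementptr i32, i32* %base, i64 %i'
    // with a loop-invariant %base and a loop-varying %i becomes a GEP with a
    // scalar base and a <VF x i64> index, producing a <VF x i32*> result.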
4012     auto *GEP = cast<GetElementPtrInst>(&I);
4013 
4014     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4015       // If we are vectorizing, but the GEP has only loop-invariant operands,
4016       // the GEP we build (by only using vector-typed operands for
4017       // loop-varying values) would be a scalar pointer. Thus, to ensure we
4018       // produce a vector of pointers, we need to either arbitrarily pick an
4019       // operand to broadcast, or broadcast a clone of the original GEP.
4020       // Here, we broadcast a clone of the original.
4021       //
4022       // TODO: If at some point we decide to scalarize instructions having
4023       //       loop-invariant operands, this special case will no longer be
4024       //       required. We would add the scalarization decision to
4025       //       collectLoopScalars() and teach getVectorValue() to broadcast
4026       //       the lane-zero scalar value.
4027       auto *Clone = Builder.Insert(GEP->clone());
4028       for (unsigned Part = 0; Part < UF; ++Part) {
4029         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4030         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4031         addMetadata(EntryPart, GEP);
4032       }
4033     } else {
4034       // If the GEP has at least one loop-varying operand, we are sure to
4035       // produce a vector of pointers. But if we are only unrolling, we want
4036       // to produce a scalar GEP for each unroll part. Thus, the GEP we
4037       // produce with the code below will be scalar (if VF == 1) or vector
4038       // (otherwise). Note that for the unroll-only case, we still maintain
4039       // values in the vector mapping with initVector, as we do for other
4040       // instructions.
4041       for (unsigned Part = 0; Part < UF; ++Part) {
4042         // The pointer operand of the new GEP. If it's loop-invariant, we
4043         // won't broadcast it.
4044         auto *Ptr =
4045             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4046                 ? GEP->getPointerOperand()
4047                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4048 
4049         // Collect all the indices for the new GEP. If any index is
4050         // loop-invariant, we won't broadcast it.
4051         SmallVector<Value *, 4> Indices;
4052         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4053           if (OrigLoop->isLoopInvariant(U.get()))
4054             Indices.push_back(U.get());
4055           else
4056             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4057         }
4058 
4059         // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4060         // but it should be a vector, otherwise.
4061         auto *NewGEP =
4062             GEP->isInBounds()
4063                 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4064                                             Indices)
4065                 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4066         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4067                "NewGEP is not a pointer vector");
4068         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4069         addMetadata(NewGEP, GEP);
4070       }
4071     }
4072 
4073     break;
4074   }
4075   case Instruction::UDiv:
4076   case Instruction::SDiv:
4077   case Instruction::SRem:
4078   case Instruction::URem:
4079   case Instruction::Add:
4080   case Instruction::FAdd:
4081   case Instruction::Sub:
4082   case Instruction::FSub:
4083   case Instruction::FNeg:
4084   case Instruction::Mul:
4085   case Instruction::FMul:
4086   case Instruction::FDiv:
4087   case Instruction::FRem:
4088   case Instruction::Shl:
4089   case Instruction::LShr:
4090   case Instruction::AShr:
4091   case Instruction::And:
4092   case Instruction::Or:
4093   case Instruction::Xor: {
4094     // Just widen unops and binops.
4095     setDebugLocFromInst(Builder, &I);
4096 
4097     for (unsigned Part = 0; Part < UF; ++Part) {
4098       SmallVector<Value *, 2> Ops;
4099       for (Value *Op : I.operands())
4100         Ops.push_back(getOrCreateVectorValue(Op, Part));
4101 
4102       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4103 
4104       if (auto *VecOp = dyn_cast<Instruction>(V))
4105         VecOp->copyIRFlags(&I);
4106 
4107       // Use this vector value for all users of the original instruction.
4108       VectorLoopValueMap.setVectorValue(&I, Part, V);
4109       addMetadata(V, &I);
4110     }
4111 
4112     break;
4113   }
4114   case Instruction::Select: {
4115     // Widen selects.
4116     // If the selector is loop invariant we can create a select
4117     // instruction with a scalar condition. Otherwise, use vector-select.
4118     auto *SE = PSE.getSE();
4119     bool InvariantCond =
4120         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4121     setDebugLocFromInst(Builder, &I);
4122 
4123     // The condition can be loop invariant but still defined inside the
4124     // loop. This means that we can't just use the original 'cond' value.
4125     // We have to take the 'vectorized' value and pick the first lane.
4126     // InstCombine will make this a no-op.
4127 
4128     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4129 
4130     for (unsigned Part = 0; Part < UF; ++Part) {
4131       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4132       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4133       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4134       Value *Sel =
4135           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4136       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4137       addMetadata(Sel, &I);
4138     }
4139 
4140     break;
4141   }
4142 
4143   case Instruction::ICmp:
4144   case Instruction::FCmp: {
4145     // Widen compares. Generate vector compares.
4146     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4147     auto *Cmp = dyn_cast<CmpInst>(&I);
4148     setDebugLocFromInst(Builder, Cmp);
4149     for (unsigned Part = 0; Part < UF; ++Part) {
4150       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4151       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4152       Value *C = nullptr;
4153       if (FCmp) {
4154         // Propagate fast math flags.
4155         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4156         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4157         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4158       } else {
4159         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4160       }
4161       VectorLoopValueMap.setVectorValue(&I, Part, C);
4162       addMetadata(C, &I);
4163     }
4164 
4165     break;
4166   }
4167 
4168   case Instruction::ZExt:
4169   case Instruction::SExt:
4170   case Instruction::FPToUI:
4171   case Instruction::FPToSI:
4172   case Instruction::FPExt:
4173   case Instruction::PtrToInt:
4174   case Instruction::IntToPtr:
4175   case Instruction::SIToFP:
4176   case Instruction::UIToFP:
4177   case Instruction::Trunc:
4178   case Instruction::FPTrunc:
4179   case Instruction::BitCast: {
4180     auto *CI = dyn_cast<CastInst>(&I);
4181     setDebugLocFromInst(Builder, CI);
4182 
4183     // Vectorize casts.
4184     Type *DestTy =
4185         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4186 
4187     for (unsigned Part = 0; Part < UF; ++Part) {
4188       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4189       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4190       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4191       addMetadata(Cast, &I);
4192     }
4193     break;
4194   }
4195 
4196   case Instruction::Call: {
4197     // Ignore dbg intrinsics.
4198     if (isa<DbgInfoIntrinsic>(I))
4199       break;
4200     setDebugLocFromInst(Builder, &I);
4201 
4202     Module *M = I.getParent()->getParent()->getParent();
4203     auto *CI = cast<CallInst>(&I);
4204 
4205     StringRef FnName = CI->getCalledFunction()->getName();
4206     Function *F = CI->getCalledFunction();
4207     Type *RetTy = ToVectorTy(CI->getType(), VF);
4208     SmallVector<Type *, 4> Tys;
4209     for (Value *ArgOperand : CI->arg_operands())
4210       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4211 
4212     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4213 
4214     // The flag indicates whether we vectorize this call with an intrinsic or
4215     // with an ordinary library call, i.e., whether the intrinsic call is
4216     // cheaper than the library call.
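    // For example (illustrative), at VF == 4 a recognized call to 'sqrtf' may
    // become either the 'llvm.sqrt.v4f32' intrinsic or a call to a vectorized
    // math-library routine, whichever the cost model finds cheaper.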
4217     bool NeedToScalarize;
4218     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4219     bool UseVectorIntrinsic =
4220         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4221     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4222            "Instruction should be scalarized elsewhere.");
4223 
4224     for (unsigned Part = 0; Part < UF; ++Part) {
4225       SmallVector<Value *, 4> Args;
4226       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4227         Value *Arg = CI->getArgOperand(i);
4228         // Some intrinsics have a scalar argument - don't replace it with a
4229         // vector.
4230         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4231           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4232         Args.push_back(Arg);
4233       }
4234 
4235       Function *VectorF;
4236       if (UseVectorIntrinsic) {
4237         // Use vector version of the intrinsic.
4238         Type *TysForDecl[] = {CI->getType()};
4239         if (VF > 1)
4240           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4241         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4242       } else {
4243         // Use vector version of the library call.
4244         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4245         assert(!VFnName.empty() && "Vector function name is empty.");
4246         VectorF = M->getFunction(VFnName);
4247         if (!VectorF) {
4248           // Generate a declaration
4249           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4250           VectorF =
4251               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4252           VectorF->copyAttributesFrom(F);
4253         }
4254       }
4255       assert(VectorF && "Can't create vector function.");
4256 
4257       SmallVector<OperandBundleDef, 1> OpBundles;
4258       CI->getOperandBundlesAsDefs(OpBundles);
4259       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4260 
4261       if (isa<FPMathOperator>(V))
4262         V->copyFastMathFlags(CI);
4263 
4264       VectorLoopValueMap.setVectorValue(&I, Part, V);
4265       addMetadata(V, &I);
4266     }
4267 
4268     break;
4269   }
4270 
4271   default:
4272     // This instruction is not vectorized by simple widening.
4273     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4274     llvm_unreachable("Unhandled instruction!");
4275   } // end of switch.
4276 }
4277 
4278 void InnerLoopVectorizer::updateAnalysis() {
4279   // Forget the original basic block.
4280   PSE.getSE()->forgetLoop(OrigLoop);
4281 
4282   // DT is not kept up-to-date for outer loop vectorization
4283   if (EnableVPlanNativePath)
4284     return;
4285 
4286   // Update the dominator tree information.
4287   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4288          "Entry does not dominate exit.");
4289 
4290   DT->addNewBlock(LoopMiddleBlock,
4291                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4292   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4293   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4294   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4295   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4296 }
4297 
4298 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4299   // We should not collect Scalars more than once per VF. Right now, this
4300   // function is called from collectUniformsAndScalars(), which already does
4301   // this check. Collecting Scalars for VF=1 does not make any sense.
4302   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4303          "This function should not be visited twice for the same VF");
4304 
4305   SmallSetVector<Instruction *, 8> Worklist;
4306 
4307   // These sets are used to seed the analysis with pointers used by memory
4308   // accesses that will remain scalar.
4309   SmallSetVector<Instruction *, 8> ScalarPtrs;
4310   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4311 
4312   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4313   // The pointer operands of loads and stores will be scalar as long as the
4314   // memory access is not a gather or scatter operation. The value operand of a
4315   // store will remain scalar if the store is scalarized.
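  // For example (illustrative), if a store will be scalarized, both its
  // pointer and its stored value are scalar uses; if a load will be widened
  // as a consecutive access, its pointer operand is still a scalar use,
  // whereas a gather keeps a vector of pointers.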
4316   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4317     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4318     assert(WideningDecision != CM_Unknown &&
4319            "Widening decision should be ready at this moment");
4320     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4321       if (Ptr == Store->getValueOperand())
4322         return WideningDecision == CM_Scalarize;
4323     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4324            "Ptr is neither a value nor a pointer operand");
4325     return WideningDecision != CM_GatherScatter;
4326   };
4327 
4328   // A helper that returns true if the given value is a bitcast or
4329   // getelementptr instruction contained in the loop.
4330   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4331     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4332             isa<GetElementPtrInst>(V)) &&
4333            !TheLoop->isLoopInvariant(V);
4334   };
4335 
4336   // A helper that evaluates a memory access's use of a pointer. If the use
4337   // will be a scalar use, and the pointer is only used by memory accesses, we
4338   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4339   // PossibleNonScalarPtrs.
4340   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4341     // We only care about bitcast and getelementptr instructions contained in
4342     // the loop.
4343     if (!isLoopVaryingBitCastOrGEP(Ptr))
4344       return;
4345 
4346     // If the pointer has already been identified as scalar (e.g., if it was
4347     // also identified as uniform), there's nothing to do.
4348     auto *I = cast<Instruction>(Ptr);
4349     if (Worklist.count(I))
4350       return;
4351 
4352     // If the use of the pointer will be a scalar use, and all users of the
4353     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4354     // place the pointer in PossibleNonScalarPtrs.
4355     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4356           return isa<LoadInst>(U) || isa<StoreInst>(U);
4357         }))
4358       ScalarPtrs.insert(I);
4359     else
4360       PossibleNonScalarPtrs.insert(I);
4361   };
4362 
4363   // We seed the scalars analysis with three classes of instructions: (1)
4364   // instructions marked uniform-after-vectorization, (2) bitcast and
4365   // getelementptr instructions used by memory accesses requiring a scalar use,
4366   // and (3) pointer induction variables and their update instructions (we
4367   // currently only scalarize these).
4368   //
4369   // (1) Add to the worklist all instructions that have been identified as
4370   // uniform-after-vectorization.
4371   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4372 
4373   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4374   // memory accesses requiring a scalar use. The pointer operands of loads and
4375   // stores will be scalar as long as the memory access is not a gather or
4376   // scatter operation. The value operand of a store will remain scalar if the
4377   // store is scalarized.
4378   for (auto *BB : TheLoop->blocks())
4379     for (auto &I : *BB) {
4380       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4381         evaluatePtrUse(Load, Load->getPointerOperand());
4382       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4383         evaluatePtrUse(Store, Store->getPointerOperand());
4384         evaluatePtrUse(Store, Store->getValueOperand());
4385       }
4386     }
4387   for (auto *I : ScalarPtrs)
4388     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4389       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4390       Worklist.insert(I);
4391     }
4392 
4393   // (3) Add to the worklist all pointer induction variables and their update
4394   // instructions.
4395   //
4396   // TODO: Once we are able to vectorize pointer induction variables we should
4397   //       no longer insert them into the worklist here.
4398   auto *Latch = TheLoop->getLoopLatch();
4399   for (auto &Induction : *Legal->getInductionVars()) {
4400     auto *Ind = Induction.first;
4401     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4402     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4403       continue;
4404     Worklist.insert(Ind);
4405     Worklist.insert(IndUpdate);
4406     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4407     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4408                       << "\n");
4409   }
4410 
4411   // Insert the forced scalars.
4412   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4413   // induction variable when the PHI user is scalarized.
4414   auto ForcedScalar = ForcedScalars.find(VF);
4415   if (ForcedScalar != ForcedScalars.end())
4416     for (auto *I : ForcedScalar->second)
4417       Worklist.insert(I);
4418 
4419   // Expand the worklist by looking through any bitcasts and getelementptr
4420   // instructions we've already identified as scalar. This is similar to the
4421   // expansion step in collectLoopUniforms(); however, here we're only
4422   // expanding to include additional bitcasts and getelementptr instructions.
4423   unsigned Idx = 0;
4424   while (Idx != Worklist.size()) {
4425     Instruction *Dst = Worklist[Idx++];
4426     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4427       continue;
4428     auto *Src = cast<Instruction>(Dst->getOperand(0));
4429     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4430           auto *J = cast<Instruction>(U);
4431           return !TheLoop->contains(J) || Worklist.count(J) ||
4432                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4433                   isScalarUse(J, Src));
4434         })) {
4435       Worklist.insert(Src);
4436       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4437     }
4438   }
4439 
4440   // An induction variable will remain scalar if all users of the induction
4441   // variable and induction variable update remain scalar.
4442   for (auto &Induction : *Legal->getInductionVars()) {
4443     auto *Ind = Induction.first;
4444     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4445 
4446     // We already considered pointer induction variables, so there's no reason
4447     // to look at their users again.
4448     //
4449     // TODO: Once we are able to vectorize pointer induction variables we
4450     //       should no longer skip over them here.
4451     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4452       continue;
4453 
4454     // Determine if all users of the induction variable are scalar after
4455     // vectorization.
4456     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4457       auto *I = cast<Instruction>(U);
4458       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4459     });
4460     if (!ScalarInd)
4461       continue;
4462 
4463     // Determine if all users of the induction variable update instruction are
4464     // scalar after vectorization.
4465     auto ScalarIndUpdate =
4466         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4467           auto *I = cast<Instruction>(U);
4468           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4469         });
4470     if (!ScalarIndUpdate)
4471       continue;
4472 
4473     // The induction variable and its update instruction will remain scalar.
4474     Worklist.insert(Ind);
4475     Worklist.insert(IndUpdate);
4476     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4477     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4478                       << "\n");
4479   }
4480 
4481   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4482 }
4483 
4484 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4485   if (!blockNeedsPredication(I->getParent()))
4486     return false;
4487   switch(I->getOpcode()) {
4488   default:
4489     break;
4490   case Instruction::Load:
4491   case Instruction::Store: {
4492     if (!Legal->isMaskRequired(I))
4493       return false;
4494     auto *Ptr = getLoadStorePointerOperand(I);
4495     auto *Ty = getMemInstValueType(I);
4496     // We have already decided how to vectorize this instruction, get that
4497     // result.
4498     if (VF > 1) {
4499       InstWidening WideningDecision = getWideningDecision(I, VF);
4500       assert(WideningDecision != CM_Unknown &&
4501              "Widening decision should be ready at this moment");
4502       return WideningDecision == CM_Scalarize;
4503     }
4504     return isa<LoadInst>(I) ?
4505         !(isLegalMaskedLoad(Ty, Ptr)  || isLegalMaskedGather(Ty))
4506       : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4507   }
4508   case Instruction::UDiv:
4509   case Instruction::SDiv:
4510   case Instruction::SRem:
4511   case Instruction::URem:
4512     return mayDivideByZero(*I);
4513   }
4514   return false;
4515 }
4516 
4517 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4518                                                                unsigned VF) {
4519   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4520   assert(getWideningDecision(I, VF) == CM_Unknown &&
4521          "Decision should not be set yet.");
4522   auto *Group = getInterleavedAccessGroup(I);
4523   assert(Group && "Must have a group.");
4524 
4525   // If the instruction's allocated size doesn't equal its type size, it
4526   // requires padding and will be scalarized.
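  // For example (illustrative), an i24 value is typically allocated four
  // bytes but stores only three, so consecutive i24 elements in memory do not
  // have the same layout as a <VF x i24> vector.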
4527   auto &DL = I->getModule()->getDataLayout();
4528   auto *ScalarTy = getMemInstValueType(I);
4529   if (hasIrregularType(ScalarTy, DL, VF))
4530     return false;
4531 
4532   // Check if masking is required.
4533   // A Group may need masking for one of two reasons: it resides in a block that
4534   // needs predication, or it was decided to use masking to deal with gaps.
4535   bool PredicatedAccessRequiresMasking =
4536       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4537   bool AccessWithGapsRequiresMasking =
4538       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4539   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4540     return true;
4541 
4542   // If masked interleaving is required, we expect that the user/target had
4543   // enabled it, because otherwise it either wouldn't have been created or
4544   // it should have been invalidated by the CostModel.
4545   assert(useMaskedInterleavedAccesses(TTI) &&
4546          "Masked interleave-groups for predicated accesses are not enabled.");
4547 
4548   auto *Ty = getMemInstValueType(I);
4549   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4550                           : TTI.isLegalMaskedStore(Ty);
4551 }
4552 
4553 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4554                                                                unsigned VF) {
4555   // Get and ensure we have a valid memory instruction.
4556   LoadInst *LI = dyn_cast<LoadInst>(I);
4557   StoreInst *SI = dyn_cast<StoreInst>(I);
4558   assert((LI || SI) && "Invalid memory instruction");
4559 
4560   auto *Ptr = getLoadStorePointerOperand(I);
4561 
4562   // In order to be widened, the pointer should be consecutive, first of all.
4563   if (!Legal->isConsecutivePtr(Ptr))
4564     return false;
4565 
4566   // If the instruction is a store located in a predicated block, it will be
4567   // scalarized.
4568   if (isScalarWithPredication(I))
4569     return false;
4570 
4571   // If the instruction's allocated size doesn't equal its type size, it
4572   // requires padding and will be scalarized.
4573   auto &DL = I->getModule()->getDataLayout();
4574   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4575   if (hasIrregularType(ScalarTy, DL, VF))
4576     return false;
4577 
4578   return true;
4579 }
4580 
4581 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4582   // We should not collect Uniforms more than once per VF. Right now,
4583   // this function is called from collectUniformsAndScalars(), which
4584   // already does this check. Collecting Uniforms for VF=1 does not make any
4585   // sense.
4586 
4587   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4588          "This function should not be visited twice for the same VF");
4589 
4590   // Create the entry for this VF up front. Even if we find no uniform values,
4591   // Uniforms.count(VF) will return 1, so we will not analyze this VF again.
4592   Uniforms[VF].clear();
4593 
4594   // We now know that the loop is vectorizable!
4595   // Collect instructions inside the loop that will remain uniform after
4596   // vectorization.
4597 
4598   // Global values, parameters, and instructions outside of the current loop
4599   // are out of scope.
4600   auto isOutOfScope = [&](Value *V) -> bool {
4601     Instruction *I = dyn_cast<Instruction>(V);
4602     return (!I || !TheLoop->contains(I));
4603   };
4604 
4605   SetVector<Instruction *> Worklist;
4606   BasicBlock *Latch = TheLoop->getLoopLatch();
4607 
4608   // Start with the conditional branch. If the branch condition is an
4609   // instruction contained in the loop that is only used by the branch, it is
4610   // uniform.
4611   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4612   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4613     Worklist.insert(Cmp);
4614     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4615   }
4616 
4617   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4618   // are pointers that are treated like consecutive pointers during
4619   // vectorization. The pointer operands of interleaved accesses are an
4620   // example.
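  // For example (illustrative), the address feeding a wide consecutive load
  // is only needed for lane zero of each unroll part, so the GEP computing it
  // can remain uniform after vectorization.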
4621   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4622 
4623   // Holds pointer operands of instructions that are possibly non-uniform.
4624   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4625 
4626   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4627     InstWidening WideningDecision = getWideningDecision(I, VF);
4628     assert(WideningDecision != CM_Unknown &&
4629            "Widening decision should be ready at this moment");
4630 
4631     return (WideningDecision == CM_Widen ||
4632             WideningDecision == CM_Widen_Reverse ||
4633             WideningDecision == CM_Interleave);
4634   };
4635   // Iterate over the instructions in the loop, and collect all
4636   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4637   // that a consecutive-like pointer operand will be scalarized, we collect it
4638   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4639   // getelementptr instruction can be used by both vectorized and scalarized
4640   // memory instructions. For example, if a loop loads and stores from the same
4641   // location, but the store is conditional, the store will be scalarized, and
4642   // the getelementptr won't remain uniform.
4643   for (auto *BB : TheLoop->blocks())
4644     for (auto &I : *BB) {
4645       // If there's no pointer operand, there's nothing to do.
4646       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4647       if (!Ptr)
4648         continue;
4649 
4650       // True if all users of Ptr are memory accesses that have Ptr as their
4651       // pointer operand.
4652       auto UsersAreMemAccesses =
4653           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4654             return getLoadStorePointerOperand(U) == Ptr;
4655           });
4656 
4657       // Ensure the memory instruction will not be scalarized or used by
4658       // gather/scatter, making its pointer operand non-uniform. If the pointer
4659       // operand is used by any instruction other than a memory access, we
4660       // conservatively assume the pointer operand may be non-uniform.
4661       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4662         PossibleNonUniformPtrs.insert(Ptr);
4663 
4664       // If the memory instruction will be vectorized and its pointer operand
4665       // is consecutive-like, or interleaving - the pointer operand should
4666       // remain uniform.
4667       else
4668         ConsecutiveLikePtrs.insert(Ptr);
4669     }
4670 
4671   // Add to the Worklist all consecutive and consecutive-like pointers that
4672   // aren't also identified as possibly non-uniform.
4673   for (auto *V : ConsecutiveLikePtrs)
4674     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4675       LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4676       Worklist.insert(V);
4677     }
4678 
4679   // Expand Worklist in topological order: whenever a new instruction
4680   // is added, its users should already be inside Worklist. This ensures
4681   // a uniform instruction will only be used by uniform instructions.
4682   unsigned idx = 0;
4683   while (idx != Worklist.size()) {
4684     Instruction *I = Worklist[idx++];
4685 
4686     for (auto OV : I->operand_values()) {
4687       // isOutOfScope operands cannot be uniform instructions.
4688       if (isOutOfScope(OV))
4689         continue;
4690       // First order recurrence Phi's should typically be considered
4691       // non-uniform.
4692       auto *OP = dyn_cast<PHINode>(OV);
4693       if (OP && Legal->isFirstOrderRecurrence(OP))
4694         continue;
4695       // If all the users of the operand are uniform, then add the
4696       // operand into the uniform worklist.
4697       auto *OI = cast<Instruction>(OV);
4698       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4699             auto *J = cast<Instruction>(U);
4700             return Worklist.count(J) ||
4701                    (OI == getLoadStorePointerOperand(J) &&
4702                     isUniformDecision(J, VF));
4703           })) {
4704         Worklist.insert(OI);
4705         LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4706       }
4707     }
4708   }
4709 
4710   // Returns true if Ptr is the pointer operand of a memory access instruction
4711   // I, and I is known to not require scalarization.
4712   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4713     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4714   };
4715 
4716   // For an instruction to be added into Worklist above, all its users inside
4717   // the loop should also be in Worklist. However, this condition cannot be
4718   // true for phi nodes that form a cyclic dependence. We must process phi
4719   // nodes separately. An induction variable will remain uniform if all users
4720   // of the induction variable and induction variable update remain uniform.
4721   // The code below handles both pointer and non-pointer induction variables.
4722   for (auto &Induction : *Legal->getInductionVars()) {
4723     auto *Ind = Induction.first;
4724     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4725 
4726     // Determine if all users of the induction variable are uniform after
4727     // vectorization.
4728     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4729       auto *I = cast<Instruction>(U);
4730       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4731              isVectorizedMemAccessUse(I, Ind);
4732     });
4733     if (!UniformInd)
4734       continue;
4735 
4736     // Determine if all users of the induction variable update instruction are
4737     // uniform after vectorization.
4738     auto UniformIndUpdate =
4739         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4740           auto *I = cast<Instruction>(U);
4741           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4742                  isVectorizedMemAccessUse(I, IndUpdate);
4743         });
4744     if (!UniformIndUpdate)
4745       continue;
4746 
4747     // The induction variable and its update instruction will remain uniform.
4748     Worklist.insert(Ind);
4749     Worklist.insert(IndUpdate);
4750     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4751     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4752                       << "\n");
4753   }
4754 
4755   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4756 }
4757 
4758 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4759   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4760 
4761   if (Legal->getRuntimePointerChecking()->Need) {
4762     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4763         "runtime pointer checks needed. Enable vectorization of this "
4764         "loop with '#pragma clang loop vectorize(enable)' when "
4765         "compiling with -Os/-Oz",
4766         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4767     return true;
4768   }
4769 
4770   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4771     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4772         "runtime SCEV checks needed. Enable vectorization of this "
4773         "loop with '#pragma clang loop vectorize(enable)' when "
4774         "compiling with -Os/-Oz",
4775         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4776     return true;
4777   }
4778 
4779   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4780   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4781     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4782         "runtime stride == 1 checks needed. Enable vectorization of "
4783         "this loop with '#pragma clang loop vectorize(enable)' when "
4784         "compiling with -Os/-Oz",
4785         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4786     return true;
4787   }
4788 
4789   return false;
4790 }
4791 
4792 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4793   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4794     // TODO: It may be useful to do this anyway, since the check is still
4795     // likely to be dynamically uniform if the target can skip it.
4796     reportVectorizationFailure(
4797         "Not inserting runtime ptr check for divergent target",
4798         "runtime pointer checks needed. Not enabled for divergent target",
4799         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4800     return None;
4801   }
4802 
4803   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4804   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4805   if (TC == 1) {
4806     reportVectorizationFailure("Single iteration (non) loop",
4807         "loop trip count is one, irrelevant for vectorization",
4808         "SingleIterationLoop", ORE, TheLoop);
4809     return None;
4810   }
4811 
4812   switch (ScalarEpilogueStatus) {
4813   case CM_ScalarEpilogueAllowed:
4814     return computeFeasibleMaxVF(TC);
4815   case CM_ScalarEpilogueNotNeededUsePredicate:
4816     LLVM_DEBUG(
4817         dbgs() << "LV: vector predicate hint/switch found.\n"
4818                << "LV: Not allowing scalar epilogue, creating predicated "
4819                << "vector loop.\n");
4820     break;
4821   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4822     // fallthrough as a special case of OptForSize
4823   case CM_ScalarEpilogueNotAllowedOptSize:
4824     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4825       LLVM_DEBUG(
4826           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4827     else
4828       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4829                         << "count.\n");
4830 
4831     // Bail if runtime checks are required, which are not good when optimizing
4832     // for size.
4833     if (runtimeChecksRequired())
4834       return None;
4835     break;
4836   }
4837 
4838   // Now try folding the tail by masking.
4839 
4840   // Invalidate interleave groups that require an epilogue if we can't mask
4841   // the interleave-group.
4842   if (!useMaskedInterleavedAccesses(TTI))
4843     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4844 
4845   unsigned MaxVF = computeFeasibleMaxVF(TC);
4846   if (TC > 0 && TC % MaxVF == 0) {
4847     // Accept MaxVF if we do not have a tail.
4848     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4849     return MaxVF;
4850   }
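       // For example (illustrative numbers only): a constant trip count of 16
       // with MaxVF = 8 leaves no tail (16 % 8 == 0), whereas a trip count of 17
       // would leave one scalar iteration and require one of the options below.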
4851 
4852   // If we don't know the precise trip count, or if the trip count that we
4853   // found modulo the vectorization factor is not zero, try to fold the tail
4854   // by masking.
4855   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4856   if (Legal->prepareToFoldTailByMasking()) {
4857     FoldTailByMasking = true;
4858     return MaxVF;
4859   }
4860 
4861   if (TC == 0) {
4862     reportVectorizationFailure(
4863         "Unable to calculate the loop count due to complex control flow",
4864         "unable to calculate the loop count due to complex control flow",
4865         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4866     return None;
4867   }
4868 
4869   reportVectorizationFailure(
4870       "Cannot optimize for size and vectorize at the same time.",
4871       "cannot optimize for size and vectorize at the same time. "
4872       "Enable vectorization of this loop with '#pragma clang loop "
4873       "vectorize(enable)' when compiling with -Os/-Oz",
4874       "NoTailLoopWithOptForSize", ORE, TheLoop);
4875   return None;
4876 }
4877 
4878 unsigned
4879 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4880   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4881   unsigned SmallestType, WidestType;
4882   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4883   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4884 
4885   // Get the maximum safe dependence distance in bits computed by LAA.
4886   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4887   // the memory access that is most restrictive (involved in the smallest
4888   // dependence distance).
4889   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4890 
4891   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4892 
4893   unsigned MaxVectorSize = WidestRegister / WidestType;
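       // For example (illustrative numbers only): with a 256-bit widest safe
       // register and a widest element type of 32 bits, MaxVectorSize is
       // 256 / 32 = 8 lanes.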
4894 
4895   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4896                     << " / " << WidestType << " bits.\n");
4897   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4898                     << WidestRegister << " bits.\n");
4899 
4900   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4901                                  " into one vector!");
4902   if (MaxVectorSize == 0) {
4903     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4904     MaxVectorSize = 1;
4905     return MaxVectorSize;
4906   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4907              isPowerOf2_32(ConstTripCount)) {
4908     // We need to clamp the VF to be the ConstTripCount. There is no point in
4909     // choosing a higher viable VF as done in the loop below.
4910     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4911                       << ConstTripCount << "\n");
4912     MaxVectorSize = ConstTripCount;
4913     return MaxVectorSize;
4914   }
4915 
4916   unsigned MaxVF = MaxVectorSize;
4917   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4918       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4919     // Collect all viable vectorization factors larger than the default MaxVF
4920     // (i.e. MaxVectorSize).
4921     SmallVector<unsigned, 8> VFs;
4922     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4923     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4924       VFs.push_back(VS);
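         // For example (illustrative numbers only): with MaxVectorSize = 4 and
         // NewMaxVectorSize = 16, the candidate VFs collected here are {8, 16}.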
4925 
4926     // For each VF calculate its register usage.
4927     auto RUs = calculateRegisterUsage(VFs);
4928 
4929     // Select the largest VF which doesn't require more registers than existing
4930     // ones.
4931     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4932     for (int i = RUs.size() - 1; i >= 0; --i) {
4933       if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4934         MaxVF = VFs[i];
4935         break;
4936       }
4937     }
4938     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4939       if (MaxVF < MinVF) {
4940         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4941                           << ") with target's minimum: " << MinVF << '\n');
4942         MaxVF = MinVF;
4943       }
4944     }
4945   }
4946   return MaxVF;
4947 }
4948 
4949 VectorizationFactor
4950 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4951   float Cost = expectedCost(1).first;
4952   const float ScalarCost = Cost;
4953   unsigned Width = 1;
4954   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4955 
4956   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4957   if (ForceVectorization && MaxVF > 1) {
4958     // Ignore scalar width, because the user explicitly wants vectorization.
4959     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4960     // evaluation.
4961     Cost = std::numeric_limits<float>::max();
4962   }
4963 
4964   for (unsigned i = 2; i <= MaxVF; i *= 2) {
4965     // Notice that the vector loop needs to be executed fewer times, so
4966     // we need to divide the cost of the vector loop by the width of
4967     // the vector elements.
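         // For example (illustrative numbers only): a vector loop of width 4
         // with an expected cost of 8 has a normalized cost of 8 / 4 = 2, which
         // is what gets compared against the scalar loop cost.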
4968     VectorizationCostTy C = expectedCost(i);
4969     float VectorCost = C.first / (float)i;
4970     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4971                       << " costs: " << (int)VectorCost << ".\n");
4972     if (!C.second && !ForceVectorization) {
4973       LLVM_DEBUG(
4974           dbgs() << "LV: Not considering vector loop of width " << i
4975                  << " because it will not generate any vector instructions.\n");
4976       continue;
4977     }
4978     if (VectorCost < Cost) {
4979       Cost = VectorCost;
4980       Width = i;
4981     }
4982   }
4983 
4984   if (!EnableCondStoresVectorization && NumPredStores) {
4985     reportVectorizationFailure("There are conditional stores.",
4986         "store that is conditionally executed prevents vectorization",
4987         "ConditionalStore", ORE, TheLoop);
4988     Width = 1;
4989     Cost = ScalarCost;
4990   }
4991 
4992   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4993              << "LV: Vectorization does not seem to be beneficial, "
4994              << "but was forced by a user.\n");
4995   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4996   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4997   return Factor;
4998 }
4999 
5000 std::pair<unsigned, unsigned>
5001 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5002   unsigned MinWidth = -1U;
5003   unsigned MaxWidth = 8;
5004   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5005 
5006   // For each block.
5007   for (BasicBlock *BB : TheLoop->blocks()) {
5008     // For each instruction in the loop.
5009     for (Instruction &I : BB->instructionsWithoutDebug()) {
5010       Type *T = I.getType();
5011 
5012       // Skip ignored values.
5013       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5014         continue;
5015 
5016       // Only examine Loads, Stores and PHINodes.
5017       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5018         continue;
5019 
5020       // Examine PHI nodes that are reduction variables. Update the type to
5021       // account for the recurrence type.
5022       if (auto *PN = dyn_cast<PHINode>(&I)) {
5023         if (!Legal->isReductionVariable(PN))
5024           continue;
5025         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5026         T = RdxDesc.getRecurrenceType();
5027       }
5028 
5029       // Examine the stored values.
5030       if (auto *ST = dyn_cast<StoreInst>(&I))
5031         T = ST->getValueOperand()->getType();
5032 
5033       // Ignore loaded pointer types and stored pointer types that are not
5034       // vectorizable.
5035       //
5036       // FIXME: The check here attempts to predict whether a load or store will
5037       //        be vectorized. We only know this for certain after a VF has
5038       //        been selected. Here, we assume that if an access can be
5039       //        vectorized, it will be. We should also look at extending this
5040       //        optimization to non-pointer types.
5041       //
5042       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5043           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5044         continue;
5045 
5046       MinWidth = std::min(MinWidth,
5047                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5048       MaxWidth = std::max(MaxWidth,
5049                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5050     }
5051   }
5052 
5053   return {MinWidth, MaxWidth};
5054 }
5055 
5056 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5057                                                            unsigned LoopCost) {
5058   // -- The interleave heuristics --
5059   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5060   // There are many micro-architectural considerations that we can't predict
5061   // at this level. For example, frontend pressure (on decode or fetch) due to
5062   // code size, or the number and capabilities of the execution ports.
5063   //
5064   // We use the following heuristics to select the interleave count:
5065   // 1. If the code has reductions, then we interleave to break the cross
5066   // iteration dependency.
5067   // 2. If the loop is really small, then we interleave to reduce the loop
5068   // overhead.
5069   // 3. We don't interleave if we think that we will spill registers to memory
5070   // due to the increased register pressure.
5071 
5072   if (!isScalarEpilogueAllowed())
5073     return 1;
5074 
5075   // The max safe dependence distance already limits the VF; don't interleave.
5076   if (Legal->getMaxSafeDepDistBytes() != -1U)
5077     return 1;
5078 
5079   // Do not interleave loops with a relatively small trip count.
5080   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5081   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
5082     return 1;
5083 
5084   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
5085   LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5086                     << " registers\n");
5087 
5088   if (VF == 1) {
5089     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5090       TargetNumRegisters = ForceTargetNumScalarRegs;
5091   } else {
5092     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5093       TargetNumRegisters = ForceTargetNumVectorRegs;
5094   }
5095 
5096   RegisterUsage R = calculateRegisterUsage({VF})[0];
5097   // We divide by this value below, so assume that at least one instruction
5098   // uses at least one register.
5099   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5100 
5101   // We calculate the interleave count using the following formula.
5102   // Subtract the number of loop invariants from the number of available
5103   // registers. These registers are used by all of the interleaved instances.
5104   // Next, divide the remaining registers by the number of registers that is
5105   // required by the loop, in order to estimate how many parallel instances
5106   // fit without causing spills. The result is rounded down, if necessary, to a
5107   // power of two. We want a power-of-two interleave count to simplify any
5108   // addressing operations or alignment considerations.
5109   // We also want a power-of-two interleave count to ensure that the induction
5110   // variable of the vector loop wraps to zero when the tail is folded by
5111   // masking; currently this happens under OptForSize, where IC is set to 1 above.
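       // For example (illustrative numbers only): with 16 target registers,
       // 2 loop-invariant registers and a maximum local usage of 3 registers,
       // IC = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.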
5112   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5113                               R.MaxLocalUsers);
5114 
5115   // Don't count the induction variable as interleaved.
5116   if (EnableIndVarRegisterHeur)
5117     IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5118                        std::max(1U, (R.MaxLocalUsers - 1)));
5119 
5120   // Clamp the interleave ranges to reasonable counts.
5121   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5122 
5123   // Check if the user has overridden the max.
5124   if (VF == 1) {
5125     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5126       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5127   } else {
5128     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5129       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5130   }
5131 
5132   // If the trip count is constant, limit the interleave count to be less than
5133   // the trip count divided by VF.
5134   if (TC > 0) {
5135     assert(TC >= VF && "VF exceeds trip count?");
5136     if ((TC / VF) < MaxInterleaveCount)
5137       MaxInterleaveCount = (TC / VF);
5138   }
5139 
5140   // If we did not calculate the cost for VF (because the user selected the VF)
5141   // then we calculate the cost of VF here.
5142   if (LoopCost == 0)
5143     LoopCost = expectedCost(VF).first;
5144 
5145   assert(LoopCost && "Non-zero loop cost expected");
5146 
5147   // Clamp the calculated IC to be between 1 and the maximum interleave count
5148   // that the target and the trip count allow.
5149   if (IC > MaxInterleaveCount)
5150     IC = MaxInterleaveCount;
5151   else if (IC < 1)
5152     IC = 1;
5153 
5154   // Interleave if we vectorized this loop and there is a reduction that could
5155   // benefit from interleaving.
5156   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5157     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5158     return IC;
5159   }
5160 
5161   // Note that if we've already vectorized the loop we will have done the
5162   // runtime check and so interleaving won't require further checks.
5163   bool InterleavingRequiresRuntimePointerCheck =
5164       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5165 
5166   // We want to interleave small loops in order to reduce the loop overhead and
5167   // potentially expose ILP opportunities.
5168   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5169   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5170     // We assume that the loop overhead cost is 1. Using the cost model's
5171     // estimate of the loop cost, we interleave until the loop overhead is
5172     // about 5% of the total cost of the loop.
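         // For example (illustrative numbers only): with SmallLoopCost = 20 and
         // an estimated LoopCost of 6, SmallIC is capped at
         // PowerOf2Floor(20 / 6) = PowerOf2Floor(3) = 2.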
5173     unsigned SmallIC =
5174         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5175 
5176     // Interleave until store/load ports (estimated by max interleave count) are
5177     // saturated.
5178     unsigned NumStores = Legal->getNumStores();
5179     unsigned NumLoads = Legal->getNumLoads();
5180     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5181     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5182 
5183     // If we have a scalar reduction (vector reductions are already dealt with
5184     // by this point), we can increase the critical path length if the loop
5185     // we're interleaving is inside another loop. Limit, by default, to 2 so
5186     // that the critical path is only lengthened by one reduction operation.
5187     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5188       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5189       SmallIC = std::min(SmallIC, F);
5190       StoresIC = std::min(StoresIC, F);
5191       LoadsIC = std::min(LoadsIC, F);
5192     }
5193 
5194     if (EnableLoadStoreRuntimeInterleave &&
5195         std::max(StoresIC, LoadsIC) > SmallIC) {
5196       LLVM_DEBUG(
5197           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5198       return std::max(StoresIC, LoadsIC);
5199     }
5200 
5201     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5202     return SmallIC;
5203   }
5204 
5205   // Interleave if this is a large loop (small loops are already dealt with by
5206   // this point) that could benefit from interleaving.
5207   bool HasReductions = !Legal->getReductionVars()->empty();
5208   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5209     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5210     return IC;
5211   }
5212 
5213   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5214   return 1;
5215 }
5216 
5217 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5218 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5219   // This function calculates the register usage by measuring the highest number
5220   // of values that are alive at a single location. Obviously, this is a very
5221   // rough estimate. We scan the loop in topological order and assign a number
5222   // to each instruction. We use RPO to ensure that defs are visited before
5223   // their users. We assume that each instruction that has in-loop
5224   // users starts an interval. We record every time that an in-loop value is
5225   // used, so we have a list of the first and last occurrences of each
5226   // instruction. Next, we transpose this data structure into a multi map that
5227   // holds the list of intervals that *end* at a specific location. This multi
5228   // map allows us to perform a linear search. We scan the instructions linearly
5229   // and record each time that a new interval starts, by placing it in a set.
5230   // If we find this value in the multi-map then we remove it from the set.
5231   // The max register usage is the maximum size of the set.
5232   // We also search for instructions that are defined outside the loop, but are
5233   // used inside the loop. We need this number separately from the max-interval
5234   // usage number because when we unroll, loop-invariant values do not take
5235   // more registers.
5236   LoopBlocksDFS DFS(TheLoop);
5237   DFS.perform(LI);
5238 
5239   RegisterUsage RU;
5240 
5241   // Each 'key' in the map opens a new interval. The values
5242   // of the map are the index of the 'last seen' usage of the
5243   // instruction that is the key.
5244   using IntervalMap = DenseMap<Instruction *, unsigned>;
5245 
5246   // Maps instruction to its index.
5247   SmallVector<Instruction *, 64> IdxToInstr;
5248   // Marks the end of each interval.
5249   IntervalMap EndPoint;
5250   // Saves the list of instruction indices that are used in the loop.
5251   SmallPtrSet<Instruction *, 8> Ends;
5252   // Saves the list of values that are used in the loop but are
5253   // defined outside the loop, such as arguments and constants.
5254   SmallPtrSet<Value *, 8> LoopInvariants;
5255 
5256   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5257     for (Instruction &I : BB->instructionsWithoutDebug()) {
5258       IdxToInstr.push_back(&I);
5259 
5260       // Save the end location of each USE.
5261       for (Value *U : I.operands()) {
5262         auto *Instr = dyn_cast<Instruction>(U);
5263 
5264         // Ignore non-instruction values such as arguments, constants, etc.
5265         if (!Instr)
5266           continue;
5267 
5268         // If this instruction is outside the loop then record it and continue.
5269         if (!TheLoop->contains(Instr)) {
5270           LoopInvariants.insert(Instr);
5271           continue;
5272         }
5273 
5274         // Overwrite previous end points.
5275         EndPoint[Instr] = IdxToInstr.size();
5276         Ends.insert(Instr);
5277       }
5278     }
5279   }
5280 
5281   // Saves the list of intervals that end with the index in 'key'.
5282   using InstrList = SmallVector<Instruction *, 2>;
5283   DenseMap<unsigned, InstrList> TransposeEnds;
5284 
5285   // Transpose the EndPoints to a list of values that end at each index.
5286   for (auto &Interval : EndPoint)
5287     TransposeEnds[Interval.second].push_back(Interval.first);
5288 
5289   SmallPtrSet<Instruction *, 8> OpenIntervals;
5290 
5291   // Get the size of the widest register.
5292   unsigned MaxSafeDepDist = -1U;
5293   if (Legal->getMaxSafeDepDistBytes() != -1U)
5294     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5295   unsigned WidestRegister =
5296       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5297   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5298 
5299   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5300   SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5301 
5302   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5303 
5304   // A lambda that gets the register usage for the given type and VF.
5305   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5306     if (Ty->isTokenTy())
5307       return 0U;
5308     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5309     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5310   };
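       // For example (illustrative numbers only): an i32 value at VF = 8 on a
       // target whose widest register is 128 bits needs 8 * 32 / 128 = 2 vector
       // registers.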
5311 
5312   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5313     Instruction *I = IdxToInstr[i];
5314 
5315     // Remove all of the instructions that end at this location.
5316     InstrList &List = TransposeEnds[i];
5317     for (Instruction *ToRemove : List)
5318       OpenIntervals.erase(ToRemove);
5319 
5320     // Ignore instructions that are never used within the loop.
5321     if (Ends.find(I) == Ends.end())
5322       continue;
5323 
5324     // Skip ignored values.
5325     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5326       continue;
5327 
5328     // For each VF find the maximum usage of registers.
5329     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5330       if (VFs[j] == 1) {
5331         MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5332         continue;
5333       }
5334       collectUniformsAndScalars(VFs[j]);
5335       // Count the number of live intervals.
5336       unsigned RegUsage = 0;
5337       for (auto Inst : OpenIntervals) {
5338         // Skip ignored values for VF > 1.
5339         if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5340             isScalarAfterVectorization(Inst, VFs[j]))
5341           continue;
5342         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5343       }
5344       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5345     }
5346 
5347     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5348                       << OpenIntervals.size() << '\n');
5349 
5350     // Add the current instruction to the list of open intervals.
5351     OpenIntervals.insert(I);
5352   }
5353 
5354   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5355     unsigned Invariant = 0;
5356     if (VFs[i] == 1)
5357       Invariant = LoopInvariants.size();
5358     else {
5359       for (auto Inst : LoopInvariants)
5360         Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5361     }
5362 
5363     LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5364     LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5365     LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5366                       << '\n');
5367 
5368     RU.LoopInvariantRegs = Invariant;
5369     RU.MaxLocalUsers = MaxUsages[i];
5370     RUs[i] = RU;
5371   }
5372 
5373   return RUs;
5374 }
5375 
5376 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5377   // TODO: Cost model for emulated masked load/store is completely
5378   // broken. This hack guides the cost model to use an artificially
5379   // high enough value to practically disable vectorization with such
5380   // operations, except where previously deployed legality hack allowed
5381   // operations, except where the previously deployed legality hack allowed
5382   // from moving "masked load/store" check from legality to cost model.
5383   // Masked Load/Gather emulation was previously never allowed.
5384   // Only a limited amount of Masked Store/Scatter emulation was allowed.
5385   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5386   return isa<LoadInst>(I) ||
5387          (isa<StoreInst>(I) &&
5388           NumPredStores > NumberOfStoresToPredicate);
5389 }
5390 
5391 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5392   // If we aren't vectorizing the loop, or if we've already collected the
5393   // instructions to scalarize, there's nothing to do. Collection may already
5394   // have occurred if we have a user-selected VF and are now computing the
5395   // expected cost for interleaving.
5396   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5397     return;
5398 
5399   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5400   // not profitable to scalarize any instructions, the presence of VF in the
5401   // map will indicate that we've analyzed it already.
5402   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5403 
5404   // Find all the instructions that are scalar with predication in the loop and
5405   // determine if it would be better to not if-convert the blocks they are in.
5406   // If so, we also record the instructions to scalarize.
5407   for (BasicBlock *BB : TheLoop->blocks()) {
5408     if (!blockNeedsPredication(BB))
5409       continue;
5410     for (Instruction &I : *BB)
5411       if (isScalarWithPredication(&I)) {
5412         ScalarCostsTy ScalarCosts;
5413         // Do not apply discount logic if hacked cost is needed
5414         // for emulated masked memrefs.
5415         if (!useEmulatedMaskMemRefHack(&I) &&
5416             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5417           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5418         // Remember that BB will remain after vectorization.
5419         PredicatedBBsAfterVectorization.insert(BB);
5420       }
5421   }
5422 }
5423 
5424 int LoopVectorizationCostModel::computePredInstDiscount(
5425     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5426     unsigned VF) {
5427   assert(!isUniformAfterVectorization(PredInst, VF) &&
5428          "Instruction marked uniform-after-vectorization will be predicated");
5429 
5430   // Initialize the discount to zero, meaning that the scalar version and the
5431   // vector version cost the same.
5432   int Discount = 0;
5433 
5434   // Holds instructions to analyze. The instructions we visit are mapped in
5435   // ScalarCosts. Those instructions are the ones that would be scalarized if
5436   // we find that the scalar version costs less.
5437   SmallVector<Instruction *, 8> Worklist;
5438 
5439   // Returns true if the given instruction can be scalarized.
5440   auto canBeScalarized = [&](Instruction *I) -> bool {
5441     // We only attempt to scalarize instructions forming a single-use chain
5442     // from the original predicated block that would otherwise be vectorized.
5443     // Although not strictly necessary, we give up on instructions we know will
5444     // already be scalar to avoid traversing chains that are unlikely to be
5445     // beneficial.
5446     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5447         isScalarAfterVectorization(I, VF))
5448       return false;
5449 
5450     // If the instruction is scalar with predication, it will be analyzed
5451     // separately. We ignore it within the context of PredInst.
5452     if (isScalarWithPredication(I))
5453       return false;
5454 
5455     // If any of the instruction's operands are uniform after vectorization,
5456     // the instruction cannot be scalarized. This prevents, for example, a
5457     // masked load from being scalarized.
5458     //
5459     // We assume we will only emit a value for lane zero of an instruction
5460     // marked uniform after vectorization, rather than VF identical values.
5461     // Thus, if we scalarize an instruction that uses a uniform, we would
5462     // create uses of values corresponding to the lanes we aren't emitting code
5463     // for. This behavior can be changed by allowing getScalarValue to clone
5464     // the lane zero values for uniforms rather than asserting.
5465     for (Use &U : I->operands())
5466       if (auto *J = dyn_cast<Instruction>(U.get()))
5467         if (isUniformAfterVectorization(J, VF))
5468           return false;
5469 
5470     // Otherwise, we can scalarize the instruction.
5471     return true;
5472   };
5473 
5474   // Compute the expected cost discount from scalarizing the entire expression
5475   // feeding the predicated instruction. We currently only consider expressions
5476   // that are single-use instruction chains.
5477   Worklist.push_back(PredInst);
5478   while (!Worklist.empty()) {
5479     Instruction *I = Worklist.pop_back_val();
5480 
5481     // If we've already analyzed the instruction, there's nothing to do.
5482     if (ScalarCosts.find(I) != ScalarCosts.end())
5483       continue;
5484 
5485     // Compute the cost of the vector instruction. Note that this cost already
5486     // includes the scalarization overhead of the predicated instruction.
5487     unsigned VectorCost = getInstructionCost(I, VF).first;
5488 
5489     // Compute the cost of the scalarized instruction. This cost is the cost of
5490     // the instruction as if it wasn't if-converted and instead remained in the
5491     // predicated block. We will scale this cost by block probability after
5492     // computing the scalarization overhead.
5493     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5494 
5495     // Compute the scalarization overhead of needed insertelement instructions
5496     // and phi nodes.
5497     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5498       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5499                                                  true, false);
5500       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5501     }
5502 
5503     // Compute the scalarization overhead of needed extractelement
5504     // instructions. For each of the instruction's operands, if the operand can
5505     // be scalarized, add it to the worklist; otherwise, account for the
5506     // overhead.
5507     for (Use &U : I->operands())
5508       if (auto *J = dyn_cast<Instruction>(U.get())) {
5509         assert(VectorType::isValidElementType(J->getType()) &&
5510                "Instruction has non-scalar type");
5511         if (canBeScalarized(J))
5512           Worklist.push_back(J);
5513         else if (needsExtract(J, VF))
5514           ScalarCost += TTI.getScalarizationOverhead(
5515                               ToVectorTy(J->getType(),VF), false, true);
5516       }
5517 
5518     // Scale the total scalar cost by block probability.
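         // For example (illustrative, assuming a reciprocal block probability of
         // 2, i.e. the predicated block is expected to execute about half the
         // time): a raw scalar cost of 4 * C is scaled down to 2 * C.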
5519     ScalarCost /= getReciprocalPredBlockProb();
5520 
5521     // Compute the discount. A non-negative discount means the vector version
5522     // of the instruction costs more, and scalarizing would be beneficial.
5523     Discount += VectorCost - ScalarCost;
5524     ScalarCosts[I] = ScalarCost;
5525   }
5526 
5527   return Discount;
5528 }
5529 
5530 LoopVectorizationCostModel::VectorizationCostTy
5531 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5532   VectorizationCostTy Cost;
5533 
5534   // For each block.
5535   for (BasicBlock *BB : TheLoop->blocks()) {
5536     VectorizationCostTy BlockCost;
5537 
5538     // For each instruction in the old loop.
5539     for (Instruction &I : BB->instructionsWithoutDebug()) {
5540       // Skip ignored values.
5541       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5542           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5543         continue;
5544 
5545       VectorizationCostTy C = getInstructionCost(&I, VF);
5546 
5547       // Check if we should override the cost.
5548       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5549         C.first = ForceTargetInstructionCost;
5550 
5551       BlockCost.first += C.first;
5552       BlockCost.second |= C.second;
5553       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5554                         << " for VF " << VF << " For instruction: " << I
5555                         << '\n');
5556     }
5557 
5558     // If we are vectorizing a predicated block, it will have been
5559     // if-converted. This means that the block's instructions (aside from
5560     // stores and instructions that may divide by zero) will now be
5561     // unconditionally executed. For the scalar case, we may not always execute
5562     // the predicated block. Thus, scale the block's cost by the probability of
5563     // executing it.
5564     if (VF == 1 && blockNeedsPredication(BB))
5565       BlockCost.first /= getReciprocalPredBlockProb();
5566 
5567     Cost.first += BlockCost.first;
5568     Cost.second |= BlockCost.second;
5569   }
5570 
5571   return Cost;
5572 }
5573 
5574 /// Gets the address access SCEV after verifying that the access pattern is
5575 /// loop invariant except for the induction variable dependence.
5576 ///
5577 /// This SCEV can be sent to the Target in order to estimate the address
5578 /// calculation cost.
5579 static const SCEV *getAddressAccessSCEV(
5580               Value *Ptr,
5581               LoopVectorizationLegality *Legal,
5582               PredicatedScalarEvolution &PSE,
5583               const Loop *TheLoop) {
5584 
5585   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5586   if (!Gep)
5587     return nullptr;
5588 
5589   // We are looking for a gep with all loop invariant indices except for one
5590   // which should be an induction variable.
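       // For example (illustrative): a GEP of the form 'gep %A, %inv1, %iv,
       // %inv2', where %iv is an induction variable and %inv1/%inv2 are loop
       // invariant, qualifies.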
5591   auto SE = PSE.getSE();
5592   unsigned NumOperands = Gep->getNumOperands();
5593   for (unsigned i = 1; i < NumOperands; ++i) {
5594     Value *Opd = Gep->getOperand(i);
5595     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5596         !Legal->isInductionVariable(Opd))
5597       return nullptr;
5598   }
5599 
5600   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5601   return PSE.getSCEV(Ptr);
5602 }
5603 
5604 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5605   return Legal->hasStride(I->getOperand(0)) ||
5606          Legal->hasStride(I->getOperand(1));
5607 }
5608 
5609 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5610                                                                  unsigned VF) {
5611   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5612   Type *ValTy = getMemInstValueType(I);
5613   auto SE = PSE.getSE();
5614 
5615   unsigned Alignment = getLoadStoreAlignment(I);
5616   unsigned AS = getLoadStoreAddressSpace(I);
5617   Value *Ptr = getLoadStorePointerOperand(I);
5618   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5619 
5620   // Figure out whether the access is strided and get the stride value
5621   // if it's known at compile time.
5622   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5623 
5624   // Get the cost of the scalar memory instruction and address computation.
5625   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5626 
5627   // Don't pass *I here, since it is scalar but will actually be part of a
5628   // vectorized loop where its user is a vectorized instruction.
5629   Cost += VF *
5630           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5631                               AS);
5632 
5633   // Get the overhead of the extractelement and insertelement instructions
5634   // we might create due to scalarization.
5635   Cost += getScalarizationOverhead(I, VF);
5636 
5637   // If we have a predicated store, it may not be executed for each vector
5638   // lane. Scale the cost by the probability of executing the predicated
5639   // block.
5640   if (isPredicatedInst(I)) {
5641     Cost /= getReciprocalPredBlockProb();
5642 
5643     if (useEmulatedMaskMemRefHack(I))
5644       // Artificially setting to a high enough value to practically disable
5645       // vectorization with such operations.
5646       Cost = 3000000;
5647   }
5648 
5649   return Cost;
5650 }
5651 
5652 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5653                                                              unsigned VF) {
5654   Type *ValTy = getMemInstValueType(I);
5655   Type *VectorTy = ToVectorTy(ValTy, VF);
5656   unsigned Alignment = getLoadStoreAlignment(I);
5657   Value *Ptr = getLoadStorePointerOperand(I);
5658   unsigned AS = getLoadStoreAddressSpace(I);
5659   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5660 
5661   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5662          "Stride should be 1 or -1 for consecutive memory access");
5663   unsigned Cost = 0;
5664   if (Legal->isMaskRequired(I))
5665     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5666   else
5667     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5668 
5669   bool Reverse = ConsecutiveStride < 0;
5670   if (Reverse)
5671     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5672   return Cost;
5673 }
5674 
5675 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5676                                                          unsigned VF) {
5677   Type *ValTy = getMemInstValueType(I);
5678   Type *VectorTy = ToVectorTy(ValTy, VF);
5679   unsigned Alignment = getLoadStoreAlignment(I);
5680   unsigned AS = getLoadStoreAddressSpace(I);
5681   if (isa<LoadInst>(I)) {
5682     return TTI.getAddressComputationCost(ValTy) +
5683            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5684            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5685   }
5686   StoreInst *SI = cast<StoreInst>(I);
5687 
5688   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
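       // The cost below models extracting the last vector lane (element VF - 1)
       // of the stored value when it is not loop invariant; if the stored value
       // is loop invariant, no extract is needed.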
5689   return TTI.getAddressComputationCost(ValTy) +
5690          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5691          (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5692                                                Instruction::ExtractElement,
5693                                                VectorTy, VF - 1));
5694 }
5695 
5696 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5697                                                           unsigned VF) {
5698   Type *ValTy = getMemInstValueType(I);
5699   Type *VectorTy = ToVectorTy(ValTy, VF);
5700   unsigned Alignment = getLoadStoreAlignment(I);
5701   Value *Ptr = getLoadStorePointerOperand(I);
5702 
5703   return TTI.getAddressComputationCost(VectorTy) +
5704          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5705                                     Legal->isMaskRequired(I), Alignment);
5706 }
5707 
5708 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5709                                                             unsigned VF) {
5710   Type *ValTy = getMemInstValueType(I);
5711   Type *VectorTy = ToVectorTy(ValTy, VF);
5712   unsigned AS = getLoadStoreAddressSpace(I);
5713 
5714   auto Group = getInterleavedAccessGroup(I);
5715   assert(Group && "Fail to get an interleaved access group.");
5716 
5717   unsigned InterleaveFactor = Group->getFactor();
5718   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
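       // For example (illustrative numbers only): an interleave group with
       // factor 2 over i32 accesses at VF = 4 is costed as a single <8 x i32>
       // wide access.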
5719 
5720   // Holds the indices of existing members in an interleaved load group.
5721   // An interleaved store group doesn't need this as it doesn't allow gaps.
5722   SmallVector<unsigned, 4> Indices;
5723   if (isa<LoadInst>(I)) {
5724     for (unsigned i = 0; i < InterleaveFactor; i++)
5725       if (Group->getMember(i))
5726         Indices.push_back(i);
5727   }
5728 
5729   // Calculate the cost of the whole interleaved group.
5730   bool UseMaskForGaps =
5731       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5732   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5733       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5734       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5735 
5736   if (Group->isReverse()) {
5737     // TODO: Add support for reversed masked interleaved access.
5738     assert(!Legal->isMaskRequired(I) &&
5739            "Reverse masked interleaved access not supported.");
5740     Cost += Group->getNumMembers() *
5741             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5742   }
5743   return Cost;
5744 }
5745 
5746 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5747                                                               unsigned VF) {
5748   // Calculate the scalar cost only. The vectorization cost should already be
5749   // available at this point.
5750   if (VF == 1) {
5751     Type *ValTy = getMemInstValueType(I);
5752     unsigned Alignment = getLoadStoreAlignment(I);
5753     unsigned AS = getLoadStoreAddressSpace(I);
5754 
5755     return TTI.getAddressComputationCost(ValTy) +
5756            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5757   }
5758   return getWideningCost(I, VF);
5759 }
5760 
5761 LoopVectorizationCostModel::VectorizationCostTy
5762 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5763   // If we know that this instruction will remain uniform, check the cost of
5764   // the scalar version.
5765   if (isUniformAfterVectorization(I, VF))
5766     VF = 1;
5767 
5768   if (VF > 1 && isProfitableToScalarize(I, VF))
5769     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5770 
5771   // Forced scalars do not have any scalarization overhead.
5772   auto ForcedScalar = ForcedScalars.find(VF);
5773   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5774     auto InstSet = ForcedScalar->second;
5775     if (InstSet.find(I) != InstSet.end())
5776       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5777   }
5778 
5779   Type *VectorTy;
5780   unsigned C = getInstructionCost(I, VF, VectorTy);
5781 
5782   bool TypeNotScalarized =
5783       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5784   return VectorizationCostTy(C, TypeNotScalarized);
5785 }
5786 
5787 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5788                                                               unsigned VF) {
5789 
5790   if (VF == 1)
5791     return 0;
5792 
5793   unsigned Cost = 0;
5794   Type *RetTy = ToVectorTy(I->getType(), VF);
5795   if (!RetTy->isVoidTy() &&
5796       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5797     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5798 
5799   // Some targets keep addresses scalar.
5800   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5801     return Cost;
5802 
5803   // Some targets support efficient element stores.
5804   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5805     return Cost;
5806 
5807   // Collect operands to consider.
5808   CallInst *CI = dyn_cast<CallInst>(I);
5809   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5810 
5811   // Skip operands that do not require extraction/scalarization and do not incur
5812   // any overhead.
5813   return Cost + TTI.getOperandsScalarizationOverhead(
5814                     filterExtractingOperands(Ops, VF), VF);
5815 }
5816 
5817 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5818   if (VF == 1)
5819     return;
5820   NumPredStores = 0;
5821   for (BasicBlock *BB : TheLoop->blocks()) {
5822     // For each instruction in the old loop.
5823     for (Instruction &I : *BB) {
5824       Value *Ptr =  getLoadStorePointerOperand(&I);
5825       if (!Ptr)
5826         continue;
5827 
5828       // TODO: We should generate better code and update the cost model for
5829       // predicated uniform stores. Today they are treated as any other
5830       // predicated store (see added test cases in
5831       // invariant-store-vectorization.ll).
5832       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5833         NumPredStores++;
5834 
5835       if (Legal->isUniform(Ptr) &&
5836           // Conditional loads and stores should be scalarized and predicated.
5837           // isScalarWithPredication cannot be used here since masked
5838           // gather/scatters are not considered scalar with predication.
5839           !Legal->blockNeedsPredication(I.getParent())) {
5840         // TODO: Avoid replicating loads and stores instead of
5841         // relying on instcombine to remove them.
5842         // Load: Scalar load + broadcast
5843         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5844         unsigned Cost = getUniformMemOpCost(&I, VF);
5845         setWideningDecision(&I, VF, CM_Scalarize, Cost);
5846         continue;
5847       }
5848 
5849       // We assume that widening is the best solution when possible.
5850       if (memoryInstructionCanBeWidened(&I, VF)) {
5851         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5852         int ConsecutiveStride =
5853                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5854         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5855                "Expected consecutive stride.");
5856         InstWidening Decision =
5857             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5858         setWideningDecision(&I, VF, Decision, Cost);
5859         continue;
5860       }
5861 
5862       // Choose between Interleaving, Gather/Scatter or Scalarization.
5863       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5864       unsigned NumAccesses = 1;
5865       if (isAccessInterleaved(&I)) {
5866         auto Group = getInterleavedAccessGroup(&I);
5867         assert(Group && "Fail to get an interleaved access group.");
5868 
5869         // Make one decision for the whole group.
5870         if (getWideningDecision(&I, VF) != CM_Unknown)
5871           continue;
5872 
5873         NumAccesses = Group->getNumMembers();
5874         if (interleavedAccessCanBeWidened(&I, VF))
5875           InterleaveCost = getInterleaveGroupCost(&I, VF);
5876       }
5877 
5878       unsigned GatherScatterCost =
5879           isLegalGatherOrScatter(&I)
5880               ? getGatherScatterCost(&I, VF) * NumAccesses
5881               : std::numeric_limits<unsigned>::max();
5882 
5883       unsigned ScalarizationCost =
5884           getMemInstScalarizationCost(&I, VF) * NumAccesses;
5885 
5886       // Choose the best option for the current VF, record the decision,
5887       // and use it during vectorization.
5888       unsigned Cost;
5889       InstWidening Decision;
5890       if (InterleaveCost <= GatherScatterCost &&
5891           InterleaveCost < ScalarizationCost) {
5892         Decision = CM_Interleave;
5893         Cost = InterleaveCost;
5894       } else if (GatherScatterCost < ScalarizationCost) {
5895         Decision = CM_GatherScatter;
5896         Cost = GatherScatterCost;
5897       } else {
5898         Decision = CM_Scalarize;
5899         Cost = ScalarizationCost;
5900       }
5901       // If the instruction belongs to an interleave group, the whole group
5902       // receives the same decision. The whole group is charged the cost, but
5903       // the cost will actually be assigned to one instruction.
5904       if (auto Group = getInterleavedAccessGroup(&I))
5905         setWideningDecision(Group, VF, Decision, Cost);
5906       else
5907         setWideningDecision(&I, VF, Decision, Cost);
5908     }
5909   }
5910 
5911   // Make sure that any load of address and any other address computation
5912   // remains scalar unless there is gather/scatter support. This avoids
5913   // inevitable extracts into address registers, and also has the benefit of
5914   // activating LSR more, since that pass can't optimize vectorized
5915   // addresses.
5916   if (TTI.prefersVectorizedAddressing())
5917     return;
5918 
5919   // Start with all scalar pointer uses.
5920   SmallPtrSet<Instruction *, 8> AddrDefs;
5921   for (BasicBlock *BB : TheLoop->blocks())
5922     for (Instruction &I : *BB) {
5923       Instruction *PtrDef =
5924         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5925       if (PtrDef && TheLoop->contains(PtrDef) &&
5926           getWideningDecision(&I, VF) != CM_GatherScatter)
5927         AddrDefs.insert(PtrDef);
5928     }
5929 
5930   // Add all instructions used to generate the addresses.
5931   SmallVector<Instruction *, 4> Worklist;
5932   for (auto *I : AddrDefs)
5933     Worklist.push_back(I);
5934   while (!Worklist.empty()) {
5935     Instruction *I = Worklist.pop_back_val();
5936     for (auto &Op : I->operands())
5937       if (auto *InstOp = dyn_cast<Instruction>(Op))
5938         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5939             AddrDefs.insert(InstOp).second)
5940           Worklist.push_back(InstOp);
5941   }
5942 
5943   for (auto *I : AddrDefs) {
5944     if (isa<LoadInst>(I)) {
5945       // Setting the desired widening decision should ideally be handled by
5946       // the cost functions, but since this involves the task of finding out
5947       // if the loaded register is involved in an address computation, it is
5948       // instead changed here when we know this is the case.
5949       InstWidening Decision = getWideningDecision(I, VF);
5950       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5951         // Scalarize a widened load of address.
5952         setWideningDecision(I, VF, CM_Scalarize,
5953                             (VF * getMemoryInstructionCost(I, 1)));
5954       else if (auto Group = getInterleavedAccessGroup(I)) {
5955         // Scalarize an interleave group of address loads.
5956         for (unsigned I = 0; I < Group->getFactor(); ++I) {
5957           if (Instruction *Member = Group->getMember(I))
5958             setWideningDecision(Member, VF, CM_Scalarize,
5959                                 (VF * getMemoryInstructionCost(Member, 1)));
5960         }
5961       }
5962     } else
5963       // Make sure I gets scalarized and is given a cost estimate without
5964       // scalarization overhead.
5965       ForcedScalars[VF].insert(I);
5966   }
5967 }
5968 
5969 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5970                                                         unsigned VF,
5971                                                         Type *&VectorTy) {
5972   Type *RetTy = I->getType();
5973   if (canTruncateToMinimalBitwidth(I, VF))
5974     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5975   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5976   auto SE = PSE.getSE();
5977 
5978   // TODO: We need to estimate the cost of intrinsic calls.
5979   switch (I->getOpcode()) {
5980   case Instruction::GetElementPtr:
5981     // We mark this instruction as zero-cost because the cost of GEPs in
5982     // vectorized code depends on whether the corresponding memory instruction
5983     // is scalarized or not. Therefore, we handle GEPs with the memory
5984     // instruction cost.
5985     return 0;
5986   case Instruction::Br: {
5987     // In cases of scalarized and predicated instructions, there will be VF
5988     // predicated blocks in the vectorized loop. Each branch around these
5989     // blocks also requires an extract of its vector compare i1 element.
5990     bool ScalarPredicatedBB = false;
5991     BranchInst *BI = cast<BranchInst>(I);
5992     if (VF > 1 && BI->isConditional() &&
5993         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
5994              PredicatedBBsAfterVectorization.end() ||
5995          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
5996              PredicatedBBsAfterVectorization.end()))
5997       ScalarPredicatedBB = true;
5998 
5999     if (ScalarPredicatedBB) {
6000       // Return cost for branches around scalarized and predicated blocks.
6001       Type *Vec_i1Ty =
6002           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6003       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6004               (TTI.getCFInstrCost(Instruction::Br) * VF));
6005     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6006       // The back-edge branch will remain, as will all scalar branches.
6007       return TTI.getCFInstrCost(Instruction::Br);
6008     else
6009       // This branch will be eliminated by if-conversion.
6010       return 0;
6011     // Note: We currently assume zero cost for an unconditional branch inside
6012     // a predicated block since it will become a fall-through, although we
6013     // may decide in the future to call TTI for all branches.
6014   }
6015   case Instruction::PHI: {
6016     auto *Phi = cast<PHINode>(I);
6017 
6018     // First-order recurrences are replaced by vector shuffles inside the loop.
6019     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6020     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6021       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6022                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6023 
6024     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6025     // converted into select instructions. We require N - 1 selects per phi
6026     // node, where N is the number of incoming values.
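         // For example (illustrative): a phi with three incoming values is
         // lowered to 3 - 1 = 2 vector selects.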
6027     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6028       return (Phi->getNumIncomingValues() - 1) *
6029              TTI.getCmpSelInstrCost(
6030                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6031                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6032 
6033     return TTI.getCFInstrCost(Instruction::PHI);
6034   }
6035   case Instruction::UDiv:
6036   case Instruction::SDiv:
6037   case Instruction::URem:
6038   case Instruction::SRem:
6039     // If we have a predicated instruction, it may not be executed for each
6040     // vector lane. Get the scalarization cost and scale this amount by the
6041     // probability of executing the predicated block. If the instruction is not
6042     // predicated, we fall through to the next case.
6043     if (VF > 1 && isScalarWithPredication(I)) {
6044       unsigned Cost = 0;
6045 
6046       // These instructions have a non-void type, so account for the phi nodes
6047       // that we will create. This cost is likely to be zero. The phi node
6048       // cost, if any, should be scaled by the block probability because it
6049       // models a copy at the end of each predicated block.
6050       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6051 
6052       // The cost of the non-predicated instruction.
6053       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6054 
6055       // The cost of insertelement and extractelement instructions needed for
6056       // scalarization.
6057       Cost += getScalarizationOverhead(I, VF);
6058 
6059       // Scale the cost by the probability of executing the predicated blocks.
6060       // This assumes the predicated block for each vector lane is equally
6061       // likely.
6062       return Cost / getReciprocalPredBlockProb();
6063     }
6064     LLVM_FALLTHROUGH;
6065   case Instruction::Add:
6066   case Instruction::FAdd:
6067   case Instruction::Sub:
6068   case Instruction::FSub:
6069   case Instruction::Mul:
6070   case Instruction::FMul:
6071   case Instruction::FDiv:
6072   case Instruction::FRem:
6073   case Instruction::Shl:
6074   case Instruction::LShr:
6075   case Instruction::AShr:
6076   case Instruction::And:
6077   case Instruction::Or:
6078   case Instruction::Xor: {
6079     // Since we will replace the stride with 1, the multiplication should go away.
6080     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6081       return 0;
6082     // Certain instructions can be cheaper to vectorize if they have a constant
6083     // second vector operand. One example of this is shifts on x86.
6084     Value *Op2 = I->getOperand(1);
6085     TargetTransformInfo::OperandValueProperties Op2VP;
6086     TargetTransformInfo::OperandValueKind Op2VK =
6087         TTI.getOperandInfo(Op2, Op2VP);
6088     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6089       Op2VK = TargetTransformInfo::OK_UniformValue;
6090 
6091     SmallVector<const Value *, 4> Operands(I->operand_values());
6092     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6093     return N * TTI.getArithmeticInstrCost(
6094                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6095                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
6096   }
6097   case Instruction::FNeg: {
6098     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6099     return N * TTI.getArithmeticInstrCost(
6100                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6101                    TargetTransformInfo::OK_AnyValue,
6102                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6103                    I->getOperand(0));
6104   }
6105   case Instruction::Select: {
6106     SelectInst *SI = cast<SelectInst>(I);
6107     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6108     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6109     Type *CondTy = SI->getCondition()->getType();
6110     if (!ScalarCond)
6111       CondTy = VectorType::get(CondTy, VF);
6112 
6113     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6114   }
6115   case Instruction::ICmp:
6116   case Instruction::FCmp: {
6117     Type *ValTy = I->getOperand(0)->getType();
6118     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6119     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6120       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6121     VectorTy = ToVectorTy(ValTy, VF);
6122     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6123   }
6124   case Instruction::Store:
6125   case Instruction::Load: {
6126     unsigned Width = VF;
6127     if (Width > 1) {
6128       InstWidening Decision = getWideningDecision(I, Width);
6129       assert(Decision != CM_Unknown &&
6130              "CM decision should be taken at this point");
6131       if (Decision == CM_Scalarize)
6132         Width = 1;
6133     }
6134     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6135     return getMemoryInstructionCost(I, VF);
6136   }
6137   case Instruction::ZExt:
6138   case Instruction::SExt:
6139   case Instruction::FPToUI:
6140   case Instruction::FPToSI:
6141   case Instruction::FPExt:
6142   case Instruction::PtrToInt:
6143   case Instruction::IntToPtr:
6144   case Instruction::SIToFP:
6145   case Instruction::UIToFP:
6146   case Instruction::Trunc:
6147   case Instruction::FPTrunc:
6148   case Instruction::BitCast: {
6149     // We optimize the truncation of induction variables having constant
6150     // integer steps. The cost of these truncations is the same as the scalar
6151     // operation.
6152     if (isOptimizableIVTruncate(I, VF)) {
6153       auto *Trunc = cast<TruncInst>(I);
6154       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6155                                   Trunc->getSrcTy(), Trunc);
6156     }
6157 
6158     Type *SrcScalarTy = I->getOperand(0)->getType();
6159     Type *SrcVecTy =
6160         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6161     if (canTruncateToMinimalBitwidth(I, VF)) {
6162       // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
6164       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6165       //
6166       // Calculate the modified src and dest types.
6167       Type *MinVecTy = VectorTy;
6168       if (I->getOpcode() == Instruction::Trunc) {
6169         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6170         VectorTy =
6171             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6172       } else if (I->getOpcode() == Instruction::ZExt ||
6173                  I->getOpcode() == Instruction::SExt) {
6174         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6175         VectorTy =
6176             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6177       }
6178     }
6179 
6180     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6181     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6182   }
6183   case Instruction::Call: {
6184     bool NeedToScalarize;
6185     CallInst *CI = cast<CallInst>(I);
6186     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6187     if (getVectorIntrinsicIDForCall(CI, TLI))
6188       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6189     return CallCost;
6190   }
6191   default:
6192     // The cost of executing VF copies of the scalar instruction. This opcode
6193     // is unknown. Assume that it is the same as 'mul'.
6194     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6195            getScalarizationOverhead(I, VF);
6196   } // end of switch.
6197 }
6198 
6199 char LoopVectorize::ID = 0;
6200 
6201 static const char lv_name[] = "Loop Vectorization";
6202 
6203 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6204 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6205 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6206 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6207 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6208 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6209 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6210 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6211 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6212 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6213 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6214 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6215 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6216 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6217 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6218 
6219 namespace llvm {
6220 
6221 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6222 
6223 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6224                               bool VectorizeOnlyWhenForced) {
6225   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6226 }
6227 
6228 } // end namespace llvm
6229 
6230 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6231   // Check if the pointer operand of a load or store instruction is
6232   // consecutive.
6233   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6234     return Legal->isConsecutivePtr(Ptr);
6235   return false;
6236 }
6237 
6238 void LoopVectorizationCostModel::collectValuesToIgnore() {
6239   // Ignore ephemeral values.
6240   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6241 
6242   // Ignore type-promoting instructions we identified during reduction
6243   // detection.
6244   for (auto &Reduction : *Legal->getReductionVars()) {
6245     RecurrenceDescriptor &RedDes = Reduction.second;
6246     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6247     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6248   }
6249   // Ignore type-casting instructions we identified during induction
6250   // detection.
6251   for (auto &Induction : *Legal->getInductionVars()) {
6252     InductionDescriptor &IndDes = Induction.second;
6253     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6254     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6255   }
6256 }
6257 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do that yet because VPlan does not have a
// cost model that can choose which plan to execute if more than one is
// generated.
6263 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6264                                  LoopVectorizationCostModel &CM) {
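  // Pick a VF that fills the widest vector register; e.g., 256-bit registers
  // with a widest element type of 32 bits yield VF = 8.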
6265   unsigned WidestType;
6266   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6267   return WidestVectorRegBits / WidestType;
6268 }
6269 
6270 VectorizationFactor
6271 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6272   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
6277   if (!OrigLoop->empty()) {
6278     // If the user doesn't provide a vectorization factor, determine a
6279     // reasonable one.
6280     if (!UserVF) {
6281       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6282       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6283 
6284       // Make sure we have a VF > 1 for stress testing.
6285       if (VPlanBuildStressTest && VF < 2) {
6286         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6287                           << "overriding computed VF.\n");
6288         VF = 4;
6289       }
6290     }
6291     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6292     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6293     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6294                       << " to build VPlans.\n");
6295     buildVPlans(VF, VF);
6296 
6297     // For VPlan build stress testing, we bail out after VPlan construction.
6298     if (VPlanBuildStressTest)
6299       return VectorizationFactor::Disabled();
6300 
6301     return {VF, 0};
6302   }
6303 
6304   LLVM_DEBUG(
6305       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6306                 "VPlan-native path.\n");
6307   return VectorizationFactor::Disabled();
6308 }
6309 
6310 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6311   assert(OrigLoop->empty() && "Inner loop expected.");
6312   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6314     return None;
6315 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
6317   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6318       !useMaskedInterleavedAccesses(*TTI)) {
6319     LLVM_DEBUG(
6320         dbgs()
6321         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6322            "which requires masked-interleaved support.\n");
6323     CM.InterleaveInfo.reset();
6324   }
6325 
6326   if (UserVF) {
6327     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6328     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6329     // Collect the instructions (and their associated costs) that will be more
6330     // profitable to scalarize.
6331     CM.selectUserVectorizationFactor(UserVF);
6332     buildVPlansWithVPRecipes(UserVF, UserVF);
6333     LLVM_DEBUG(printPlans(dbgs()));
6334     return {{UserVF, 0}};
6335   }
6336 
6337   unsigned MaxVF = MaybeMaxVF.getValue();
6338   assert(MaxVF != 0 && "MaxVF is zero.");
6339 
6340   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6341     // Collect Uniform and Scalar instructions after vectorization with VF.
6342     CM.collectUniformsAndScalars(VF);
6343 
6344     // Collect the instructions (and their associated costs) that will be more
6345     // profitable to scalarize.
6346     if (VF > 1)
6347       CM.collectInstsToScalarize(VF);
6348   }
6349 
6350   buildVPlansWithVPRecipes(1, MaxVF);
6351   LLVM_DEBUG(printPlans(dbgs()));
6352   if (MaxVF == 1)
6353     return VectorizationFactor::Disabled();
6354 
6355   // Select the optimal vectorization factor.
6356   return CM.selectVectorizationFactor(MaxVF);
6357 }
6358 
6359 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6360   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6361                     << '\n');
6362   BestVF = VF;
6363   BestUF = UF;
6364 
6365   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6366     return !Plan->hasVF(VF);
6367   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6369 }
6370 
6371 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6372                                            DominatorTree *DT) {
6373   // Perform the actual loop transformation.
6374 
6375   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6376   VPCallbackILV CallbackILV(ILV);
6377 
6378   VPTransformState State{BestVF, BestUF,      LI,
6379                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6380                          &ILV,   CallbackILV};
6381   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6382   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6383 
6384   //===------------------------------------------------===//
6385   //
  // Notice: any optimization or new instruction that goes
6387   // into the code below should also be implemented in
6388   // the cost-model.
6389   //
6390   //===------------------------------------------------===//
6391 
6392   // 2. Copy and widen instructions from the old loop into the new loop.
6393   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6394   VPlans.front()->execute(&State);
6395 
6396   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6397   //    predication, updating analyses.
6398   ILV.fixVectorizedLoop();
6399 }
6400 
6401 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6402     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6403   BasicBlock *Latch = OrigLoop->getLoopLatch();
6404 
6405   // We create new control-flow for the vectorized loop, so the original
6406   // condition will be dead after vectorization if it's only used by the
6407   // branch.
6408   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6409   if (Cmp && Cmp->hasOneUse())
6410     DeadInstructions.insert(Cmp);
6411 
6412   // We create new "steps" for induction variable updates to which the original
6413   // induction variables map. An original update instruction will be dead if
6414   // all its users except the induction variable are dead.
6415   for (auto &Induction : *Legal->getInductionVars()) {
6416     PHINode *Ind = Induction.first;
6417     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6418     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6419           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6420                                  DeadInstructions.end();
6421         }))
6422       DeadInstructions.insert(IndUpdate);
6423 
    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime test
    // guarding the vectorized loop, the value of the phi and the casted value
    // of the phi are the same. The last instruction in this casting chain will
    // get its scalar/vector/widened def from the scalar/vector/widened def of
    // the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
6432     InductionDescriptor &IndDes = Induction.second;
6433     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6434     DeadInstructions.insert(Casts.begin(), Casts.end());
6435   }
6436 }
6437 
6438 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6439 
6440 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6441 
6442 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6443                                         Instruction::BinaryOps BinOp) {
6444   // When unrolling and the VF is 1, we only need to add a simple scalar.
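  // For example, with StartIdx = 2 and step %s this produces Val + 2 * %s,
  // using the FP variants of the operations when the type is floating point.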
6445   Type *Ty = Val->getType();
6446   assert(!Ty->isVectorTy() && "Val must be a scalar");
6447 
6448   if (Ty->isFloatingPointTy()) {
6449     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6450 
6451     // Floating point operations had to be 'fast' to enable the unrolling.
6452     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6453     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6454   }
6455   Constant *C = ConstantInt::get(Ty, StartIdx);
6456   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6457 }
6458 
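// Add llvm.loop.unroll.runtime.disable metadata to the loop ID, unless
// metadata disabling unrolling is already present, so that the vectorized
// loop is not additionally unrolled at runtime by a later pass.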
6459 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6460   SmallVector<Metadata *, 4> MDs;
6461   // Reserve first location for self reference to the LoopID metadata node.
6462   MDs.push_back(nullptr);
6463   bool IsUnrollMetadata = false;
6464   MDNode *LoopID = L->getLoopID();
6465   if (LoopID) {
6466     // First find existing loop unrolling disable metadata.
6467     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6468       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6469       if (MD) {
6470         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6471         IsUnrollMetadata =
6472             S && S->getString().startswith("llvm.loop.unroll.disable");
6473       }
6474       MDs.push_back(LoopID->getOperand(i));
6475     }
6476   }
6477 
6478   if (!IsUnrollMetadata) {
6479     // Add runtime unroll disable metadata.
6480     LLVMContext &Context = L->getHeader()->getContext();
6481     SmallVector<Metadata *, 1> DisableOperands;
6482     DisableOperands.push_back(
6483         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6484     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6485     MDs.push_back(DisableNode);
6486     MDNode *NewLoopID = MDNode::get(Context, MDs);
6487     // Set operand 0 to refer to the loop id itself.
6488     NewLoopID->replaceOperandWith(0, NewLoopID);
6489     L->setLoopID(NewLoopID);
6490   }
6491 }
6492 
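/// Evaluate \p Predicate over the VF's in \p Range, starting at Range.Start.
/// If the predicate changes value at some VF, clamp Range.End to that VF so
/// that the whole (clamped) range shares the decision made at Range.Start,
/// which is the value returned.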
6493 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6494     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6495   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6496   bool PredicateAtRangeStart = Predicate(Range.Start);
6497 
6498   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6499     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6500       Range.End = TmpVF;
6501       break;
6502     }
6503 
6504   return PredicateAtRangeStart;
6505 }
6506 
6507 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6508 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6509 /// of VF's starting at a given VF and extending it as much as possible. Each
6510 /// vectorization decision can potentially shorten this sub-range during
6511 /// buildVPlan().
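/// For example, if the feasible VF's are {1, 2, 4, 8} and some decision
/// changes between VF = 2 and VF = 4, two VPlans may be built: one covering
/// {1, 2} and another covering {4, 8}.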
6512 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6513   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6514     VFRange SubRange = {VF, MaxVF + 1};
6515     VPlans.push_back(buildVPlan(SubRange));
6516     VF = SubRange.End;
6517   }
6518 }
6519 
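// Compute the mask controlling execution along the edge from Src to Dst: for
// a conditional terminator this is the block-in mask of Src AND'ed with the
// branch condition (negated when Dst is the false successor); otherwise it is
// simply Src's block-in mask. Results are cached per (Src, Dst) edge.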
6520 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6521                                          VPlanPtr &Plan) {
6522   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6523 
6524   // Look for cached value.
6525   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6526   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6527   if (ECEntryIt != EdgeMaskCache.end())
6528     return ECEntryIt->second;
6529 
6530   VPValue *SrcMask = createBlockInMask(Src, Plan);
6531 
6532   // The terminator has to be a branch inst!
6533   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6534   assert(BI && "Unexpected terminator found");
6535 
6536   if (!BI->isConditional())
6537     return EdgeMaskCache[Edge] = SrcMask;
6538 
6539   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6540   assert(EdgeMask && "No Edge Mask found for condition");
6541 
6542   if (BI->getSuccessor(0) != Dst)
6543     EdgeMask = Builder.createNot(EdgeMask);
6544 
6545   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6546     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6547 
6548   return EdgeMaskCache[Edge] = EdgeMask;
6549 }
6550 
6551 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6552   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6553 
6554   // Look for cached value.
6555   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6556   if (BCEntryIt != BlockMaskCache.end())
6557     return BCEntryIt->second;
6558 
6559   // All-one mask is modelled as no-mask following the convention for masked
6560   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6561   VPValue *BlockMask = nullptr;
6562 
6563   if (OrigLoop->getHeader() == BB) {
6564     if (!CM.blockNeedsPredication(BB))
6565       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6566 
6567     // Introduce the early-exit compare IV <= BTC to form header block mask.
6568     // This is used instead of IV < TC because TC may wrap, unlike BTC.
6569     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6570     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6571     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6572     return BlockMaskCache[BB] = BlockMask;
6573   }
6574 
6575   // This is the block mask. We OR all incoming edges.
6576   for (auto *Predecessor : predecessors(BB)) {
6577     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6578     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6579       return BlockMaskCache[BB] = EdgeMask;
6580 
6581     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6582       BlockMask = EdgeMask;
6583       continue;
6584     }
6585 
6586     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6587   }
6588 
6589   return BlockMaskCache[BB] = BlockMask;
6590 }
6591 
6592 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6593                                                            VFRange &Range,
6594                                                            VPlanPtr &Plan) {
6595   const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6596   if (!IG)
6597     return nullptr;
6598 
6599   // Now check if IG is relevant for VF's in the given range.
6600   auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6601     return [=](unsigned VF) -> bool {
6602       return (VF >= 2 && // Query is illegal for VF == 1
6603               CM.getWideningDecision(I, VF) ==
6604                   LoopVectorizationCostModel::CM_Interleave);
6605     };
6606   };
6607   if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6608     return nullptr;
6609 
  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
  // range. If it's the primary member of the IG, construct a
  // VPInterleaveRecipe. Otherwise, it's an adjunct member of the IG; do not
  // construct any Recipe for it.
6613   assert(I == IG->getInsertPos() &&
6614          "Generating a recipe for an adjunct member of an interleave group");
6615 
6616   VPValue *Mask = nullptr;
6617   if (Legal->isMaskRequired(I))
6618     Mask = createBlockInMask(I->getParent(), Plan);
6619 
6620   return new VPInterleaveRecipe(IG, Mask);
6621 }
6622 
6623 VPWidenMemoryInstructionRecipe *
6624 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6625                                   VPlanPtr &Plan) {
6626   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6627     return nullptr;
6628 
6629   auto willWiden = [&](unsigned VF) -> bool {
6630     if (VF == 1)
6631       return false;
6632     if (CM.isScalarAfterVectorization(I, VF) ||
6633         CM.isProfitableToScalarize(I, VF))
6634       return false;
6635     LoopVectorizationCostModel::InstWidening Decision =
6636         CM.getWideningDecision(I, VF);
6637     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6638            "CM decision should be taken at this point.");
6639     assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6640            "Interleave memory opportunity should be caught earlier.");
6641     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6642   };
6643 
6644   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6645     return nullptr;
6646 
6647   VPValue *Mask = nullptr;
6648   if (Legal->isMaskRequired(I))
6649     Mask = createBlockInMask(I->getParent(), Plan);
6650 
6651   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6652 }
6653 
6654 VPWidenIntOrFpInductionRecipe *
6655 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6656   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6657     // Check if this is an integer or fp induction. If so, build the recipe that
6658     // produces its scalar and vector values.
6659     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6660     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6661         II.getKind() == InductionDescriptor::IK_FpInduction)
6662       return new VPWidenIntOrFpInductionRecipe(Phi);
6663 
6664     return nullptr;
6665   }
6666 
6667   // Optimize the special case where the source is a constant integer
6668   // induction variable. Notice that we can only optimize the 'trunc' case
6669   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6670   // (c) other casts depend on pointer size.
6671 
6672   // Determine whether \p K is a truncation based on an induction variable that
6673   // can be optimized.
6674   auto isOptimizableIVTruncate =
6675       [&](Instruction *K) -> std::function<bool(unsigned)> {
6676     return
6677         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6678   };
6679 
6680   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6681                                isOptimizableIVTruncate(I), Range))
6682     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6683                                              cast<TruncInst>(I));
6684   return nullptr;
6685 }
6686 
6687 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6688   PHINode *Phi = dyn_cast<PHINode>(I);
6689   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6690     return nullptr;
6691 
6692   // We know that all PHIs in non-header blocks are converted into selects, so
6693   // we don't have to worry about the insertion order and we can just use the
6694   // builder. At this point we generate the predication tree. There may be
6695   // duplications since this is a simple recursive scan, but future
6696   // optimizations will clean it up.
6697 
6698   SmallVector<VPValue *, 2> Masks;
6699   unsigned NumIncoming = Phi->getNumIncomingValues();
6700   for (unsigned In = 0; In < NumIncoming; In++) {
6701     VPValue *EdgeMask =
6702       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6703     assert((EdgeMask || NumIncoming == 1) &&
6704            "Multiple predecessors with one having a full mask");
6705     if (EdgeMask)
6706       Masks.push_back(EdgeMask);
6707   }
6708   return new VPBlendRecipe(Phi, Masks);
6709 }
6710 
6711 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6712                                  VFRange &Range) {
6713 
6714   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6715       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6716 
6717   if (IsPredicated)
6718     return false;
6719 
6720   auto IsVectorizableOpcode = [](unsigned Opcode) {
6721     switch (Opcode) {
6722     case Instruction::Add:
6723     case Instruction::And:
6724     case Instruction::AShr:
6725     case Instruction::BitCast:
6726     case Instruction::Br:
6727     case Instruction::Call:
6728     case Instruction::FAdd:
6729     case Instruction::FCmp:
6730     case Instruction::FDiv:
6731     case Instruction::FMul:
6732     case Instruction::FNeg:
6733     case Instruction::FPExt:
6734     case Instruction::FPToSI:
6735     case Instruction::FPToUI:
6736     case Instruction::FPTrunc:
6737     case Instruction::FRem:
6738     case Instruction::FSub:
6739     case Instruction::GetElementPtr:
6740     case Instruction::ICmp:
6741     case Instruction::IntToPtr:
6742     case Instruction::Load:
6743     case Instruction::LShr:
6744     case Instruction::Mul:
6745     case Instruction::Or:
6746     case Instruction::PHI:
6747     case Instruction::PtrToInt:
6748     case Instruction::SDiv:
6749     case Instruction::Select:
6750     case Instruction::SExt:
6751     case Instruction::Shl:
6752     case Instruction::SIToFP:
6753     case Instruction::SRem:
6754     case Instruction::Store:
6755     case Instruction::Sub:
6756     case Instruction::Trunc:
6757     case Instruction::UDiv:
6758     case Instruction::UIToFP:
6759     case Instruction::URem:
6760     case Instruction::Xor:
6761     case Instruction::ZExt:
6762       return true;
6763     }
6764     return false;
6765   };
6766 
6767   if (!IsVectorizableOpcode(I->getOpcode()))
6768     return false;
6769 
6770   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6771     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6772     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6773                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6774       return false;
6775   }
6776 
6777   auto willWiden = [&](unsigned VF) -> bool {
6778     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6779                              CM.isProfitableToScalarize(I, VF)))
6780       return false;
6781     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6782       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The call may be scalarized depending on the VF. NeedToScalarize
      // indicates whether the vectorized call would have to be scalarized;
      // widen the call if a vector intrinsic is at least as cheap as the
      // vectorized call, or if no scalarization is needed at all.
6787       bool NeedToScalarize;
6788       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6789       bool UseVectorIntrinsic =
6790           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6791       return UseVectorIntrinsic || !NeedToScalarize;
6792     }
6793     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6794       assert(CM.getWideningDecision(I, VF) ==
6795                  LoopVectorizationCostModel::CM_Scalarize &&
6796              "Memory widening decisions should have been taken care by now");
6797       return false;
6798     }
6799     return true;
6800   };
6801 
6802   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6803     return false;
6804 
6805   // Success: widen this instruction. We optimize the common case where
6806   // consecutive instructions can be represented by a single recipe.
6807   if (!VPBB->empty()) {
6808     VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6809     if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6810       return true;
6811   }
6812 
6813   VPBB->appendRecipe(new VPWidenRecipe(I));
6814   return true;
6815 }
6816 
6817 VPBasicBlock *VPRecipeBuilder::handleReplication(
6818     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6819     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6820     VPlanPtr &Plan) {
6821   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6822       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6823       Range);
6824 
6825   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6826       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6827 
6828   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6829 
6830   // Find if I uses a predicated instruction. If so, it will use its scalar
6831   // value. Avoid hoisting the insert-element which packs the scalar value into
6832   // a vector value, as that happens iff all users use the vector value.
6833   for (auto &Op : I->operands())
6834     if (auto *PredInst = dyn_cast<Instruction>(Op))
6835       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6836         PredInst2Recipe[PredInst]->setAlsoPack(false);
6837 
6838   // Finalize the recipe for Instr, first if it is not predicated.
6839   if (!IsPredicated) {
6840     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6841     VPBB->appendRecipe(Recipe);
6842     return VPBB;
6843   }
6844   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6845   assert(VPBB->getSuccessors().empty() &&
6846          "VPBB has successors when handling predicated replication.");
6847   // Record predicated instructions for above packing optimizations.
6848   PredInst2Recipe[I] = Recipe;
6849   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6850   VPBlockUtils::insertBlockAfter(Region, VPBB);
6851   auto *RegSucc = new VPBasicBlock();
6852   VPBlockUtils::insertBlockAfter(RegSucc, Region);
6853   return RegSucc;
6854 }
6855 
6856 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6857                                                       VPRecipeBase *PredRecipe,
6858                                                       VPlanPtr &Plan) {
6859   // Instructions marked for predication are replicated and placed under an
6860   // if-then construct to prevent side-effects.
6861 
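  // The region built below is a triangle: the entry block branches on the
  // block mask either into the predicated "if" block or directly to the
  // "continue" block; the "if" block falls through to "continue", where a phi
  // recipe (for non-void instructions) merges the predicated result:
  //
  //   pred.<opcode>.entry
  //     |           \
  //     |      pred.<opcode>.if
  //     |           /
  //   pred.<opcode>.continue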
6862   // Generate recipes to compute the block mask for this region.
6863   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6864 
6865   // Build the triangular if-then region.
6866   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6867   assert(Instr->getParent() && "Predicated instruction not in any basic block");
6868   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6869   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6870   auto *PHIRecipe =
6871       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6872   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6873   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6874   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6875 
6876   // Note: first set Entry as region entry and then connect successors starting
6877   // from it in order, to propagate the "parent" of each VPBasicBlock.
6878   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6879   VPBlockUtils::connectBlocks(Pred, Exit);
6880 
6881   return Region;
6882 }
6883 
6884 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6885                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
6886   VPRecipeBase *Recipe = nullptr;
6887   // Check if Instr should belong to an interleave memory recipe, or already
6888   // does. In the latter case Instr is irrelevant.
6889   if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6890     VPBB->appendRecipe(Recipe);
6891     return true;
6892   }
6893 
6894   // Check if Instr is a memory operation that should be widened.
6895   if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6896     VPBB->appendRecipe(Recipe);
6897     return true;
6898   }
6899 
6900   // Check if Instr should form some PHI recipe.
6901   if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6902     VPBB->appendRecipe(Recipe);
6903     return true;
6904   }
6905   if ((Recipe = tryToBlend(Instr, Plan))) {
6906     VPBB->appendRecipe(Recipe);
6907     return true;
6908   }
6909   if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6910     VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6911     return true;
6912   }
6913 
6914   // Check if Instr is to be widened by a general VPWidenRecipe, after
6915   // having first checked for specific widening recipes that deal with
6916   // Interleave Groups, Inductions and Phi nodes.
6917   if (tryToWiden(Instr, VPBB, Range))
6918     return true;
6919 
6920   return false;
6921 }
6922 
6923 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6924                                                         unsigned MaxVF) {
6925   assert(OrigLoop->empty() && "Inner loop expected.");
6926 
6927   // Collect conditions feeding internal conditional branches; they need to be
6928   // represented in VPlan for it to model masking.
6929   SmallPtrSet<Value *, 1> NeedDef;
6930 
6931   auto *Latch = OrigLoop->getLoopLatch();
6932   for (BasicBlock *BB : OrigLoop->blocks()) {
6933     if (BB == Latch)
6934       continue;
6935     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6936     if (Branch && Branch->isConditional())
6937       NeedDef.insert(Branch->getCondition());
6938   }
6939 
6940   // If the tail is to be folded by masking, the primary induction variable
6941   // needs to be represented in VPlan for it to model early-exit masking.
6942   if (CM.foldTailByMasking())
6943     NeedDef.insert(Legal->getPrimaryInduction());
6944 
6945   // Collect instructions from the original loop that will become trivially dead
6946   // in the vectorized loop. We don't need to vectorize these instructions. For
6947   // example, original induction update instructions can become dead because we
6948   // separately emit induction "steps" when generating code for the new loop.
6949   // Similarly, we create a new latch condition when setting up the structure
6950   // of the new loop, so the old one can become dead.
6951   SmallPtrSet<Instruction *, 4> DeadInstructions;
6952   collectTriviallyDeadInstructions(DeadInstructions);
6953 
6954   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6955     VFRange SubRange = {VF, MaxVF + 1};
6956     VPlans.push_back(
6957         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6958     VF = SubRange.End;
6959   }
6960 }
6961 
6962 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
6963     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
6964     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6965   // Hold a mapping from predicated instructions to their recipes, in order to
6966   // fix their AlsoPack behavior if a user is determined to replicate and use a
6967   // scalar instead of vector value.
6968   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
6969 
6970   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
6971   DenseMap<Instruction *, Instruction *> SinkAfterInverse;
6972 
6973   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
6974   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
6975   auto Plan = std::make_unique<VPlan>(VPBB);
6976 
6977   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
6978   // Represent values that will have defs inside VPlan.
6979   for (Value *V : NeedDef)
6980     Plan->addVPValue(V);
6981 
6982   // Scan the body of the loop in a topological order to visit each basic block
6983   // after having visited its predecessor basic blocks.
6984   LoopBlocksDFS DFS(OrigLoop);
6985   DFS.perform(LI);
6986 
6987   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6988     // Relevant instructions from basic block BB will be grouped into VPRecipe
6989     // ingredients and fill a new VPBasicBlock.
6990     unsigned VPBBsForBB = 0;
6991     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
6992     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
6993     VPBB = FirstVPBBForBB;
6994     Builder.setInsertPoint(VPBB);
6995 
6996     std::vector<Instruction *> Ingredients;
6997 
6998     // Organize the ingredients to vectorize from current basic block in the
6999     // right order.
7000     for (Instruction &I : BB->instructionsWithoutDebug()) {
7001       Instruction *Instr = &I;
7002 
7003       // First filter out irrelevant instructions, to ensure no recipes are
7004       // built for them.
7005       if (isa<BranchInst>(Instr) ||
7006           DeadInstructions.find(Instr) != DeadInstructions.end())
7007         continue;
7008 
      // If Instr is an adjunct member of an InterleaveGroup for Range.Start
      // (i.e., not the group's insert position), do not construct any Recipe
      // for it.
7011       const InterleaveGroup<Instruction> *IG =
7012           CM.getInterleavedAccessGroup(Instr);
7013       if (IG && Instr != IG->getInsertPos() &&
7014           Range.Start >= 2 && // Query is illegal for VF == 1
7015           CM.getWideningDecision(Instr, Range.Start) ==
7016               LoopVectorizationCostModel::CM_Interleave) {
7017         auto SinkCandidate = SinkAfterInverse.find(Instr);
7018         if (SinkCandidate != SinkAfterInverse.end())
7019           Ingredients.push_back(SinkCandidate->second);
7020         continue;
7021       }
7022 
7023       // Move instructions to handle first-order recurrences, step 1: avoid
7024       // handling this instruction until after we've handled the instruction it
7025       // should follow.
7026       auto SAIt = SinkAfter.find(Instr);
7027       if (SAIt != SinkAfter.end()) {
7028         LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
7029                           << *SAIt->second
7030                           << " to vectorize a 1st order recurrence.\n");
7031         SinkAfterInverse[SAIt->second] = Instr;
7032         continue;
7033       }
7034 
7035       Ingredients.push_back(Instr);
7036 
7037       // Move instructions to handle first-order recurrences, step 2: push the
7038       // instruction to be sunk at its insertion point.
7039       auto SAInvIt = SinkAfterInverse.find(Instr);
7040       if (SAInvIt != SinkAfterInverse.end())
7041         Ingredients.push_back(SAInvIt->second);
7042     }
7043 
7044     // Introduce each ingredient into VPlan.
7045     for (Instruction *Instr : Ingredients) {
7046       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7047         continue;
7048 
7049       // Otherwise, if all widening options failed, Instruction is to be
7050       // replicated. This may create a successor for VPBB.
7051       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7052           Instr, Range, VPBB, PredInst2Recipe, Plan);
7053       if (NextVPBB != VPBB) {
7054         VPBB = NextVPBB;
7055         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7056                                     : "");
7057       }
7058     }
7059   }
7060 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7064   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7065   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7066   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7067   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7068   delete PreEntry;
7069 
7070   std::string PlanName;
7071   raw_string_ostream RSO(PlanName);
7072   unsigned VF = Range.Start;
7073   Plan->addVF(VF);
7074   RSO << "Initial VPlan for VF={" << VF;
7075   for (VF *= 2; VF < Range.End; VF *= 2) {
7076     Plan->addVF(VF);
7077     RSO << "," << VF;
7078   }
7079   RSO << "},UF>=1";
7080   RSO.flush();
7081   Plan->setName(PlanName);
7082 
7083   return Plan;
7084 }
7085 
7086 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
7091   assert(!OrigLoop->empty());
7092   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7093 
  // Create a new, empty VPlan.
7095   auto Plan = std::make_unique<VPlan>();
7096 
7097   // Build hierarchical CFG
7098   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7099   HCFGBuilder.buildHierarchicalCFG();
7100 
7101   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7102     Plan->addVF(VF);
7103 
7104   if (EnableVPlanPredication) {
7105     VPlanPredicator VPP(*Plan);
7106     VPP.predicate();
7107 
7108     // Avoid running transformation to recipes until masked code generation in
7109     // VPlan-native path is in place.
7110     return Plan;
7111   }
7112 
7113   SmallPtrSet<Instruction *, 1> DeadInstructions;
7114   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7115       Plan, Legal->getInductionVars(), DeadInstructions);
7116 
7117   return Plan;
7118 }
7119 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7124 
7125 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7126   O << " +\n"
7127     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7128   IG->getInsertPos()->printAsOperand(O, false);
7129   if (User) {
7130     O << ", ";
7131     User->getOperand(0)->printAsOperand(O);
7132   }
7133   O << "\\l\"";
7134   for (unsigned i = 0; i < IG->getFactor(); ++i)
7135     if (Instruction *I = IG->getMember(i))
7136       O << " +\n"
7137         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7138 }
7139 
7140 void VPWidenRecipe::execute(VPTransformState &State) {
7141   for (auto &Instr : make_range(Begin, End))
7142     State.ILV->widenInstruction(Instr);
7143 }
7144 
7145 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7146   assert(!State.Instance && "Int or FP induction being replicated.");
7147   State.ILV->widenIntOrFpInduction(IV, Trunc);
7148 }
7149 
7150 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7151   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7152 }
7153 
7154 void VPBlendRecipe::execute(VPTransformState &State) {
7155   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7156   // We know that all PHIs in non-header blocks are converted into
7157   // selects, so we don't have to worry about the insertion order and we
7158   // can just use the builder.
7159   // At this point we generate the predication tree. There may be
7160   // duplications since this is a simple recursive scan, but future
7161   // optimizations will clean it up.
7162 
7163   unsigned NumIncoming = Phi->getNumIncomingValues();
7164 
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7167   // Generate a sequence of selects of the form:
7168   // SELECT(Mask3, In3,
7169   //      SELECT(Mask2, In2,
7170   //                   ( ...)))
7171   InnerLoopVectorizer::VectorParts Entry(State.UF);
7172   for (unsigned In = 0; In < NumIncoming; ++In) {
7173     for (unsigned Part = 0; Part < State.UF; ++Part) {
7174       // We might have single edge PHIs (blocks) - use an identity
7175       // 'select' for the first PHI operand.
7176       Value *In0 =
7177           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7178       if (In == 0)
7179         Entry[Part] = In0; // Initialize with the first incoming value.
7180       else {
7181         // Select between the current value and the previous incoming edge
7182         // based on the incoming mask.
7183         Value *Cond = State.get(User->getOperand(In), Part);
7184         Entry[Part] =
7185             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7186       }
7187     }
7188   }
7189   for (unsigned Part = 0; Part < State.UF; ++Part)
7190     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7191 }
7192 
7193 void VPInterleaveRecipe::execute(VPTransformState &State) {
7194   assert(!State.Instance && "Interleave group being replicated.");
7195   if (!User)
7196     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7197 
7198   // Last (and currently only) operand is a mask.
7199   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7200   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7201   for (unsigned Part = 0; Part < State.UF; ++Part)
7202     MaskValues[Part] = State.get(Mask, Part);
7203   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7204 }
7205 
7206 void VPReplicateRecipe::execute(VPTransformState &State) {
7207   if (State.Instance) { // Generate a single instance.
7208     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7209     // Insert scalar instance packing it into a vector.
7210     if (AlsoPack && State.VF > 1) {
7211       // If we're constructing lane 0, initialize to start from undef.
7212       if (State.Instance->Lane == 0) {
7213         Value *Undef =
7214             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7215         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7216       }
7217       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7218     }
7219     return;
7220   }
7221 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
7225   unsigned EndLane = IsUniform ? 1 : State.VF;
7226   for (unsigned Part = 0; Part < State.UF; ++Part)
7227     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7228       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7229 }
7230 
7231 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7232   assert(State.Instance && "Branch on Mask works only on single instance.");
7233 
7234   unsigned Part = State.Instance->Part;
7235   unsigned Lane = State.Instance->Lane;
7236 
7237   Value *ConditionBit = nullptr;
7238   if (!User) // Block in mask is all-one.
7239     ConditionBit = State.Builder.getTrue();
7240   else {
7241     VPValue *BlockInMask = User->getOperand(0);
7242     ConditionBit = State.get(BlockInMask, Part);
7243     if (ConditionBit->getType()->isVectorTy())
7244       ConditionBit = State.Builder.CreateExtractElement(
7245           ConditionBit, State.Builder.getInt32(Lane));
7246   }
7247 
7248   // Replace the temporary unreachable terminator with a new conditional branch,
7249   // whose two destinations will be set later when they are created.
7250   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7251   assert(isa<UnreachableInst>(CurrentTerminator) &&
7252          "Expected to replace unreachable terminator with conditional branch.");
7253   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7254   CondBr->setSuccessor(0, nullptr);
7255   ReplaceInstWithInst(CurrentTerminator, CondBr);
7256 }
7257 
7258 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7259   assert(State.Instance && "Predicated instruction PHI works per instance.");
7260   Instruction *ScalarPredInst = cast<Instruction>(
7261       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7262   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7263   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7264   assert(PredicatingBB && "Predicated block has no single predecessor.");
7265 
7266   // By current pack/unpack logic we need to generate only a single phi node: if
7267   // a vector value for the predicated instruction exists at this point it means
7268   // the instruction has vector users only, and a phi for the vector value is
7269   // needed. In this case the recipe of the predicated instruction is marked to
7270   // also do that packing, thereby "hoisting" the insert-element sequence.
7271   // Otherwise, a phi node for the scalar value is needed.
7272   unsigned Part = State.Instance->Part;
7273   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7274     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7275     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7276     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7277     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7278     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7279     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7280   } else {
7281     Type *PredInstType = PredInst->getType();
7282     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7283     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7284     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7285     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7286   }
7287 }
7288 
7289 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7290   if (!User)
7291     return State.ILV->vectorizeMemoryInstruction(&Instr);
7292 
7293   // Last (and currently only) operand is a mask.
7294   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7295   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7296   for (unsigned Part = 0; Part < State.UF; ++Part)
7297     MaskValues[Part] = State.get(Mask, Part);
7298   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7299 }
7300 
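// Decide how to lower the scalar epilogue: it is disallowed when optimizing
// for size (unless vectorization is explicitly forced), and considered
// unneeded when predication (folding the tail by masking) is preferred.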
7301 static ScalarEpilogueLowering
7302 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7303                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
7304   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7305   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7306       (F->hasOptSize() ||
7307        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
7308     SEL = CM_ScalarEpilogueNotAllowedOptSize;
7309   else if (PreferPredicateOverEpilog || Hints.getPredicate())
7310     SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7311 
7312   return SEL;
7313 }
7314 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
// transformations to be applied from the very beginning without modifying the
// input LLVM IR.
7319 static bool processLoopInVPlanNativePath(
7320     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7321     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7322     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7323     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7324     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7325 
7326   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7327   Function *F = L->getHeader()->getParent();
7328   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7329   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7330 
7331   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7332                                 &Hints, IAI);
7333   // Use the planner for outer loop vectorization.
7334   // TODO: CM is not used at this point inside the planner. Turn CM into an
7335   // optional argument if we don't need it in the future.
7336   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7337 
7338   // Get user vectorization factor.
7339   const unsigned UserVF = Hints.getWidth();
7340 
7341   // Plan how to best vectorize, return the best VF and its cost.
7342   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7343 
7344   // If we are stress testing VPlan builds, do not attempt to generate vector
7345   // code. Masked vector code generation support will follow soon.
7346   // Also, do not attempt to vectorize if no vector code will be produced.
7347   if (VPlanBuildStressTest || EnableVPlanPredication ||
7348       VectorizationFactor::Disabled() == VF)
7349     return false;
7350 
7351   LVP.setBestPlan(VF.Width, 1);
7352 
7353   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7354                          &CM);
7355   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7356                     << L->getHeader()->getParent()->getName() << "\"\n");
7357   LVP.executePlan(LB, DT);
7358 
7359   // Mark the loop as already vectorized to avoid vectorizing again.
7360   Hints.setAlreadyVectorized();
7361 
7362   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7363   return true;
7364 }
7365 
7366 bool LoopVectorizePass::processLoop(Loop *L) {
7367   assert((EnableVPlanNativePath || L->empty()) &&
7368          "VPlan-native path is not enabled. Only process inner loops.");
7369 
7370 #ifndef NDEBUG
7371   const std::string DebugLocStr = getDebugLocString(L);
7372 #endif /* NDEBUG */
7373 
7374   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7375                     << L->getHeader()->getParent()->getName() << "\" from "
7376                     << DebugLocStr << "\n");
7377 
7378   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7379 
7380   LLVM_DEBUG(
7381       dbgs() << "LV: Loop hints:"
7382              << " force="
7383              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7384                      ? "disabled"
7385                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7386                             ? "enabled"
7387                             : "?"))
7388              << " width=" << Hints.getWidth()
7389              << " unroll=" << Hints.getInterleave() << "\n");
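
  // These hints typically originate from loop pragmas such as
  //   #pragma clang loop vectorize_width(4) interleave_count(2)
  // which the frontend lowers to "llvm.loop.vectorize.width" and
  // "llvm.loop.interleave.count" loop metadata.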
7390 
7391   // Function containing loop
7392   Function *F = L->getHeader()->getParent();
7393 
  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it is
  // important to generate an optimization remark for each loop. Most of these
  // messages are generated as OptimizationRemarkAnalysis. Remarks generated as
  // OptimizationRemark and OptimizationRemarkMissed are less verbose; they
  // report vectorized loops and unvectorized loops that may benefit from
  // vectorization, respectively.
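  // These remarks can be surfaced with clang's -Rpass=loop-vectorize,
  // -Rpass-missed=loop-vectorize and -Rpass-analysis=loop-vectorize options,
  // or with the corresponding -pass-remarks* options of opt.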
7401 
7402   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7403     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7404     return false;
7405   }
7406 
7407   PredicatedScalarEvolution PSE(*SE, *L);
7408 
7409   // Check if it is legal to vectorize the loop.
7410   LoopVectorizationRequirements Requirements(*ORE);
7411   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7412                                 &Requirements, &Hints, DB, AC);
7413   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7414     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7415     Hints.emitRemarkWithHints();
7416     return false;
7417   }
7418 
7419   // Check the function attributes and profiles to find out if this function
7420   // should be optimized for size.
7421   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7422 
7423   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7424   // here. They may require CFG and instruction level transformations before
7425   // even evaluating whether vectorization is profitable. Since we cannot modify
7426   // the incoming IR, we need to build VPlan upfront in the vectorization
7427   // pipeline.
7428   if (!L->empty())
7429     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7430                                         ORE, BFI, PSI, Hints);
7431 
7432   assert(L->empty() && "Inner loop expected.");
  // Check the loop for a trip count threshold: vectorize loops with a tiny
  // trip count by optimizing for size, to minimize overheads.
  // Prefer a constant trip count, then profile data, then the upper-bound
  // estimate computed by SCEV.
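  // For example, for "for (i = 0; i < 4; ++i)" the backedge-taken count is
  // the constant 3, so ExpectedTC below becomes 4; if that is under the
  // TinyTripCountVectorThreshold, the scalar epilogue is disallowed unless
  // vectorization was explicitly forced.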
7436   unsigned ExpectedTC = 0;
7437   bool HasExpectedTC = false;
7438   if (const SCEVConstant *ConstExits =
7439       dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7440     const APInt &ExitsCount = ConstExits->getAPInt();
    // We are only interested in small values for ExpectedTC; skip counts that
    // do not fit in an unsigned.
7443     if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7444       ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7445       HasExpectedTC = true;
7446     }
7447   }
  // If the exact trip count is unknown (e.g. it is bounded by a variable),
  // fall back to profiling information to estimate it.
7450   if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
7451     auto EstimatedTC = getLoopEstimatedTripCount(L);
7452     if (EstimatedTC) {
7453       ExpectedTC = *EstimatedTC;
7454       HasExpectedTC = true;
7455     }
7456   }
7457   if (!HasExpectedTC) {
7458     ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7459     HasExpectedTC = (ExpectedTC > 0);
7460   }
7461 
7462   if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7463     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7464                       << "This loop is worth vectorizing only if no scalar "
7465                       << "iteration overheads are incurred.");
7466     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7467       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7468     else {
7469       LLVM_DEBUG(dbgs() << "\n");
7470       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7471     }
7472   }
7473 
7474   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check does not seem correct -- what if the loop is an integer
  // loop and the vector instructions selected are purely integer vector
  // instructions?
7478   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7479     reportVectorizationFailure(
7480         "Can't vectorize when the NoImplicitFloat attribute is used",
7481         "loop not vectorized due to NoImplicitFloat attribute",
7482         "NoImplicitFloat", ORE, L);
7483     Hints.emitRemarkWithHints();
7484     return false;
7485   }
7486 
7487   // Check if the target supports potentially unsafe FP vectorization.
7488   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7489   // for the target we're vectorizing for, to make sure none of the
7490   // additional fp-math flags can help.
7491   if (Hints.isPotentiallyUnsafe() &&
7492       TTI->isFPVectorizationPotentiallyUnsafe()) {
7493     reportVectorizationFailure(
7494         "Potentially unsafe FP op prevents vectorization",
7495         "loop not vectorized due to unsafe FP support.",
7496         "UnsafeFP", ORE, L);
7497     Hints.emitRemarkWithHints();
7498     return false;
7499   }
7500 
7501   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7502   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7503 
7504   // If an override option has been passed in for interleaved accesses, use it.
7505   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7506     UseInterleaved = EnableInterleavedMemAccesses;
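  // (The override is exposed as the -enable-interleaved-mem-accesses command
  // line option.)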
7507 
7508   // Analyze interleaved memory accesses.
7509   if (UseInterleaved) {
7510     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7511   }
7512 
7513   // Use the cost model.
7514   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7515                                 F, &Hints, IAI);
7516   CM.collectValuesToIgnore();
7517 
7518   // Use the planner for vectorization.
7519   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7520 
7521   // Get user vectorization factor.
7522   unsigned UserVF = Hints.getWidth();
7523 
7524   // Plan how to best vectorize, return the best VF and its cost.
7525   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7526 
7527   VectorizationFactor VF = VectorizationFactor::Disabled();
7528   unsigned IC = 1;
7529   unsigned UserIC = Hints.getInterleave();
7530 
7531   if (MaybeVF) {
7532     VF = *MaybeVF;
7533     // Select the interleave count.
7534     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7535   }
7536 
7537   // Identify the diagnostic messages that should be produced.
7538   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7539   bool VectorizeLoop = true, InterleaveLoop = true;
7540   if (Requirements.doesNotMeet(F, L, Hints)) {
7541     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7542                          "requirements.\n");
7543     Hints.emitRemarkWithHints();
7544     return false;
7545   }
7546 
7547   if (VF.Width == 1) {
7548     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7549     VecDiagMsg = std::make_pair(
7550         "VectorizationNotBeneficial",
7551         "the cost-model indicates that vectorization is not beneficial");
7552     VectorizeLoop = false;
7553   }
7554 
7555   if (!MaybeVF && UserIC > 1) {
7556     // Tell the user interleaving was avoided up-front, despite being explicitly
7557     // requested.
7558     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7559                          "interleaving should be avoided up front\n");
7560     IntDiagMsg = std::make_pair(
7561         "InterleavingAvoided",
7562         "Ignoring UserIC, because interleaving was avoided up front");
7563     InterleaveLoop = false;
7564   } else if (IC == 1 && UserIC <= 1) {
7565     // Tell the user interleaving is not beneficial.
7566     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7567     IntDiagMsg = std::make_pair(
7568         "InterleavingNotBeneficial",
7569         "the cost-model indicates that interleaving is not beneficial");
7570     InterleaveLoop = false;
7571     if (UserIC == 1) {
7572       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7573       IntDiagMsg.second +=
7574           " and is explicitly disabled or interleave count is set to 1";
7575     }
7576   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
                         "disabled.\n");
7580     IntDiagMsg = std::make_pair(
7581         "InterleavingBeneficialButDisabled",
7582         "the cost-model indicates that interleaving is beneficial "
7583         "but is explicitly disabled or interleave count is set to 1");
7584     InterleaveLoop = false;
7585   }
7586 
7587   // Override IC if user provided an interleave count.
7588   IC = UserIC > 0 ? UserIC : IC;
7589 
7590   // Emit diagnostic messages, if any.
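  // If both vectorization and interleaving are rejected, two missed remarks
  // are emitted and the loop is left untouched. If exactly one of them is
  // rejected, an analysis remark explains why. If both are kept, the decision
  // is only reported in the debug output.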
7591   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7592   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7594     ORE->emit([&]() {
7595       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7596                                       L->getStartLoc(), L->getHeader())
7597              << VecDiagMsg.second;
7598     });
7599     ORE->emit([&]() {
7600       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7601                                       L->getStartLoc(), L->getHeader())
7602              << IntDiagMsg.second;
7603     });
7604     return false;
7605   } else if (!VectorizeLoop && InterleaveLoop) {
7606     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7607     ORE->emit([&]() {
7608       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7609                                         L->getStartLoc(), L->getHeader())
7610              << VecDiagMsg.second;
7611     });
7612   } else if (VectorizeLoop && !InterleaveLoop) {
7613     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7614                       << ") in " << DebugLocStr << '\n');
7615     ORE->emit([&]() {
7616       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7617                                         L->getStartLoc(), L->getHeader())
7618              << IntDiagMsg.second;
7619     });
7620   } else if (VectorizeLoop && InterleaveLoop) {
7621     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7622                       << ") in " << DebugLocStr << '\n');
7623     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7624   }
7625 
7626   LVP.setBestPlan(VF.Width, IC);
7627 
7628   using namespace ore;
7629   bool DisableRuntimeUnroll = false;
7630   MDNode *OrigLoopID = L->getLoopID();
7631 
7632   if (!VectorizeLoop) {
7633     assert(IC > 1 && "interleave count should not be 1 or 0");
7634     // If we decided that it is not legal to vectorize the loop, then
7635     // interleave it.
7636     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7637                                &CM);
7638     LVP.executePlan(Unroller, DT);
7639 
7640     ORE->emit([&]() {
7641       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7642                                 L->getHeader())
7643              << "interleaved loop (interleaved count: "
7644              << NV("InterleaveCount", IC) << ")";
7645     });
7646   } else {
7647     // If we decided that it is *legal* to vectorize the loop, then do it.
7648     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7649                            &LVL, &CM);
7650     LVP.executePlan(LB, DT);
7651     ++LoopsVectorized;
7652 
    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks on strides and memory. A scalar loop that is
    // rarely executed is not worth unrolling.
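    // (AddRuntimeUnrollDisableMetaData attaches the
    // "llvm.loop.unroll.runtime.disable" loop metadata below.)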
7656     if (!LB.areSafetyChecksAdded())
7657       DisableRuntimeUnroll = true;
7658 
7659     // Report the vectorization decision.
7660     ORE->emit([&]() {
7661       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7662                                 L->getHeader())
7663              << "vectorized loop (vectorization width: "
7664              << NV("VectorizationFactor", VF.Width)
7665              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7666     });
7667   }
7668 
7669   Optional<MDNode *> RemainderLoopID =
7670       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7671                                       LLVMLoopVectorizeFollowupEpilogue});
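  // For example, attributes listed in a
  //   !{!"llvm.loop.vectorize.followup_epilogue", ...}
  // entry of the original loop ID are transferred to the remainder (epilogue)
  // loop here, overriding the default handling below.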
7672   if (RemainderLoopID.hasValue()) {
7673     L->setLoopID(RemainderLoopID.getValue());
7674   } else {
7675     if (DisableRuntimeUnroll)
7676       AddRuntimeUnrollDisableMetaData(L);
7677 
7678     // Mark the loop as already vectorized to avoid vectorizing again.
7679     Hints.setAlreadyVectorized();
7680   }
7681 
7682   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7683   return true;
7684 }
7685 
7686 bool LoopVectorizePass::runImpl(
7687     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7688     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7689     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7690     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7691     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7692   SE = &SE_;
7693   LI = &LI_;
7694   TTI = &TTI_;
7695   DT = &DT_;
7696   BFI = &BFI_;
7697   TLI = TLI_;
7698   AA = &AA_;
7699   AC = &AC_;
7700   GetLAA = &GetLAA_;
7701   DB = &DB_;
7702   ORE = &ORE_;
7703   PSI = PSI_;
7704 
7705   // Don't attempt if
7706   // 1. the target claims to have no vector registers, and
7707   // 2. interleaving won't help ILP.
7708   //
7709   // The second condition is necessary because, even if the target has no
7710   // vector registers, loop vectorization may still enable scalar
7711   // interleaving.
7712   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7713     return false;
7714 
7715   bool Changed = false;
7716 
7717   // The vectorizer requires loops to be in simplified form.
7718   // Since simplification may add new inner loops, it has to run before the
7719   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7721   // vectorized.
7722   for (auto &L : *LI)
7723     Changed |=
7724         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7725 
7726   // Build up a worklist of inner-loops to vectorize. This is necessary as
7727   // the act of vectorizing or partially unrolling a loop creates new loops
7728   // and can invalidate iterators across the loops.
7729   SmallVector<Loop *, 8> Worklist;
7730 
7731   for (Loop *L : *LI)
7732     collectSupportedLoops(*L, LI, ORE, Worklist);
7733 
7734   LoopsAnalyzed += Worklist.size();
7735 
7736   // Now walk the identified inner loops.
7737   while (!Worklist.empty()) {
7738     Loop *L = Worklist.pop_back_val();
7739 
7740     // For the inner loops we actually process, form LCSSA to simplify the
7741     // transform.
7742     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7743 
7744     Changed |= processLoop(L);
7745   }
7746 
7747   // Process each loop nest in the function.
7748   return Changed;
7749 }
7750 
7751 PreservedAnalyses LoopVectorizePass::run(Function &F,
7752                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve LoopInfo/DominatorTree analyses with outer
  // loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
7794 }
7795