1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

/// When picking the vectorization factor, consider the smallest element type
/// in the loop (not just the widest), which can select a wider VF.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

/// Enable vectorization of interleaved memory accesses (off by default;
/// targets may still opt in via TTI).
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// We don't interleave loops with a known constant trip count below this
/// number.
static const unsigned TinyTripCountInterleaveThreshold = 128;

/// Testing override for the target's reported number of scalar registers.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

/// Testing override for the target's reported number of vector registers.
static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Testing override for the target's max interleave factor on scalar loops.
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

/// Testing override for the target's max interleave factor on vector loops.
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

/// Testing override that pins every instruction's cost to a single constant.
static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

/// Loops whose estimated cost is below this are considered "small" and are
/// candidates for more aggressive interleaving.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

/// Use block-frequency/PGO data to tune vectorization decisions per region.
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

/// Register-pressure heuristic: count the induction variable only once
/// across all interleaved copies of the loop body.
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

/// Allow stores under control flow to be vectorized via if-predication.
static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

/// Cap on the interleave count used for a scalar reduction inside a nested
/// loop.
static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

// Non-static: also referenced from other translation units (declared extern
// elsewhere) to gate the VPlan-native code path.
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

// Defined in the llvm namespace so other passes can query these master
// switches for interleaving and vectorization.
cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));
287 
288 /// A helper function for converting Scalar types to vector types.
289 /// If the incoming type is void, we return void. If the VF is 1, we return
290 /// the scalar type.
291 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
292   if (Scalar->isVoidTy() || VF == 1)
293     return Scalar;
294   return VectorType::get(Scalar, VF);
295 }
296 
297 /// A helper function that returns the type of loaded or stored value.
298 static Type *getMemInstValueType(Value *I) {
299   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
300          "Expected Load or Store instruction");
301   if (auto *LI = dyn_cast<LoadInst>(I))
302     return LI->getType();
303   return cast<StoreInst>(I)->getValueOperand()->getType();
304 }
305 
306 /// A helper function that returns true if the given type is irregular. The
307 /// type is irregular if its allocated size doesn't equal the store size of an
308 /// element of the corresponding vector type at the given vectorization factor.
309 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
310   // Determine if an array of VF elements of type Ty is "bitcast compatible"
311   // with a <VF x Ty> vector.
312   if (VF > 1) {
313     auto *VectorTy = VectorType::get(Ty, VF);
314     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
315   }
316 
317   // If the vectorization factor is one, we just check if an array of type Ty
318   // requires padding between elements.
319   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
320 }
321 
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() {
  // 50% execution probability => reciprocal of 2.
  constexpr unsigned AssumedReciprocalProb = 2;
  return AssumedReciprocalProb;
}
329 
330 /// A helper function that adds a 'fast' flag to floating-point operations.
331 static Value *addFastMathFlag(Value *V) {
332   if (isa<FPMathOperator>(V))
333     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
334   return V;
335 }
336 
337 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
338   if (isa<FPMathOperator>(V))
339     cast<Instruction>(V)->setFastMathFlags(FMF);
340   return V;
341 }
342 
343 /// A helper function that returns an integer or floating-point constant with
344 /// value C.
345 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
346   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
347                            : ConstantFP::get(Ty, C);
348 }
349 
350 namespace llvm {
351 
352 /// InnerLoopVectorizer vectorizes loops which contain only one basic
353 /// block to a specified vectorization factor (VF).
354 /// This class performs the widening of scalars into vectors, or multiple
355 /// scalars. This class also implements the following features:
356 /// * It inserts an epilogue loop for handling loops that don't have iteration
357 ///   counts that are known to be a multiple of the vectorization factor.
358 /// * It handles the code generation for reduction variables.
359 /// * Scalarization (implementation using scalars) of un-vectorizable
360 ///   instructions.
361 /// InnerLoopVectorizer does not perform any vectorization-legality
362 /// checks, and relies on the caller to check for the different legality
363 /// aspects. The InnerLoopVectorizer relies on the
364 /// LoopVectorizationLegality class to provide information about the induction
365 /// and reduction variables that were found to a given vectorization factor.
366 class InnerLoopVectorizer {
367 public:
368   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
369                       LoopInfo *LI, DominatorTree *DT,
370                       const TargetLibraryInfo *TLI,
371                       const TargetTransformInfo *TTI, AssumptionCache *AC,
372                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
373                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
374                       LoopVectorizationCostModel *CM)
375       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
376         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
377         Builder(PSE.getSE()->getContext()),
378         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
379   virtual ~InnerLoopVectorizer() = default;
380 
381   /// Create a new empty loop. Unlink the old loop and connect the new one.
382   /// Return the pre-header block of the new loop.
383   BasicBlock *createVectorizedLoopSkeleton();
384 
385   /// Widen a single instruction within the innermost loop.
386   void widenInstruction(Instruction &I);
387 
388   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
389   void fixVectorizedLoop();
390 
391   // Return true if any runtime check is added.
392   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
393 
394   /// A type for vectorized values in the new loop. Each value from the
395   /// original loop, when vectorized, is represented by UF vector values in the
396   /// new unrolled loop, where UF is the unroll factor.
397   using VectorParts = SmallVector<Value *, 2>;
398 
399   /// Vectorize a single PHINode in a block. This method handles the induction
400   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
401   /// arbitrary length vectors.
402   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
403 
404   /// A helper function to scalarize a single Instruction in the innermost loop.
405   /// Generates a sequence of scalar instances for each lane between \p MinLane
406   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
407   /// inclusive..
408   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
409                             bool IfPredicateInstr);
410 
411   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
412   /// is provided, the integer induction variable will first be truncated to
413   /// the corresponding type.
414   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
415 
416   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
417   /// vector or scalar value on-demand if one is not yet available. When
418   /// vectorizing a loop, we visit the definition of an instruction before its
419   /// uses. When visiting the definition, we either vectorize or scalarize the
420   /// instruction, creating an entry for it in the corresponding map. (In some
421   /// cases, such as induction variables, we will create both vector and scalar
422   /// entries.) Then, as we encounter uses of the definition, we derive values
423   /// for each scalar or vector use unless such a value is already available.
424   /// For example, if we scalarize a definition and one of its uses is vector,
425   /// we build the required vector on-demand with an insertelement sequence
426   /// when visiting the use. Otherwise, if the use is scalar, we can use the
427   /// existing scalar definition.
428   ///
429   /// Return a value in the new loop corresponding to \p V from the original
430   /// loop at unroll index \p Part. If the value has already been vectorized,
431   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
432   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
433   /// a new vector value on-demand by inserting the scalar values into a vector
434   /// with an insertelement sequence. If the value has been neither vectorized
435   /// nor scalarized, it must be loop invariant, so we simply broadcast the
436   /// value into a vector.
437   Value *getOrCreateVectorValue(Value *V, unsigned Part);
438 
439   /// Return a value in the new loop corresponding to \p V from the original
440   /// loop at unroll and vector indices \p Instance. If the value has been
441   /// vectorized but not scalarized, the necessary extractelement instruction
442   /// will be generated.
443   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
444 
445   /// Construct the vector value of a scalarized value \p V one lane at a time.
446   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
447 
448   /// Try to vectorize the interleaved access group that \p Instr belongs to,
449   /// optionally masking the vector operations if \p BlockInMask is non-null.
450   void vectorizeInterleaveGroup(Instruction *Instr,
451                                 VectorParts *BlockInMask = nullptr);
452 
453   /// Vectorize Load and Store instructions, optionally masking the vector
454   /// operations if \p BlockInMask is non-null.
455   void vectorizeMemoryInstruction(Instruction *Instr,
456                                   VectorParts *BlockInMask = nullptr);
457 
458   /// Set the debug location in the builder using the debug location in
459   /// the instruction.
460   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
461 
462   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
463   void fixNonInductionPHIs(void);
464 
465 protected:
466   friend class LoopVectorizationPlanner;
467 
468   /// A small list of PHINodes.
469   using PhiVector = SmallVector<PHINode *, 4>;
470 
471   /// A type for scalarized values in the new loop. Each value from the
472   /// original loop, when scalarized, is represented by UF x VF scalar values
473   /// in the new unrolled loop, where UF is the unroll factor and VF is the
474   /// vectorization factor.
475   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
476 
477   /// Set up the values of the IVs correctly when exiting the vector loop.
478   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
479                     Value *CountRoundDown, Value *EndValue,
480                     BasicBlock *MiddleBlock);
481 
482   /// Create a new induction variable inside L.
483   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
484                                    Value *Step, Instruction *DL);
485 
486   /// Handle all cross-iteration phis in the header.
487   void fixCrossIterationPHIs();
488 
489   /// Fix a first-order recurrence. This is the second phase of vectorizing
490   /// this phi node.
491   void fixFirstOrderRecurrence(PHINode *Phi);
492 
493   /// Fix a reduction cross-iteration phi. This is the second phase of
494   /// vectorizing this phi node.
495   void fixReduction(PHINode *Phi);
496 
497   /// The Loop exit block may have single value PHI nodes with some
498   /// incoming value. While vectorizing we only handled real values
499   /// that were defined inside the loop and we should have one value for
500   /// each predecessor of its parent basic block. See PR14725.
501   void fixLCSSAPHIs();
502 
503   /// Iteratively sink the scalarized operands of a predicated instruction into
504   /// the block that was created for it.
505   void sinkScalarOperands(Instruction *PredInst);
506 
507   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
508   /// represented as.
509   void truncateToMinimalBitwidths();
510 
511   /// Insert the new loop to the loop hierarchy and pass manager
512   /// and update the analysis passes.
513   void updateAnalysis();
514 
515   /// Create a broadcast instruction. This method generates a broadcast
516   /// instruction (shuffle) for loop invariant values and for the induction
517   /// value. If this is the induction variable then we extend it to N, N+1, ...
518   /// this is needed because each iteration in the loop corresponds to a SIMD
519   /// element.
520   virtual Value *getBroadcastInstrs(Value *V);
521 
522   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
523   /// to each vector element of Val. The sequence starts at StartIndex.
524   /// \p Opcode is relevant for FP induction variable.
525   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
526                                Instruction::BinaryOps Opcode =
527                                Instruction::BinaryOpsEnd);
528 
529   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
530   /// variable on which to base the steps, \p Step is the size of the step, and
531   /// \p EntryVal is the value from the original loop that maps to the steps.
532   /// Note that \p EntryVal doesn't have to be an induction variable - it
533   /// can also be a truncate instruction.
534   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
535                         const InductionDescriptor &ID);
536 
537   /// Create a vector induction phi node based on an existing scalar one. \p
538   /// EntryVal is the value from the original loop that maps to the vector phi
539   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
540   /// truncate instruction, instead of widening the original IV, we widen a
541   /// version of the IV truncated to \p EntryVal's type.
542   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
543                                        Value *Step, Instruction *EntryVal);
544 
545   /// Returns true if an instruction \p I should be scalarized instead of
546   /// vectorized for the chosen vectorization factor.
547   bool shouldScalarizeInstruction(Instruction *I) const;
548 
549   /// Returns true if we should generate a scalar version of \p IV.
550   bool needsScalarInduction(Instruction *IV) const;
551 
552   /// If there is a cast involved in the induction variable \p ID, which should
553   /// be ignored in the vectorized loop body, this function records the
554   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
555   /// cast. We had already proved that the casted Phi is equal to the uncasted
556   /// Phi in the vectorized loop (under a runtime guard), and therefore
557   /// there is no need to vectorize the cast - the same value can be used in the
558   /// vector loop for both the Phi and the cast.
559   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
560   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
561   ///
562   /// \p EntryVal is the value from the original loop that maps to the vector
563   /// phi node and is used to distinguish what is the IV currently being
564   /// processed - original one (if \p EntryVal is a phi corresponding to the
565   /// original IV) or the "newly-created" one based on the proof mentioned above
566   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
567   /// latter case \p EntryVal is a TruncInst and we must not record anything for
568   /// that IV, but it's error-prone to expect callers of this routine to care
569   /// about that, hence this explicit parameter.
570   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
571                                              const Instruction *EntryVal,
572                                              Value *VectorLoopValue,
573                                              unsigned Part,
574                                              unsigned Lane = UINT_MAX);
575 
576   /// Generate a shuffle sequence that will reverse the vector Vec.
577   virtual Value *reverseVector(Value *Vec);
578 
579   /// Returns (and creates if needed) the original loop trip count.
580   Value *getOrCreateTripCount(Loop *NewLoop);
581 
582   /// Returns (and creates if needed) the trip count of the widened loop.
583   Value *getOrCreateVectorTripCount(Loop *NewLoop);
584 
585   /// Returns a bitcasted value to the requested vector type.
586   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
587   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
588                                 const DataLayout &DL);
589 
590   /// Emit a bypass check to see if the vector trip count is zero, including if
591   /// it overflows.
592   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
593 
594   /// Emit a bypass check to see if all of the SCEV assumptions we've
595   /// had to make are correct.
596   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
597 
598   /// Emit bypass checks to check any memory assumptions we may have made.
599   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
600 
601   /// Compute the transformed value of Index at offset StartValue using step
602   /// StepValue.
603   /// For integer induction, returns StartValue + Index * StepValue.
604   /// For pointer induction, returns StartValue[Index * StepValue].
605   /// FIXME: The newly created binary instructions should contain nsw/nuw
606   /// flags, which can be found from the original scalar operations.
607   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
608                               const DataLayout &DL,
609                               const InductionDescriptor &ID) const;
610 
611   /// Add additional metadata to \p To that was not present on \p Orig.
612   ///
613   /// Currently this is used to add the noalias annotations based on the
614   /// inserted memchecks.  Use this for instructions that are *cloned* into the
615   /// vector loop.
616   void addNewMetadata(Instruction *To, const Instruction *Orig);
617 
618   /// Add metadata from one instruction to another.
619   ///
620   /// This includes both the original MDs from \p From and additional ones (\see
621   /// addNewMetadata).  Use this for *newly created* instructions in the vector
622   /// loop.
623   void addMetadata(Instruction *To, Instruction *From);
624 
625   /// Similar to the previous function but it adds the metadata to a
626   /// vector of instructions.
627   void addMetadata(ArrayRef<Value *> To, Instruction *From);
628 
629   /// The original loop.
630   Loop *OrigLoop;
631 
632   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
633   /// dynamic knowledge to simplify SCEV expressions and converts them to a
634   /// more usable form.
635   PredicatedScalarEvolution &PSE;
636 
637   /// Loop Info.
638   LoopInfo *LI;
639 
640   /// Dominator Tree.
641   DominatorTree *DT;
642 
643   /// Alias Analysis.
644   AliasAnalysis *AA;
645 
646   /// Target Library Info.
647   const TargetLibraryInfo *TLI;
648 
649   /// Target Transform Info.
650   const TargetTransformInfo *TTI;
651 
652   /// Assumption Cache.
653   AssumptionCache *AC;
654 
655   /// Interface to emit optimization remarks.
656   OptimizationRemarkEmitter *ORE;
657 
658   /// LoopVersioning.  It's only set up (non-null) if memchecks were
659   /// used.
660   ///
661   /// This is currently only used to add no-alias metadata based on the
662   /// memchecks.  The actually versioning is performed manually.
663   std::unique_ptr<LoopVersioning> LVer;
664 
665   /// The vectorization SIMD factor to use. Each vector will have this many
666   /// vector elements.
667   unsigned VF;
668 
669   /// The vectorization unroll factor to use. Each scalar is vectorized to this
670   /// many different vector instructions.
671   unsigned UF;
672 
673   /// The builder that we use
674   IRBuilder<> Builder;
675 
676   // --- Vectorization state ---
677 
678   /// The vector-loop preheader.
679   BasicBlock *LoopVectorPreHeader;
680 
681   /// The scalar-loop preheader.
682   BasicBlock *LoopScalarPreHeader;
683 
684   /// Middle Block between the vector and the scalar.
685   BasicBlock *LoopMiddleBlock;
686 
687   /// The ExitBlock of the scalar loop.
688   BasicBlock *LoopExitBlock;
689 
690   /// The vector loop body.
691   BasicBlock *LoopVectorBody;
692 
693   /// The scalar loop body.
694   BasicBlock *LoopScalarBody;
695 
696   /// A list of all bypass blocks. The first block is the entry of the loop.
697   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
698 
699   /// The new Induction variable which was added to the new block.
700   PHINode *Induction = nullptr;
701 
702   /// The induction variable of the old basic block.
703   PHINode *OldInduction = nullptr;
704 
705   /// Maps values from the original loop to their corresponding values in the
706   /// vectorized loop. A key value can map to either vector values, scalar
707   /// values or both kinds of values, depending on whether the key was
708   /// vectorized and scalarized.
709   VectorizerValueMap VectorLoopValueMap;
710 
711   /// Store instructions that were predicated.
712   SmallVector<Instruction *, 4> PredicatedInstructions;
713 
714   /// Trip count of the original loop.
715   Value *TripCount = nullptr;
716 
717   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
718   Value *VectorTripCount = nullptr;
719 
720   /// The legality analysis.
721   LoopVectorizationLegality *Legal;
722 
  /// The profitability analysis.
724   LoopVectorizationCostModel *Cost;
725 
726   // Record whether runtime checks are added.
727   bool AddedSafetyChecks = false;
728 
729   // Holds the end values for each induction variable. We save the end values
730   // so we can later fix-up the external users of the induction variables.
731   DenseMap<PHINode *, Value *> IVEndValues;
732 
733   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
734   // fixed up at the end of vector code generation.
735   SmallVector<PHINode *, 8> OrigPHIsToFix;
736 };
737 
/// InnerLoopUnroller - A specialization of InnerLoopVectorizer that performs
/// unrolling only: it forwards to the base class with a vectorization factor
/// of 1, so only \p UnrollFactor interleaved scalar copies of the loop body
/// are produced and no vector instructions are created.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      // The base-class VF argument is pinned to 1; only UnrollFactor varies.
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  // Scalar (VF == 1) overrides of the vector-producing hooks.
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};
757 
758 } // end namespace llvm
759 
760 /// Look for a meaningful debug location on the instruction or it's
761 /// operands.
762 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
763   if (!I)
764     return I;
765 
766   DebugLoc Empty;
767   if (I->getDebugLoc() != Empty)
768     return I;
769 
770   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
771     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
772       if (OpInst->getDebugLoc() != Empty)
773         return OpInst;
774   }
775 
776   return I;
777 }
778 
779 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
780   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
781     const DILocation *DIL = Inst->getDebugLoc();
782     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
783         !isa<DbgInfoIntrinsic>(Inst)) {
784       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
785       if (NewDIL)
786         B.SetCurrentDebugLocation(NewDIL.getValue());
787       else
788         LLVM_DEBUG(dbgs()
789                    << "Failed to create new discriminator: "
790                    << DIL->getFilename() << " Line: " << DIL->getLine());
791     }
792     else
793       B.SetCurrentDebugLocation(DIL);
794   } else
795     B.SetCurrentDebugLocation(DebugLoc());
796 }
797 
798 /// Write a record \p DebugMsg about vectorization failure to the debug
799 /// output stream. If \p I is passed, it is an instruction that prevents
800 /// vectorization.
801 #ifndef NDEBUG
802 static void debugVectorizationFailure(const StringRef DebugMsg,
803     Instruction *I) {
804   dbgs() << "LV: Not vectorizing: " << DebugMsg;
805   if (I != nullptr)
806     dbgs() << " " << *I;
807   else
808     dbgs() << '.';
809   dbgs() << '\n';
810 }
811 #endif
812 
813 /// Create an analysis remark that explains why vectorization failed
814 ///
815 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
816 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
817 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
818 /// the location of the remark.  \return the remark object that can be
819 /// streamed to.
820 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
821     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
822   Value *CodeRegion = TheLoop->getHeader();
823   DebugLoc DL = TheLoop->getStartLoc();
824 
825   if (I) {
826     CodeRegion = I->getParent();
827     // If there is no debug location attached to the instruction, revert back to
828     // using the loop's.
829     if (I->getDebugLoc())
830       DL = I->getDebugLoc();
831   }
832 
833   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
834   R << "loop not vectorized: ";
835   return R;
836 }
837 
838 namespace llvm {
839 
840 void reportVectorizationFailure(const StringRef DebugMsg,
841     const StringRef OREMsg, const StringRef ORETag,
842     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
843   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
844   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
845   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
846                 ORETag, TheLoop, I) << OREMsg);
847 }
848 
849 } // end namespace llvm
850 
851 #ifndef NDEBUG
852 /// \return string containing a file name and a line # for the given loop.
853 static std::string getDebugLocString(const Loop *L) {
854   std::string Result;
855   if (L) {
856     raw_string_ostream OS(Result);
857     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
858       LoopDbgLoc.print(OS);
859     else
860       // Just print the module name.
861       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
862     OS.flush();
863   }
864   return Result;
865 }
866 #endif
867 
868 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
869                                          const Instruction *Orig) {
870   // If the loop was versioned with memchecks, add the corresponding no-alias
871   // metadata.
872   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
873     LVer->annotateInstWithNoAlias(To, Orig);
874 }
875 
void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  // First copy the metadata that is safe to propagate from \p From, then add
  // the no-alias annotations derived from the runtime memchecks (if any).
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}
881 
882 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
883                                       Instruction *From) {
884   for (Value *V : To) {
885     if (Instruction *I = dyn_cast<Instruction>(V))
886       addMetadata(I, From);
887   }
888 }
889 
890 namespace llvm {
891 
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededPredicatePragma
};
911 
912 /// LoopVectorizationCostModel - estimates the expected speedups due to
913 /// vectorization.
914 /// In many cases vectorization is not profitable. This can happen because of
915 /// a number of reasons. In this class we mainly attempt to predict the
916 /// expected speedup/slowdowns due to the supported instruction set. We use the
917 /// TargetTransformInfo to query the different backends for the cost of
918 /// different operations.
919 class LoopVectorizationCostModel {
920 public:
  /// Construct a cost model for loop \p L.
  ///
  /// \p SEL dictates whether a scalar epilogue loop is allowed (see
  /// ScalarEpilogueLowering). All analysis objects are borrowed (stored as
  /// pointers/references), not owned.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}
933 
934   /// \return An upper bound for the vectorization factor, or None if
935   /// vectorization and interleaving should be avoided up front.
936   Optional<unsigned> computeMaxVF();
937 
938   /// \return True if runtime checks are required for vectorization, and false
939   /// otherwise.
940   bool runtimeChecksRequired();
941 
942   /// \return The most profitable vectorization factor and the cost of that VF.
943   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
944   /// then this vectorization factor will be selected if vectorization is
945   /// possible.
946   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
947 
  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(unsigned UserVF) {
    // Run the per-VF analyses for the user-requested VF; the uniform/scalar
    // sets are collected before the scalarization analysis.
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }
953 
954   /// \return The size (in bits) of the smallest and widest types in the code
955   /// that needs to be vectorized. We ignore values that remain scalar such as
956   /// 64 bit loop indices.
957   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
958 
959   /// \return The desired interleave count.
960   /// If interleave count has been specified by metadata it will be returned.
961   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
962   /// are the selected vectorization factor and the cost of the selected VF.
963   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
964 
965   /// Memory access instruction may be vectorized in more than one way.
966   /// Form of instruction after vectorization depends on cost.
967   /// This function takes cost-based decisions for Load/Store instructions
968   /// and collects them in a map. This decisions map is used for building
969   /// the lists of loop-uniform and loop-scalar instructions.
970   /// The calculated cost is saved with widening decision in order to
971   /// avoid redundant calculations.
972   void setCostBasedWideningDecision(unsigned VF);
973 
  /// A struct that represents some properties of the register usage
  /// of a loop. Returned (one entry per VF) by calculateRegisterUsage.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;

    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
  };
983 
984   /// \return Returns information about the register usages of the loop for the
985   /// given vectorization factors.
986   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
987 
988   /// Collect values we want to ignore in the cost model.
989   void collectValuesToIgnore();
990 
  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    // Read-only view of the MinBWs map (see its declaration below).
    return MinBWs;
  }
997 
998   /// \returns True if it is more profitable to scalarize instruction \p I for
999   /// vectorization factor \p VF.
1000   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1001     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1002 
1003     // Cost model is not run in the VPlan-native path - return conservative
1004     // result until this changes.
1005     if (EnableVPlanNativePath)
1006       return false;
1007 
1008     auto Scalars = InstsToScalarize.find(VF);
1009     assert(Scalars != InstsToScalarize.end() &&
1010            "VF not yet analyzed for scalarization profitability");
1011     return Scalars->second.find(I) != Scalars->second.end();
1012   }
1013 
1014   /// Returns true if \p I is known to be uniform after vectorization.
1015   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1016     if (VF == 1)
1017       return true;
1018 
1019     // Cost model is not run in the VPlan-native path - return conservative
1020     // result until this changes.
1021     if (EnableVPlanNativePath)
1022       return false;
1023 
1024     auto UniformsPerVF = Uniforms.find(VF);
1025     assert(UniformsPerVF != Uniforms.end() &&
1026            "VF not yet analyzed for uniformity");
1027     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1028   }
1029 
1030   /// Returns true if \p I is known to be scalar after vectorization.
1031   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1032     if (VF == 1)
1033       return true;
1034 
1035     // Cost model is not run in the VPlan-native path - return conservative
1036     // result until this changes.
1037     if (EnableVPlanNativePath)
1038       return false;
1039 
1040     auto ScalarsPerVF = Scalars.find(VF);
1041     assert(ScalarsPerVF != Scalars.end() &&
1042            "Scalar values are not calculated for VF");
1043     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1044   }
1045 
1046   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1047   /// for vectorization factor \p VF.
1048   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1049     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1050            !isProfitableToScalarize(I, VF) &&
1051            !isScalarAfterVectorization(I, VF);
1052   }
1053 
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,       // No decision has been recorded yet.
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Access is part of an interleave group.
    CM_GatherScatter, // Lower as a gather or scatter.
    CM_Scalarize      // Replicate the access as scalar operations.
  };
1063 
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
                           unsigned Cost) {
    assert(VF >= 2 && "Expected VF >=2");
    // Decisions are keyed by the (instruction, VF) pair; only vector VFs are
    // ever recorded.
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }
1071 
1072   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1073   /// interleaving group \p Grp and vector width \p VF.
1074   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1075                            InstWidening W, unsigned Cost) {
1076     assert(VF >= 2 && "Expected VF >=2");
1077     /// Broadcast this decicion to all instructions inside the group.
1078     /// But the cost will be assigned to one instruction only.
1079     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1080       if (auto *I = Grp->getMember(i)) {
1081         if (Grp->getInsertPos() == I)
1082           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1083         else
1084           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1085       }
1086     }
1087   }
1088 
1089   /// Return the cost model decision for the given instruction \p I and vector
1090   /// width \p VF. Return CM_Unknown if this instruction did not pass
1091   /// through the cost modeling.
1092   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1093     assert(VF >= 2 && "Expected VF >=2");
1094 
1095     // Cost model is not run in the VPlan-native path - return conservative
1096     // result until this changes.
1097     if (EnableVPlanNativePath)
1098       return CM_GatherScatter;
1099 
1100     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1101     auto Itr = WideningDecisions.find(InstOnVF);
1102     if (Itr == WideningDecisions.end())
1103       return CM_Unknown;
1104     return Itr->second.first;
1105   }
1106 
1107   /// Return the vectorization cost for the given instruction \p I and vector
1108   /// width \p VF.
1109   unsigned getWideningCost(Instruction *I, unsigned VF) {
1110     assert(VF >= 2 && "Expected VF >=2");
1111     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1112     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1113            "The cost is not calculated");
1114     return WideningDecisions[InstOnVF].second;
1115   }
1116 
1117   /// Return True if instruction \p I is an optimizable truncate whose operand
1118   /// is an induction variable. Such a truncate will be removed by adding a new
1119   /// induction variable with the destination type.
1120   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1121     // If the instruction is not a truncate, return false.
1122     auto *Trunc = dyn_cast<TruncInst>(I);
1123     if (!Trunc)
1124       return false;
1125 
1126     // Get the source and destination types of the truncate.
1127     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1128     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1129 
1130     // If the truncate is free for the given types, return false. Replacing a
1131     // free truncate with an induction variable would add an induction variable
1132     // update instruction to each iteration of the loop. We exclude from this
1133     // check the primary induction variable since it will need an update
1134     // instruction regardless.
1135     Value *Op = Trunc->getOperand(0);
1136     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1137       return false;
1138 
1139     // If the truncated value is not an induction variable, return false.
1140     return Legal->isInductionPhi(Op);
1141   }
1142 
1143   /// Collects the instructions to scalarize for each predicated instruction in
1144   /// the loop.
1145   void collectInstsToScalarize(unsigned VF);
1146 
  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  void collectUniformsAndScalars(unsigned VF) {
    // Do the analysis once per VF; a scalar (VF == 1) loop needs none.
    if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
      return;
    // Memory widening decisions feed the uniform/scalar classification, so
    // they must be computed first.
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    collectLoopScalars(VF);
  }
1158 
  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
    // Masked stores are only formed for consecutive accesses.
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
    // Masked loads are only formed for consecutive accesses.
    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType.
  bool isLegalMaskedScatter(Type *DataType) {
    return TTI.isLegalMaskedScatter(DataType);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType.
  bool isLegalMaskedGather(Type *DataType) {
    return TTI.isLegalMaskedGather(DataType);
  }
1182 
1183   /// Returns true if the target machine can represent \p V as a masked gather
1184   /// or scatter operation.
1185   bool isLegalGatherOrScatter(Value *V) {
1186     bool LI = isa<LoadInst>(V);
1187     bool SI = isa<StoreInst>(V);
1188     if (!LI && !SI)
1189       return false;
1190     auto *Ty = getMemInstValueType(V);
1191     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1192   }
1193 
1194   /// Returns true if \p I is an instruction that will be scalarized with
1195   /// predication. Such instructions include conditional stores and
1196   /// instructions that may divide by zero.
1197   /// If a non-zero VF has been calculated, we check if I will be scalarized
1198   /// predication for that VF.
1199   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1200 
1201   // Returns true if \p I is an instruction that will be predicated either
1202   // through scalar predication or masked load/store or masked gather/scatter.
1203   // Superset of instructions that return true for isScalarWithPredication.
1204   bool isPredicatedInst(Instruction *I) {
1205     if (!blockNeedsPredication(I->getParent()))
1206       return false;
1207     // Loads and stores that need some form of masked operation are predicated
1208     // instructions.
1209     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1210       return Legal->isMaskRequired(I);
1211     return isScalarWithPredication(I);
1212   }
1213 
1214   /// Returns true if \p I is a memory instruction with consecutive memory
1215   /// access that can be widened.
1216   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1217 
1218   /// Returns true if \p I is a memory instruction in an interleaved-group
1219   /// of memory accesses that can be vectorized with wide vector loads/stores
1220   /// and shuffles.
1221   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1222 
  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  /// Returned pointer comes from InterleaveInfo; may be null if \p Instr is
  /// not in any group.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
1233 
  /// Returns true if an interleaved group requires a scalar iteration
  /// to handle accesses with gaps, and there is nothing preventing us from
  /// creating a scalar epilogue.
  bool requiresScalarEpilogue() const {
    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    // Any status other than CM_ScalarEpilogueAllowed forbids the epilogue.
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if block \p BB will execute under a predicate, either
  /// because the tail is folded by masking or because legality requires it.
  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }
1253 
1254   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1255   /// with factor VF.  Return the cost of the instruction, including
1256   /// scalarization overhead if it's needed.
1257   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1258 
1259   /// Estimate cost of a call instruction CI if it were vectorized with factor
1260   /// VF. Return the cost of the instruction, including scalarization overhead
1261   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1262   /// scalarized -
1263   /// i.e. either vector version isn't available, or is too expensive.
1264   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1265 
1266 private:
1267   unsigned NumPredStores = 0;
1268 
1269   /// \return An upper bound for the vectorization factor, larger than zero.
1270   /// One is returned if vectorization should best be avoided due to cost.
1271   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1272 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1280   using VectorizationCostTy = std::pair<unsigned, bool>;
1281 
1282   /// Returns the expected execution cost. The unit of the cost does
1283   /// not matter because we use the 'cost' units to compare different
1284   /// vector widths. The cost that is returned is *not* normalized by
1285   /// the factor width.
1286   VectorizationCostTy expectedCost(unsigned VF);
1287 
1288   /// Returns the execution time cost of an instruction for a given vector
1289   /// width. Vector width of one means scalar.
1290   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1291 
1292   /// The cost-computation logic from getInstructionCost which provides
1293   /// the vector type as an output parameter.
1294   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1295 
1296   /// Calculate vectorization cost of memory instruction \p I.
1297   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1298 
1299   /// The cost computation for scalarized memory instruction.
1300   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1301 
1302   /// The cost computation for interleaving group of memory instructions.
1303   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1304 
1305   /// The cost computation for Gather/Scatter instruction.
1306   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1307 
1308   /// The cost computation for widening instruction \p I with consecutive
1309   /// memory access.
1310   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1311 
1312   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1313   /// Load: scalar load + broadcast.
1314   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1315   /// element)
1316   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1317 
1318   /// Estimate the overhead of scalarizing an instruction. This is a
1319   /// convenience wrapper for the type-based getScalarizationOverhead API.
1320   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1321 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1324   bool isConsecutiveLoadOrStore(Instruction *I);
1325 
1326   /// Returns true if an artificially high cost for emulated masked memrefs
1327   /// should be used.
1328   bool useEmulatedMaskMemRefHack(Instruction *I);
1329 
1330   /// Map of scalar integer values to the smallest bitwidth they can be legally
1331   /// represented as. The vector equivalents of these values should be truncated
1332   /// to this type.
1333   MapVector<Instruction *, uint64_t> MinBWs;
1334 
1335   /// A type representing the costs for instructions if they were to be
1336   /// scalarized rather than vectorized. The entries are Instruction-Cost
1337   /// pairs.
1338   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1339 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1342   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1343 
1344   /// Records whether it is allowed to have the original scalar loop execute at
1345   /// least once. This may be needed as a fallback loop in case runtime
1346   /// aliasing/dependence checks fail, or to handle the tail/remainder
1347   /// iterations when the trip count is unknown or doesn't divide by the VF,
1348   /// or as a peel-loop to handle gaps in interleave-groups.
1349   /// Under optsize and when the trip count is very small we don't allow any
1350   /// iterations to execute in the scalar loop.
1351   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1352 
1353   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1354   bool FoldTailByMasking = false;
1355 
1356   /// A map holding scalar costs for different vectorization factors. The
1357   /// presence of a cost for an instruction in the mapping indicates that the
1358   /// instruction will be scalarized when vectorizing with the associated
1359   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1360   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1361 
1362   /// Holds the instructions known to be uniform after vectorization.
1363   /// The data is collected per VF.
1364   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1365 
1366   /// Holds the instructions known to be scalar after vectorization.
1367   /// The data is collected per VF.
1368   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1369 
1370   /// Holds the instructions (address computations) that are forced to be
1371   /// scalarized.
1372   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1373 
1374   /// Returns the expected difference in cost from scalarizing the expression
1375   /// feeding a predicated instruction \p PredInst. The instructions to
1376   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1377   /// non-negative return value implies the expression will be scalarized.
1378   /// Currently, only single-use chains are considered for scalarization.
1379   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1380                               unsigned VF);
1381 
1382   /// Collect the instructions that are uniform after vectorization. An
1383   /// instruction is uniform if we represent it with a single scalar value in
1384   /// the vectorized loop corresponding to each vector iteration. Examples of
1385   /// uniform instructions include pointer operands of consecutive or
1386   /// interleaved memory accesses. Note that although uniformity implies an
1387   /// instruction will be scalar, the reverse is not true. In general, a
1388   /// scalarized instruction will be represented by VF scalar values in the
1389   /// vectorized loop, each corresponding to an iteration of the original
1390   /// scalar loop.
1391   void collectLoopUniforms(unsigned VF);
1392 
1393   /// Collect the instructions that are scalar after vectorization. An
1394   /// instruction is scalar if it is known to be uniform or will be scalarized
1395   /// during vectorization. Non-uniform scalarized instructions will be
1396   /// represented by VF values in the vectorized loop, each corresponding to an
1397   /// iteration of the original scalar loop.
1398   void collectLoopScalars(unsigned VF);
1399 
1400   /// Keeps cost model vectorization decision and cost for instructions.
1401   /// Right now it is used for memory instructions only.
1402   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1403                                 std::pair<InstWidening, unsigned>>;
1404 
1405   DecisionList WideningDecisions;
1406 
1407   /// Returns true if \p V is expected to be vectorized and it needs to be
1408   /// extracted.
1409   bool needsExtract(Value *V, unsigned VF) const {
1410     Instruction *I = dyn_cast<Instruction>(V);
1411     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1412       return false;
1413 
1414     // Assume we can vectorize V (and hence we need extraction) if the
1415     // scalars are not computed yet. This can happen, because it is called
1416     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1417     // the scalars are collected. That should be a safe assumption in most
1418     // cases, because we check if the operands have vectorizable types
1419     // beforehand in LoopVectorizationLegality.
1420     return Scalars.find(VF) == Scalars.end() ||
1421            !isScalarAfterVectorization(I, VF);
1422   };
1423 
1424   /// Returns a range containing only operands needing to be extracted.
1425   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1426                                                    unsigned VF) {
1427     return SmallVector<Value *, 4>(make_filter_range(
1428         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1429   }
1430 
1431 public:
1432   /// The loop that we evaluate.
1433   Loop *TheLoop;
1434 
1435   /// Predicated scalar evolution analysis.
1436   PredicatedScalarEvolution &PSE;
1437 
1438   /// Loop Info analysis.
1439   LoopInfo *LI;
1440 
1441   /// Vectorization legality.
1442   LoopVectorizationLegality *Legal;
1443 
1444   /// Vector target information.
1445   const TargetTransformInfo &TTI;
1446 
1447   /// Target Library Info.
1448   const TargetLibraryInfo *TLI;
1449 
1450   /// Demanded bits analysis.
1451   DemandedBits *DB;
1452 
1453   /// Assumption cache.
1454   AssumptionCache *AC;
1455 
1456   /// Interface to emit optimization remarks.
1457   OptimizationRemarkEmitter *ORE;
1458 
1459   const Function *TheFunction;
1460 
1461   /// Loop Vectorize Hint.
1462   const LoopVectorizeHints *Hints;
1463 
1464   /// The interleave access information contains groups of interleaved accesses
1465   /// with the same stride and close to each other.
1466   InterleavedAccessInfo &InterleaveInfo;
1467 
1468   /// Values to ignore in the cost model.
1469   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1470 
1471   /// Values to ignore in the cost model when VF > 1.
1472   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1473 };
1474 
1475 } // end namespace llvm
1476 
1477 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1478 // vectorization. The loop needs to be annotated with #pragma omp simd
1479 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1480 // vector length information is not provided, vectorization is not considered
1481 // explicit. Interleave hints are not allowed either. These limitations will be
1482 // relaxed in the future.
1483 // Please, note that we are currently forced to abuse the pragma 'clang
1484 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1485 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1486 // provides *explicit vectorization hints* (LV can bypass legal checks and
1487 // assume that vectorization is legal). However, both hints are implemented
1488 // using the same metadata (llvm.loop.vectorize, processed by
1489 // LoopVectorizeHints). This will be fixed in the future when the native IR
1490 // representation for pragma 'omp simd' is introduced.
1491 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1492                                    OptimizationRemarkEmitter *ORE) {
1493   assert(!OuterLp->empty() && "This is not an outer loop");
1494   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1495 
1496   // Only outer loops with an explicit vectorization hint are supported.
1497   // Unannotated outer loops are ignored.
1498   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1499     return false;
1500 
1501   Function *Fn = OuterLp->getHeader()->getParent();
1502   if (!Hints.allowVectorization(Fn, OuterLp,
1503                                 true /*VectorizeOnlyWhenForced*/)) {
1504     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1505     return false;
1506   }
1507 
1508   if (Hints.getInterleave() > 1) {
1509     // TODO: Interleave support is future work.
1510     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1511                          "outer loops.\n");
1512     Hints.emitRemarkWithHints();
1513     return false;
1514   }
1515 
1516   return true;
1517 }
1518 
1519 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1520                                   OptimizationRemarkEmitter *ORE,
1521                                   SmallVectorImpl<Loop *> &V) {
1522   // Collect inner loops and outer loops without irreducible control flow. For
1523   // now, only collect outer loops that have explicit vectorization hints. If we
1524   // are stress testing the VPlan H-CFG construction, we collect the outermost
1525   // loop of every loop nest.
1526   if (L.empty() || VPlanBuildStressTest ||
1527       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1528     LoopBlocksRPO RPOT(&L);
1529     RPOT.perform(LI);
1530     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1531       V.push_back(&L);
1532       // TODO: Collect inner loops inside marked outer loops in case
1533       // vectorization fails for the outer loop. Do not invoke
1534       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1535       // already known to be reducible. We can use an inherited attribute for
1536       // that.
1537       return;
1538     }
1539   }
1540   for (Loop *InnerL : L)
1541     collectSupportedLoops(*InnerL, LI, ORE, V);
1542 }
1543 
1544 namespace {
1545 
1546 /// The LoopVectorize Pass.
1547 struct LoopVectorize : public FunctionPass {
1548   /// Pass identification, replacement for typeid
1549   static char ID;
1550 
1551   LoopVectorizePass Impl;
1552 
1553   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1554                          bool VectorizeOnlyWhenForced = false)
1555       : FunctionPass(ID) {
1556     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1557     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1558     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1559   }
1560 
1561   bool runOnFunction(Function &F) override {
1562     if (skipFunction(F))
1563       return false;
1564 
1565     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1566     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1567     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1568     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1569     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1570     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1571     auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1572     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1573     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1574     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1575     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1576     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1577     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1578 
1579     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1580         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1581 
1582     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1583                         GetLAA, *ORE, PSI);
1584   }
1585 
1586   void getAnalysisUsage(AnalysisUsage &AU) const override {
1587     AU.addRequired<AssumptionCacheTracker>();
1588     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1589     AU.addRequired<DominatorTreeWrapperPass>();
1590     AU.addRequired<LoopInfoWrapperPass>();
1591     AU.addRequired<ScalarEvolutionWrapperPass>();
1592     AU.addRequired<TargetTransformInfoWrapperPass>();
1593     AU.addRequired<AAResultsWrapperPass>();
1594     AU.addRequired<LoopAccessLegacyAnalysis>();
1595     AU.addRequired<DemandedBitsWrapperPass>();
1596     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1597 
1598     // We currently do not preserve loopinfo/dominator analyses with outer loop
1599     // vectorization. Until this is addressed, mark these analyses as preserved
1600     // only for non-VPlan-native path.
1601     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1602     if (!EnableVPlanNativePath) {
1603       AU.addPreserved<LoopInfoWrapperPass>();
1604       AU.addPreserved<DominatorTreeWrapperPass>();
1605     }
1606 
1607     AU.addPreserved<BasicAAWrapperPass>();
1608     AU.addPreserved<GlobalsAAWrapperPass>();
1609     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1610   }
1611 };
1612 
1613 } // end anonymous namespace
1614 
1615 //===----------------------------------------------------------------------===//
1616 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1617 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1618 //===----------------------------------------------------------------------===//
1619 
1620 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1621   // We need to place the broadcast of invariant variables outside the loop,
1622   // but only if it's proven safe to do so. Else, broadcast will be inside
1623   // vector loop body.
1624   Instruction *Instr = dyn_cast<Instruction>(V);
1625   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1626                      (!Instr ||
1627                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1628   // Place the code for broadcasting invariant variables in the new preheader.
1629   IRBuilder<>::InsertPointGuard Guard(Builder);
1630   if (SafeToHoist)
1631     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1632 
1633   // Broadcast the scalar into all locations in the vector.
1634   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1635 
1636   return Shuf;
1637 }
1638 
/// Create a vector induction phi for the induction described by \p II, with
/// start value II.getStartValue() and per-scalar-iteration step \p Step.
/// \p EntryVal is the original-loop value being widened: either the induction
/// phi itself or a truncate of it (in which case the vector IV is built in
/// the truncated type). One widened value per unroll part is recorded in
/// VectorLoopValueMap.
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");
  Value *Start = II.getStartValue();

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    // Narrow both the start value and the step to the truncate's destination
    // type before building the vector IV.
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  // SteppedStart = <Start, Start + Step, ..., Start + (VF-1)*Step>.
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    // Each subsequent part advances the previous one by VF * Step.
    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  // Wire up the phi: initial value from the preheader, updated value from
  // the latch.
  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
1715 
1716 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1717   return Cost->isScalarAfterVectorization(I, VF) ||
1718          Cost->isProfitableToScalarize(I, VF);
1719 }
1720 
1721 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1722   if (shouldScalarizeInstruction(IV))
1723     return true;
1724   auto isScalarInst = [&](User *U) -> bool {
1725     auto *I = cast<Instruction>(U);
1726     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1727   };
1728   return llvm::any_of(IV->users(), isScalarInst);
1729 }
1730 
1731 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1732     const InductionDescriptor &ID, const Instruction *EntryVal,
1733     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1734   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1735          "Expected either an induction phi-node or a truncate of it!");
1736 
1737   // This induction variable is not the phi from the original loop but the
1738   // newly-created IV based on the proof that casted Phi is equal to the
1739   // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1740   // re-uses the same InductionDescriptor that original IV uses but we don't
1741   // have to do any recording in this case - that is done when original IV is
1742   // processed.
1743   if (isa<TruncInst>(EntryVal))
1744     return;
1745 
1746   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1747   if (Casts.empty())
1748     return;
1749   // Only the first Cast instruction in the Casts vector is of interest.
1750   // The rest of the Casts (if exist) have no uses outside the
1751   // induction update chain itself.
1752   Instruction *CastInst = *Casts.begin();
1753   if (Lane < UINT_MAX)
1754     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1755   else
1756     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1757 }
1758 
/// Widen the integer or floating-point induction variable \p IV. If \p Trunc
/// is non-null, widen the truncated version of the induction instead, in the
/// truncate's narrower type. Depending on the cost model's decisions, this
/// creates a dedicated vector phi, splats a scalar IV into step vectors,
/// and/or builds per-lane scalar steps for scalarized users.
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars()->find(IV);
  assert(II != Legal->getInductionVars()->end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The scalar value to broadcast. This will be derived from the canonical
  // induction variable.
  Value *ScalarIV = nullptr;

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // True if we have vectorized the induction variable.
  auto VectorizedIV = false;

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
         "Induction step should be loop invariant");
  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
  Value *Step = nullptr;
  if (PSE.getSE()->isSCEVable(IV->getType())) {
    // Materialize the step in the preheader via SCEV expansion.
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
                             LoopVectorPreHeader->getTerminator());
  } else {
    // Non-SCEVable (e.g. FP) inductions carry the step as a plain value.
    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    VectorizedIV = true;
  }

  // If we haven't yet vectorized the induction variable, or if we will create
  // a scalar one, we need to define the scalar induction variable and step
  // values. If we were given a truncation type, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  if (!VectorizedIV || NeedsScalarIV) {
    ScalarIV = Induction;
    if (IV != OldInduction) {
      // Convert the canonical induction to this IV's type, then transform it
      // into the IV's own start/step space.
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
  }

  // If we haven't yet vectorized the induction variable, splat the scalar
  // induction variable, and build the necessary step vectors.
  // TODO: Don't do it unless the vectorized IV is really required.
  if (!VectorizedIV) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      // Each part's vector is the broadcast advanced by VF * Part steps.
      Value *EntryPart =
          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
    }
  }

  // If an induction variable is only used for counting loop iterations or
  // calculating addresses, it doesn't need to be widened. Create scalar steps
  // that can be used by instructions we will later scalarize. Note that the
  // addition of the scalar steps will not increase the number of instructions
  // in the loop in the common case prior to InstCombine. We will be trading
  // one vector extract for each scalar step.
  if (NeedsScalarIV)
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
1855 
1856 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1857                                           Instruction::BinaryOps BinOp) {
1858   // Create and check the types.
1859   assert(Val->getType()->isVectorTy() && "Must be a vector");
1860   int VLen = Val->getType()->getVectorNumElements();
1861 
1862   Type *STy = Val->getType()->getScalarType();
1863   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1864          "Induction Step must be an integer or FP");
1865   assert(Step->getType() == STy && "Step has wrong type");
1866 
1867   SmallVector<Constant *, 8> Indices;
1868 
1869   if (STy->isIntegerTy()) {
1870     // Create a vector of consecutive numbers from zero to VF.
1871     for (int i = 0; i < VLen; ++i)
1872       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1873 
1874     // Add the consecutive indices to the vector value.
1875     Constant *Cv = ConstantVector::get(Indices);
1876     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1877     Step = Builder.CreateVectorSplat(VLen, Step);
1878     assert(Step->getType() == Val->getType() && "Invalid step vec");
1879     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1880     // which can be found from the original scalar operations.
1881     Step = Builder.CreateMul(Cv, Step);
1882     return Builder.CreateAdd(Val, Step, "induction");
1883   }
1884 
1885   // Floating point induction.
1886   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1887          "Binary Opcode should be specified for FP induction");
1888   // Create a vector of consecutive numbers from zero to VF.
1889   for (int i = 0; i < VLen; ++i)
1890     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1891 
1892   // Add the consecutive indices to the vector value.
1893   Constant *Cv = ConstantVector::get(Indices);
1894 
1895   Step = Builder.CreateVectorSplat(VLen, Step);
1896 
1897   // Floating point operations had to be 'fast' to enable the induction.
1898   FastMathFlags Flags;
1899   Flags.setFast();
1900 
1901   Value *MulOp = Builder.CreateFMul(Cv, Step);
1902   if (isa<Instruction>(MulOp))
1903     // Have to check, MulOp may be a constant
1904     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1905 
1906   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1907   if (isa<Instruction>(BOp))
1908     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1909   return BOp;
1910 }
1911 
1912 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1913                                            Instruction *EntryVal,
1914                                            const InductionDescriptor &ID) {
1915   // We shouldn't have to build scalar steps if we aren't vectorizing.
1916   assert(VF > 1 && "VF should be greater than one");
1917 
1918   // Get the value type and ensure it and the step have the same integer type.
1919   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1920   assert(ScalarIVTy == Step->getType() &&
1921          "Val and Step should have the same type");
1922 
1923   // We build scalar steps for both integer and floating-point induction
1924   // variables. Here, we determine the kind of arithmetic we will perform.
1925   Instruction::BinaryOps AddOp;
1926   Instruction::BinaryOps MulOp;
1927   if (ScalarIVTy->isIntegerTy()) {
1928     AddOp = Instruction::Add;
1929     MulOp = Instruction::Mul;
1930   } else {
1931     AddOp = ID.getInductionOpcode();
1932     MulOp = Instruction::FMul;
1933   }
1934 
1935   // Determine the number of scalars we need to generate for each unroll
1936   // iteration. If EntryVal is uniform, we only need to generate the first
1937   // lane. Otherwise, we generate all VF values.
1938   unsigned Lanes =
1939       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1940                                                                          : VF;
1941   // Compute the scalar steps and save the results in VectorLoopValueMap.
1942   for (unsigned Part = 0; Part < UF; ++Part) {
1943     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1944       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1945       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1946       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1947       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1948       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1949     }
1950   }
1951 }
1952 
/// Return the vector value corresponding to \p V for unroll part \p Part,
/// creating it on demand: reuse a cached vector value, pack previously
/// generated scalar values into a vector, or broadcast a scalar assumed to be
/// loop-invariant.
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here. Defer this for
  // the VPlan-native path until we start running Legal checks in that path.
  if (!EnableVPlanNativePath && Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF == 1) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the value
    // is known to be uniform after vectorization, this corresponds to lane zero
    // of the Part unroll iteration. Otherwise, the last instruction is the one
    // we created for the last vector lane of the Part unroll iteration.
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from undef.
      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
      VectorLoopValueMap.setVectorValue(V, Part, Undef);
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        packScalarIntoVectorValue(V, {Part, Lane});
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
    }
    // Restore the original insert point before returning.
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}
2025 
2026 Value *
2027 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2028                                             const VPIteration &Instance) {
2029   // If the value is not an instruction contained in the loop, it should
2030   // already be scalar.
2031   if (OrigLoop->isLoopInvariant(V))
2032     return V;
2033 
2034   assert(Instance.Lane > 0
2035              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2036              : true && "Uniform values only have lane zero");
2037 
2038   // If the value from the original loop has not been vectorized, it is
2039   // represented by UF x VF scalar values in the new loop. Return the requested
2040   // scalar value.
2041   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2042     return VectorLoopValueMap.getScalarValue(V, Instance);
2043 
2044   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2045   // for the given unroll part. If this entry is not a vector type (i.e., the
2046   // vectorization factor is one), there is no need to generate an
2047   // extractelement instruction.
2048   auto *U = getOrCreateVectorValue(V, Instance.Part);
2049   if (!U->getType()->isVectorTy()) {
2050     assert(VF == 1 && "Value not scalarized has non-vector type");
2051     return U;
2052   }
2053 
2054   // Otherwise, the value from the original loop has been vectorized and is
2055   // represented by UF vector values. Extract and return the requested scalar
2056   // value from the appropriate vector lane.
2057   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2058 }
2059 
2060 void InnerLoopVectorizer::packScalarIntoVectorValue(
2061     Value *V, const VPIteration &Instance) {
2062   assert(V != Induction && "The new induction variable should not be used.");
2063   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2064   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2065 
2066   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2067   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2068   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2069                                             Builder.getInt32(Instance.Lane));
2070   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2071 }
2072 
2073 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2074   assert(Vec->getType()->isVectorTy() && "Invalid type");
2075   SmallVector<Constant *, 8> ShuffleMask;
2076   for (unsigned i = 0; i < VF; ++i)
2077     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2078 
2079   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2080                                      ConstantVector::get(ShuffleMask),
2081                                      "reverse");
2082 }
2083 
2084 // Return whether we allow using masked interleave-groups (for dealing with
2085 // strided loads/stores that reside in predicated blocks, or for dealing
2086 // with gaps).
2087 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2088   // If an override option has been passed in for interleaved accesses, use it.
2089   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2090     return EnableMaskedInterleavedMemAccesses;
2091 
2092   return TTI.enableMaskedInterleavedAccessVectorization();
2093 }
2094 
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                                   VectorParts *BlockInMask) {
  const InterleaveGroup<Instruction> *Group =
      Cost->getInterleavedAccessGroup(Instr);
  assert(Group && "Fail to get an interleaved access group.");

  // Skip if current instruction is not the insert position.
  if (Instr != Group->getInsertPos())
    return;

  const DataLayout &DL = Instr->getModule()->getDataLayout();
  Value *Ptr = getLoadStorePointerOperand(Instr);

  // Prepare for the vector type of the interleaved load/store.
  // The wide access covers all InterleaveFactor members for VF iterations.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
  Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));

  // Prepare for the new pointers.
  setDebugLocFromInst(Builder, Ptr);
  SmallVector<Value *, 2> NewPtrs;
  unsigned Index = Group->getIndex(Instr);

  // If BlockInMask is non-null the access sits in a predicated block; the
  // per-lane condition mask (one value per unroll part) is shuffled below
  // into an interleaved mask.
  VectorParts Mask;
  bool IsMaskForCondRequired = BlockInMask;
  if (IsMaskForCondRequired) {
    Mask = *BlockInMask;
    // TODO: extend the masked interleaved-group support to reversed access.
    assert(!Group->isReverse() && "Reversed masked interleave-group "
                                  "not supported.");
  }

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

  // Remember whether the original address GEP was inbounds so the adjusted
  // pointers below can carry the same flag.
  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
    InBounds = gep->isInBounds();

  for (unsigned Part = 0; Part < UF; Part++) {
    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].
    NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
    if (InBounds)
      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);

    // Cast to the vector pointer type.
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  // When the group has gaps and a scalar epilogue is not available to cover
  // the trailing members, a gap mask disables the lanes that would touch
  // members with no corresponding instruction.
  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (IsMaskForCondRequired || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        // Combine (AND) the replicated condition mask with the gap mask when
        // both are present; either alone is used as-is.
        Value *GroupMask = MaskForGaps;
        if (IsMaskForCondRequired) {
          auto *Undefs = UndefValue::get(Mask[Part]->getType());
          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              Mask[Part], Undefs, RepMask, "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
                                     GroupMask, UndefVec, "wide.masked.vec");
      }
      else
        NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
                                            Group->getAlignment(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
      }
    }
    return;
  }

  // The sub vector type for current instruction.
  VectorType *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec = getOrCreateVectorValue(
          cast<StoreInst>(Member)->getValueOperand(), Part);
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
    Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                              "interleaved.vec");

    // Emit either a masked store (replicating the block mask across the
    // interleaved lanes) or a plain aligned store.
    Instruction *NewStoreInstr;
    if (IsMaskForCondRequired) {
      auto *Undefs = UndefValue::get(Mask[Part]->getType());
      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
      Value *ShuffledMask = Builder.CreateShuffleVector(
          Mask[Part], Undefs, RepMask, "interleaved.mask");
      NewStoreInstr = Builder.CreateMaskedStore(
          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
    }
    else
      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
        Group->getAlignment());

    Group->addMetadata(NewStoreInstr);
  }
}
2309 
// Widen a single load or store according to the cost model's decision:
// delegate to the interleave-group codegen, emit a consecutive (possibly
// reversed) wide access, or emit a gather/scatter. Scalarized accesses must
// not reach this function (see the assert on the decision below).
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                                     VectorParts *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
         "CM decision should be taken at this point");
  if (Decision == LoopVectorizationCostModel::CM_Interleave)
    return vectorizeInterleaveGroup(Instr);

  Type *ScalarDataTy = getMemInstValueType(Instr);
  Type *DataTy = VectorType::get(ScalarDataTy, VF);
  Value *Ptr = getLoadStorePointerOperand(Instr);
  unsigned Alignment = getLoadStoreAlignment(Instr);
  // An alignment of 0 means target abi alignment. We need to use the scalar's
  // target abi alignment in such a case.
  const DataLayout &DL = Instr->getModule()->getDataLayout();
  if (!Alignment)
    Alignment = DL.getABITypeAlignment(ScalarDataTy);
  unsigned AddressSpace = getLoadStoreAddressSpace(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");

  // Handle consecutive loads/stores.
  // A consecutive access uses the scalar pointer of part 0, lane 0; the
  // per-part pointers are derived from it in CreateVecPtr below.
  if (ConsecutiveStride)
    Ptr = getOrCreateScalarValue(Ptr, {0, 0});

  // A non-null BlockInMask means the access is predicated; Mask holds one
  // vector condition per unroll part.
  VectorParts Mask;
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    Mask = *BlockInMask;

  // Carry the inbounds flag of the original address GEP over to the
  // per-part GEPs created below.
  bool InBounds = false;
  if (auto *gep = dyn_cast<GetElementPtrInst>(
          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
    InBounds = gep->isInBounds();

  // Build the (bitcast) vector pointer for unroll part 'Part' of a
  // consecutive access. NOTE: for a reversed access this also reverses
  // Mask[Part] in place as a side effect.
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        Mask[Part] = reverseVector(Mask[Part]);
    } else {
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
      PartPtr->setIsInBounds(InBounds);
    }

    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
        Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, Ptr);
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            Mask[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
      Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, Ptr);
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
                                         UndefValue::get(DataTy),
                                         "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }
    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
  }
}
2448 
2449 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2450                                                const VPIteration &Instance,
2451                                                bool IfPredicateInstr) {
2452   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2453 
2454   setDebugLocFromInst(Builder, Instr);
2455 
2456   // Does this instruction return a value ?
2457   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2458 
2459   Instruction *Cloned = Instr->clone();
2460   if (!IsVoidRetTy)
2461     Cloned->setName(Instr->getName() + ".cloned");
2462 
2463   // Replace the operands of the cloned instructions with their scalar
2464   // equivalents in the new loop.
2465   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2466     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2467     Cloned->setOperand(op, NewOp);
2468   }
2469   addNewMetadata(Cloned, Instr);
2470 
2471   // Place the cloned scalar in the new loop.
2472   Builder.Insert(Cloned);
2473 
2474   // Add the cloned scalar to the scalar map entry.
2475   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2476 
2477   // If we just cloned a new assumption, add it the assumption cache.
2478   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2479     if (II->getIntrinsicID() == Intrinsic::assume)
2480       AC->registerAssumption(II);
2481 
2482   // End if-block.
2483   if (IfPredicateInstr)
2484     PredicatedInstructions.push_back(Cloned);
2485 }
2486 
2487 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2488                                                       Value *End, Value *Step,
2489                                                       Instruction *DL) {
2490   BasicBlock *Header = L->getHeader();
2491   BasicBlock *Latch = L->getLoopLatch();
2492   // As we're just creating this loop, it's possible no latch exists
2493   // yet. If so, use the header as this will be a single block loop.
2494   if (!Latch)
2495     Latch = Header;
2496 
2497   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2498   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2499   setDebugLocFromInst(Builder, OldInst);
2500   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2501 
2502   Builder.SetInsertPoint(Latch->getTerminator());
2503   setDebugLocFromInst(Builder, OldInst);
2504 
2505   // Create i+1 and fill the PHINode.
2506   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2507   Induction->addIncoming(Start, L->getLoopPreheader());
2508   Induction->addIncoming(Next, Latch);
2509   // Create the compare.
2510   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2511   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2512 
2513   // Now we have two terminators. Remove the old one from the block.
2514   Latch->getTerminator()->eraseFromParent();
2515 
2516   return Induction;
2517 }
2518 
// Compute (and cache in TripCount) the scalar trip count of \p L, i.e. the
// backedge-taken count plus one, expanded as IR in the loop preheader.
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  // Return the cached value from a previous call, if any.
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  // Widen a narrower count to the induction type (no-op if already wide).
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  // The expanded count may be pointer-typed; convert it to the integer
  // induction type so later arithmetic on it is well-formed.
  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}
2565 
2566 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2567   if (VectorTripCount)
2568     return VectorTripCount;
2569 
2570   Value *TC = getOrCreateTripCount(L);
2571   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2572 
2573   Type *Ty = TC->getType();
2574   Constant *Step = ConstantInt::get(Ty, VF * UF);
2575 
2576   // If the tail is to be folded by masking, round the number of iterations N
2577   // up to a multiple of Step instead of rounding down. This is done by first
2578   // adding Step-1 and then rounding down. Note that it's ok if this addition
2579   // overflows: the vector induction variable will eventually wrap to zero given
2580   // that it starts at zero and its Step is a power of two; the loop will then
2581   // exit, with the last early-exit vector comparison also producing all-true.
2582   if (Cost->foldTailByMasking()) {
2583     assert(isPowerOf2_32(VF * UF) &&
2584            "VF*UF must be a power of 2 when folding tail by masking");
2585     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2586   }
2587 
2588   // Now we need to generate the expression for the part of the loop that the
2589   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2590   // iterations are not required for correctness, or N - Step, otherwise. Step
2591   // is equal to the vectorization factor (number of SIMD elements) times the
2592   // unroll factor (number of SIMD instructions).
2593   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2594 
2595   // If there is a non-reversed interleaved group that may speculatively access
2596   // memory out-of-bounds, we need to ensure that there will be at least one
2597   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2598   // the trip count, we set the remainder to be equal to the step. If the step
2599   // does not evenly divide the trip count, no adjustment is necessary since
2600   // there will already be scalar iterations. Note that the minimum iterations
2601   // check ensures that N >= Step.
2602   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2603     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2604     R = Builder.CreateSelect(IsZero, Step, R);
2605   }
2606 
2607   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2608 
2609   return VectorTripCount;
2610 }
2611 
2612 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2613                                                    const DataLayout &DL) {
2614   // Verify that V is a vector type with same number of elements as DstVTy.
2615   unsigned VF = DstVTy->getNumElements();
2616   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2617   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2618   Type *SrcElemTy = SrcVecTy->getElementType();
2619   Type *DstElemTy = DstVTy->getElementType();
2620   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2621          "Vector elements must have same size");
2622 
2623   // Do a direct cast if element types are castable.
2624   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2625     return Builder.CreateBitOrPointerCast(V, DstVTy);
2626   }
2627   // V cannot be directly casted to desired vector type.
2628   // May happen when V is a floating point vector but DstVTy is a vector of
2629   // pointers or vice-versa. Handle this using a two-step bitcast using an
2630   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2631   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2632          "Only one type should be a pointer type");
2633   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2634          "Only one type should be a floating point type");
2635   Type *IntTy =
2636       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2637   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2638   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2639   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2640 }
2641 
2642 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2643                                                          BasicBlock *Bypass) {
2644   Value *Count = getOrCreateTripCount(L);
2645   BasicBlock *BB = L->getLoopPreheader();
2646   IRBuilder<> Builder(BB->getTerminator());
2647 
2648   // Generate code to check if the loop's trip count is less than VF * UF, or
2649   // equal to it in case a scalar epilogue is required; this implies that the
2650   // vector trip count is zero. This check also covers the case where adding one
2651   // to the backedge-taken count overflowed leading to an incorrect trip count
2652   // of zero. In this case we will also jump to the scalar loop.
2653   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2654                                           : ICmpInst::ICMP_ULT;
2655 
2656   // If tail is to be folded, vector loop takes care of all iterations.
2657   Value *CheckMinIters = Builder.getFalse();
2658   if (!Cost->foldTailByMasking())
2659     CheckMinIters = Builder.CreateICmp(
2660         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2661         "min.iters.check");
2662 
2663   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2664   // Update dominator tree immediately if the generated block is a
2665   // LoopBypassBlock because SCEV expansions to generate loop bypass
2666   // checks may query it before the current function is finished.
2667   DT->addNewBlock(NewBB, BB);
2668   if (L->getParentLoop())
2669     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2670   ReplaceInstWithInst(BB->getTerminator(),
2671                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2672   LoopBypassBlocks.push_back(BB);
2673 }
2674 
2675 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2676   BasicBlock *BB = L->getLoopPreheader();
2677 
2678   // Generate the code to check that the SCEV assumptions that we made.
2679   // We want the new basic block to start at the first instruction in a
2680   // sequence of instructions that form a check.
2681   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2682                    "scev.check");
2683   Value *SCEVCheck =
2684       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2685 
2686   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2687     if (C->isZero())
2688       return;
2689 
2690   assert(!Cost->foldTailByMasking() &&
2691          "Cannot SCEV check stride or overflow when folding tail");
2692   // Create a new block containing the stride check.
2693   BB->setName("vector.scevcheck");
2694   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2695   // Update dominator tree immediately if the generated block is a
2696   // LoopBypassBlock because SCEV expansions to generate loop bypass
2697   // checks may query it before the current function is finished.
2698   DT->addNewBlock(NewBB, BB);
2699   if (L->getParentLoop())
2700     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2701   ReplaceInstWithInst(BB->getTerminator(),
2702                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2703   LoopBypassBlocks.push_back(BB);
2704   AddedSafetyChecks = true;
2705 }
2706 
2707 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2708   // VPlan-native path does not do any analysis for runtime checks currently.
2709   if (EnableVPlanNativePath)
2710     return;
2711 
2712   BasicBlock *BB = L->getLoopPreheader();
2713 
2714   // Generate the code that checks in runtime if arrays overlap. We put the
2715   // checks into a separate block to make the more common case of few elements
2716   // faster.
2717   Instruction *FirstCheckInst;
2718   Instruction *MemRuntimeCheck;
2719   std::tie(FirstCheckInst, MemRuntimeCheck) =
2720       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2721   if (!MemRuntimeCheck)
2722     return;
2723 
2724   assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2725   // Create a new block containing the memory check.
2726   BB->setName("vector.memcheck");
2727   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2728   // Update dominator tree immediately if the generated block is a
2729   // LoopBypassBlock because SCEV expansions to generate loop bypass
2730   // checks may query it before the current function is finished.
2731   DT->addNewBlock(NewBB, BB);
2732   if (L->getParentLoop())
2733     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2734   ReplaceInstWithInst(BB->getTerminator(),
2735                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2736   LoopBypassBlocks.push_back(BB);
2737   AddedSafetyChecks = true;
2738 
2739   // We currently don't use LoopVersioning for the actual loop cloning but we
2740   // still use it to add the noalias metadata.
2741   LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2742                                            PSE.getSE());
2743   LVer->prepareNoAliasMetadata();
2744 }
2745 
2746 Value *InnerLoopVectorizer::emitTransformedIndex(
2747     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2748     const InductionDescriptor &ID) const {
2749 
2750   SCEVExpander Exp(*SE, DL, "induction");
2751   auto Step = ID.getStep();
2752   auto StartValue = ID.getStartValue();
2753   assert(Index->getType() == Step->getType() &&
2754          "Index type does not match StepValue type");
2755 
2756   // Note: the IR at this point is broken. We cannot use SE to create any new
2757   // SCEV and then expand it, hoping that SCEV's simplification will give us
2758   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2759   // lead to various SCEV crashes. So all we can do is to use builder and rely
2760   // on InstCombine for future simplifications. Here we handle some trivial
2761   // cases only.
2762   auto CreateAdd = [&B](Value *X, Value *Y) {
2763     assert(X->getType() == Y->getType() && "Types don't match!");
2764     if (auto *CX = dyn_cast<ConstantInt>(X))
2765       if (CX->isZero())
2766         return Y;
2767     if (auto *CY = dyn_cast<ConstantInt>(Y))
2768       if (CY->isZero())
2769         return X;
2770     return B.CreateAdd(X, Y);
2771   };
2772 
2773   auto CreateMul = [&B](Value *X, Value *Y) {
2774     assert(X->getType() == Y->getType() && "Types don't match!");
2775     if (auto *CX = dyn_cast<ConstantInt>(X))
2776       if (CX->isOne())
2777         return Y;
2778     if (auto *CY = dyn_cast<ConstantInt>(Y))
2779       if (CY->isOne())
2780         return X;
2781     return B.CreateMul(X, Y);
2782   };
2783 
2784   switch (ID.getKind()) {
2785   case InductionDescriptor::IK_IntInduction: {
2786     assert(Index->getType() == StartValue->getType() &&
2787            "Index type does not match StartValue type");
2788     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2789       return B.CreateSub(StartValue, Index);
2790     auto *Offset = CreateMul(
2791         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2792     return CreateAdd(StartValue, Offset);
2793   }
2794   case InductionDescriptor::IK_PtrInduction: {
2795     assert(isa<SCEVConstant>(Step) &&
2796            "Expected constant step for pointer induction");
2797     return B.CreateGEP(
2798         StartValue->getType()->getPointerElementType(), StartValue,
2799         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2800                                            &*B.GetInsertPoint())));
2801   }
2802   case InductionDescriptor::IK_FpInduction: {
2803     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2804     auto InductionBinOp = ID.getInductionBinOp();
2805     assert(InductionBinOp &&
2806            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2807             InductionBinOp->getOpcode() == Instruction::FSub) &&
2808            "Original bin op should be defined for FP induction");
2809 
2810     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2811 
2812     // Floating point operations had to be 'fast' to enable the induction.
2813     FastMathFlags Flags;
2814     Flags.setFast();
2815 
2816     Value *MulExp = B.CreateFMul(StepValue, Index);
2817     if (isa<Instruction>(MulExp))
2818       // We have to check, the MulExp may be a constant.
2819       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2820 
2821     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2822                                "induction");
2823     if (isa<Instruction>(BOp))
2824       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2825 
2826     return BOp;
2827   }
2828   case InductionDescriptor::IK_NoInduction:
2829     return nullptr;
2830   }
2831   llvm_unreachable("invalid enum");
2832 }
2833 
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  |   -[ ]   <--- middle-block.
  |  /  |
  | /   v
  -|- >[ ]     <--- new preheader.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]     <-- exit block.
   ...
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  // Capture the original loop metadata before any blocks are split, so that
  // follow-up metadata can be derived from it further down.
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  assert(VectorPH && "Invalid loop structure");
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VecBody =
      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(VecBody, *LI);

  // Find the loop boundaries.
  Value *Count = getOrCreateTripCount(Lp);

  // The vector loop's canonical induction variable counts up from zero.
  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, ScalarPH);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, ScalarPH);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, ScalarPH);

  // Generate the induction variable.
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop. It is used
  // to test if there are any tail iterations left once the vector loop has
  // completed.
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (auto &InductionEntry : *List) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the  backedge-taken check block.
    // The "3" is only a size hint for the expected number of incoming values;
    // further bypass edges may be added below.
    PHINode *BCResumeVal = PHINode::Create(
        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    if (OrigPhi == OldInduction) {
      // We know what the end value is.
      EndValue = CountRoundDown;
    } else {
      // For any other induction, compute its end value by transforming the
      // (cast) vector trip count through the induction descriptor.
      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
        CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
      const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, MiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);
    OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
  }

  // We need the OrigLoop (scalar loop part) latch terminator to help
  // produce correct debug info for the middle block BB instructions.
  // The legality check stage guarantees that the loop will have a single
  // latch.
  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
         "Scalar loop latch terminator isn't a branch");
  BranchInst *ScalarLatchBr =
      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  // If tail is to be folded, we know we don't need to run the remainder.
  Value *CmpN = Builder.getTrue();
  if (!Cost->foldTailByMasking()) {
    CmpN =
        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch branch instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
  }

  BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
  ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());

  // Save the state.
  LoopVectorPreHeader = Lp->getLoopPreheader();
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;

  // Derive the vector loop's metadata from any llvm.loop.vectorize.followup
  // attributes present on the original loop.
  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    Lp->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    Lp->setLoopID(LID);

  LoopVectorizeHints Hints(Lp, true, *ORE);
  Hints.setAlreadyVectorized();

  return LoopVectorPreHeader;
}
3051 
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
//
// \p OrigPhi is the scalar loop's IV phi, \p II its induction descriptor,
// \p CountRoundDown the vector trip count, \p EndValue the IV's value after
// the vector loop, and \p MiddleBlock the block between the vector loop and
// the scalar remainder.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  // Maps each external user to the value it must receive when control arrives
  // from the middle block.
  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value need to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      // Cast CRD - 1 to the step's type: SIToFP for FP inductions, otherwise
      // a sign-extend/truncate as needed.
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}
3116 
3117 namespace {
3118 
3119 struct CSEDenseMapInfo {
3120   static bool canHandle(const Instruction *I) {
3121     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3122            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3123   }
3124 
3125   static inline Instruction *getEmptyKey() {
3126     return DenseMapInfo<Instruction *>::getEmptyKey();
3127   }
3128 
3129   static inline Instruction *getTombstoneKey() {
3130     return DenseMapInfo<Instruction *>::getTombstoneKey();
3131   }
3132 
3133   static unsigned getHashValue(const Instruction *I) {
3134     assert(canHandle(I) && "Unknown instruction!");
3135     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3136                                                            I->value_op_end()));
3137   }
3138 
3139   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3140     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3141         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3142       return LHS == RHS;
3143     return LHS->isIdenticalTo(RHS);
3144   }
3145 };
3146 
3147 } // end anonymous namespace
3148 
3149 ///Perform cse of induction variable instructions.
3150 static void cse(BasicBlock *BB) {
3151   // Perform simple cse.
3152   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3153   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3154     Instruction *In = &*I++;
3155 
3156     if (!CSEDenseMapInfo::canHandle(In))
3157       continue;
3158 
3159     // Check if we can replace this instruction with any of the
3160     // visited instructions.
3161     if (Instruction *V = CSEMap.lookup(In)) {
3162       In->replaceAllUsesWith(V);
3163       In->eraseFromParent();
3164       continue;
3165     }
3166 
3167     CSEMap[In] = In;
3168   }
3169 }
3170 
3171 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3172                                                        unsigned VF,
3173                                                        bool &NeedToScalarize) {
3174   Function *F = CI->getCalledFunction();
3175   StringRef FnName = CI->getCalledFunction()->getName();
3176   Type *ScalarRetTy = CI->getType();
3177   SmallVector<Type *, 4> Tys, ScalarTys;
3178   for (auto &ArgOp : CI->arg_operands())
3179     ScalarTys.push_back(ArgOp->getType());
3180 
3181   // Estimate cost of scalarized vector call. The source operands are assumed
3182   // to be vectors, so we need to extract individual elements from there,
3183   // execute VF scalar calls, and then gather the result into the vector return
3184   // value.
3185   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3186   if (VF == 1)
3187     return ScalarCallCost;
3188 
3189   // Compute corresponding vector type for return value and arguments.
3190   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3191   for (Type *ScalarTy : ScalarTys)
3192     Tys.push_back(ToVectorTy(ScalarTy, VF));
3193 
3194   // Compute costs of unpacking argument values for the scalar calls and
3195   // packing the return values to a vector.
3196   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3197 
3198   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3199 
3200   // If we can't emit a vector call for this function, then the currently found
3201   // cost is the cost we need to return.
3202   NeedToScalarize = true;
3203   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3204     return Cost;
3205 
3206   // If the corresponding vector cost is cheaper, return its cost.
3207   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3208   if (VectorCallCost < Cost) {
3209     NeedToScalarize = false;
3210     return VectorCallCost;
3211   }
3212   return Cost;
3213 }
3214 
3215 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3216                                                             unsigned VF) {
3217   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3218   assert(ID && "Expected intrinsic call!");
3219 
3220   FastMathFlags FMF;
3221   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3222     FMF = FPMO->getFastMathFlags();
3223 
3224   SmallVector<Value *, 4> Operands(CI->arg_operands());
3225   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3226 }
3227 
3228 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3229   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3230   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3231   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3232 }
3233 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3234   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3235   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3236   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3237 }
3238 
void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  //
  // Track values already erased so later parts don't touch freed memory.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      // Skip values already erased, dead values, and non-instructions.
      if (Erased.find(I) != Erased.end() || I->use_empty() ||
          !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      // KV.second is the minimal bit width the cost model computed for I.
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
                                          OriginalTy->getVectorNumElements());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      // Shrink an operand to TruncatedTy; if it is a zext from exactly that
      // type, peel the zext off instead of stacking a trunc on top of it.
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        // Only the selected values shrink; the condition keeps its type.
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // Shuffle operands may have different element counts; shrink each to
        // a vector of the truncated scalar type with its own element count.
        auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      // An unused zext left over from the rewrite above is dropped, and the
      // map is pointed at its (already truncated) operand instead.
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
      }
    }
  }
}
3364 
3365 void InnerLoopVectorizer::fixVectorizedLoop() {
3366   // Insert truncates and extends for any truncated instructions as hints to
3367   // InstCombine.
3368   if (VF > 1)
3369     truncateToMinimalBitwidths();
3370 
3371   // Fix widened non-induction PHIs by setting up the PHI operands.
3372   if (OrigPHIsToFix.size()) {
3373     assert(EnableVPlanNativePath &&
3374            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3375     fixNonInductionPHIs();
3376   }
3377 
3378   // At this point every instruction in the original loop is widened to a
3379   // vector form. Now we need to fix the recurrences in the loop. These PHI
3380   // nodes are currently empty because we did not want to introduce cycles.
3381   // This is the second stage of vectorizing recurrences.
3382   fixCrossIterationPHIs();
3383 
3384   // Update the dominator tree.
3385   //
3386   // FIXME: After creating the structure of the new loop, the dominator tree is
3387   //        no longer up-to-date, and it remains that way until we update it
3388   //        here. An out-of-date dominator tree is problematic for SCEV,
3389   //        because SCEVExpander uses it to guide code generation. The
3390   //        vectorizer use SCEVExpanders in several places. Instead, we should
3391   //        keep the dominator tree up-to-date as we go.
3392   updateAnalysis();
3393 
3394   // Fix-up external users of the induction variables.
3395   for (auto &Entry : *Legal->getInductionVars())
3396     fixupIVUsers(Entry.first, Entry.second,
3397                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3398                  IVEndValues[Entry.first], LoopMiddleBlock);
3399 
3400   fixLCSSAPHIs();
3401   for (Instruction *PI : PredicatedInstructions)
3402     sinkScalarOperands(&*PI);
3403 
3404   // Remove redundant induction instructions.
3405   cse(LoopVectorBody);
3406 }
3407 
3408 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3409   // In order to support recurrences we need to be able to vectorize Phi nodes.
3410   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3411   // stage #2: We now need to fix the recurrences by adding incoming edges to
3412   // the currently empty PHI nodes. At this point every instruction in the
3413   // original loop is widened to a vector form so we can use them to construct
3414   // the incoming edges.
3415   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3416     // Handle first-order recurrences and reductions that need to be fixed.
3417     if (Legal->isFirstOrderRecurrence(&Phi))
3418       fixFirstOrderRecurrence(&Phi);
3419     else if (Legal->isReductionVariable(&Phi))
3420       fixReduction(&Phi);
3421   }
3422 }
3423 
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value. Only the last lane matters (the
  // rest are shuffled away below), so the other lanes are left undef.
  auto *VectorInit = ScalarInit;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    VectorInit = Builder.CreateInsertElement(
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
        Builder.getInt32(VF - 1), "vector.recur.init");
  }

  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

  // Set the insertion point after the previous value if it is an instruction.
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop. Also, if the previous
  // value is a phi node, we should insert after all the phi nodes to avoid
  // breaking basic block verification.
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
      isa<PHINode>(PreviousLastPart))
    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
  else
    Builder.SetInsertPoint(
        &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));

  // We will construct a vector for the recurrence by combining the values for
  // the current and previous iterations. This is the required shuffle mask:
  // it selects the last element of the first operand followed by the first
  // VF - 1 elements of the second operand (a "concatenate and slide" shuffle).
  SmallVector<Constant *, 8> ShuffleMask(VF);
  ShuffleMask[0] = Builder.getInt32(VF - 1);
  for (unsigned I = 1; I < VF; ++I)
    ShuffleMask[I] = Builder.getInt32(I + VF - 1);

  // The vector from which to take the initial value for the current iteration
  // (actual or unrolled). Initially, this is the vector phi node.
  Value *Incoming = VecPhi;

  // Shuffle the current and previous vector and update the vector parts.
  // Each temporary phi from phase one is replaced by the shuffle and erased.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
    auto *Shuffle =
        VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
                                             ConstantVector::get(ShuffleMask))
               : Incoming;
    PhiPart->replaceAllUsesWith(Shuffle);
    cast<Instruction>(PhiPart)->eraseFromParent();
    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
    Incoming = PreviousPart;
  }

  // Fix the latch value of the new recurrence in the vector loop.
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  auto *ExtractForScalar = Incoming;
  if (VF > 1) {
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    ExtractForScalar = Builder.CreateExtractElement(
        ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF > 1)
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  // The scalar preheader may be reached either from the middle block (use the
  // extracted value) or from a bypass block (use the original initial value).
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}
3591 
// Complete the vectorization of a reduction phi: wire up the vector-loop phi
// parts, reduce the unrolled parts to a single value in the middle block, and
// fix up users of the reduction inside the scalar remainder loop and in the
// (LCSSA) exit block.
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  Constant *Zero = Builder.getInt32(0);

  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(Phi) &&
         "Unable to find the reduction variable");
  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];

  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
    RdxDesc.getMinMaxRecurrenceKind();
  setDebugLocFromInst(Builder, ReductionStartValue);

  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we need to generate the 'identity' vector and override
  // one of the elements with the incoming scalar reduction. We need
  // to do it in the vector-loop preheader.
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());

  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();

  // Find the reduction identity variable. Zero for addition, or, xor,
  // one for multiplication, -1 for And.
  Value *Identity;
  Value *VectorStart;
  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
      RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity: combining any
    // value with the start value via min/max never loses information.
    if (VF == 1) {
      VectorStart = Identity = ReductionStartValue;
    } else {
      VectorStart = Identity =
        Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
    }
  } else {
    // Handle other reduction kinds:
    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
        RK, VecTy->getScalarType());
    if (VF == 1) {
      Identity = Iden;
      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart = ReductionStartValue;
    } else {
      Identity = ConstantVector::getSplat(VF, Iden);

      // This vector is the Identity vector where the first element is the
      // incoming scalar reduction.
      VectorStart =
        Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
    }
  }

  // Fix the vector-loop phi.

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  BasicBlock *Latch = OrigLoop->getLoopLatch();
  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
    Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part; the other parts start at the identity so the
    // final combined result counts the start value exactly once.
    Value *StartVal = (Part == 0) ? VectorStart : Identity;
    cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
    cast<PHINode>(VecRdxPhi)
      ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  }

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(Builder, LoopExitInst);

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      // Redirect all users except the truncate itself to the extended value,
      // so in-loop users keep the wide type while the reduction narrows.
      for (Value::user_iterator UI = RdxParts[Part]->user_begin();
           UI != RdxParts[Part]->user_end();)
        if (*UI != Trunc) {
          (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        } else {
          ++UI;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
  for (unsigned Part = 1; Part < UF; ++Part) {
    Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
      // Floating point operations had to be 'fast' to enable the reduction.
      ReducedPartRdx = addFastMathFlag(
          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
                              ReducedPartRdx, "bin.rdx"),
          RdxDesc.getFastMathFlags());
    else
      ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                      RdxPart);
  }

  if (VF > 1) {
    bool NoNaN = Legal->hasFunNoNaNAttr();
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (Phi->getType() != RdxDesc.getRecurrenceType())
      ReducedPartRdx =
        RdxDesc.isSigned()
        ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
        : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.
  // We know that the loop is in LCSSA form. We need to update the
  // PHI nodes in the exit blocks.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    // All PHINodes need to have a single entry edge, or two if
    // we already fixed them.
    assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

    // We found a reduction value exit-PHI. Update it with the
    // incoming bypass edge.
    if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
      LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
  } // end of the LCSSA phi scan.

    // Fix the scalar loop reduction variable with the incoming reduction sum
    // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
    Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
3773 
3774 void InnerLoopVectorizer::fixLCSSAPHIs() {
3775   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3776     if (LCSSAPhi.getNumIncomingValues() == 1) {
3777       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3778       // Non-instruction incoming values will have only one value.
3779       unsigned LastLane = 0;
3780       if (isa<Instruction>(IncomingValue))
3781           LastLane = Cost->isUniformAfterVectorization(
3782                          cast<Instruction>(IncomingValue), VF)
3783                          ? 0
3784                          : VF - 1;
3785       // Can be a loop invariant incoming value or the last scalar value to be
3786       // extracted from the vectorized loop.
3787       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3788       Value *lastIncomingValue =
3789           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3790       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3791     }
3792   }
3793 }
3794 
// Sink scalarized operands of a predicated instruction into its predicated
// block, iterating to a fixed point so that chains of single-use operands are
// sunk as well.
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}
3858 
3859 void InnerLoopVectorizer::fixNonInductionPHIs() {
3860   for (PHINode *OrigPhi : OrigPHIsToFix) {
3861     PHINode *NewPhi =
3862         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3863     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3864 
3865     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3866         predecessors(OrigPhi->getParent()));
3867     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3868         predecessors(NewPhi->getParent()));
3869     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3870            "Scalar and Vector BB should have the same number of predecessors");
3871 
3872     // The insertion point in Builder may be invalidated by the time we get
3873     // here. Force the Builder insertion point to something valid so that we do
3874     // not run into issues during insertion point restore in
3875     // getOrCreateVectorValue calls below.
3876     Builder.SetInsertPoint(NewPhi);
3877 
3878     // The predecessor order is preserved and we can rely on mapping between
3879     // scalar and vector block predecessors.
3880     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3881       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3882 
3883       // When looking up the new scalar/vector values to fix up, use incoming
3884       // values from original phi.
3885       Value *ScIncV =
3886           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3887 
3888       // Scalar incoming value may need a broadcast
3889       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3890       NewPhi->addIncoming(NewIncV, NewPredBB);
3891     }
3892   }
3893 }
3894 
// Widen a header phi. Reductions and first-order recurrences get empty
// placeholder vector phis (stage #1; edges are filled in later by
// fixCrossIterationPHIs), and pointer inductions are expanded to scalar GEPs.
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                              unsigned VF) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy =
        (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
    // Remember the phi so fixNonInductionPHIs can add its incoming edges.
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      // This is phase one of vectorizing PHIs.
      Type *VecTy =
          (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
      Value *EntryPart = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
    }
    return;
  }

  setDebugLocFromInst(Builder, P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars()->count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars()->lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");
    // This is the normalized GEP that starts counting at zero.
    Value *PtrInd = Induction;
    PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
    // Determine the number of scalars we need to generate for each unroll
    // iteration. If the instruction is uniform, we only need to generate the
    // first lane. Otherwise, we generate all VF values.
    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
    // These are the scalar results. Notice that we don't generate vector GEPs
    // because scalar GEPs result in better code.
    for (unsigned Part = 0; Part < UF; ++Part) {
      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
        // The scalar index for this (part, lane) relative to the canonical
        // induction; emitTransformedIndex maps it onto the pointer sequence.
        Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
        Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
        Value *SclrGep =
            emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
        SclrGep->setName("next.gep");
        VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
      }
    }
    return;
  }
  }
}
3974 
3975 /// A helper function for checking whether an integer division-related
3976 /// instruction may divide by zero (in which case it must be predicated if
3977 /// executed conditionally in the scalar code).
3978 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
3979 /// Non-zero divisors that are non compile-time constants will not be
3980 /// converted into multiplication, so we will still end up scalarizing
3981 /// the division, but can do so w/o predication.
3982 static bool mayDivideByZero(Instruction &I) {
3983   assert((I.getOpcode() == Instruction::UDiv ||
3984           I.getOpcode() == Instruction::SDiv ||
3985           I.getOpcode() == Instruction::URem ||
3986           I.getOpcode() == Instruction::SRem) &&
3987          "Unexpected instruction");
3988   Value *Divisor = I.getOperand(1);
3989   auto *CInt = dyn_cast<ConstantInt>(Divisor);
3990   return !CInt || CInt->isZero();
3991 }
3992 
3993 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3994   switch (I.getOpcode()) {
3995   case Instruction::Br:
3996   case Instruction::PHI:
3997     llvm_unreachable("This instruction is handled by a different recipe.");
3998   case Instruction::GetElementPtr: {
3999     // Construct a vector GEP by widening the operands of the scalar GEP as
4000     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4001     // results in a vector of pointers when at least one operand of the GEP
4002     // is vector-typed. Thus, to keep the representation compact, we only use
4003     // vector-typed operands for loop-varying values.
4004     auto *GEP = cast<GetElementPtrInst>(&I);
4005 
4006     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4007       // If we are vectorizing, but the GEP has only loop-invariant operands,
4008       // the GEP we build (by only using vector-typed operands for
4009       // loop-varying values) would be a scalar pointer. Thus, to ensure we
4010       // produce a vector of pointers, we need to either arbitrarily pick an
4011       // operand to broadcast, or broadcast a clone of the original GEP.
4012       // Here, we broadcast a clone of the original.
4013       //
4014       // TODO: If at some point we decide to scalarize instructions having
4015       //       loop-invariant operands, this special case will no longer be
4016       //       required. We would add the scalarization decision to
4017       //       collectLoopScalars() and teach getVectorValue() to broadcast
4018       //       the lane-zero scalar value.
4019       auto *Clone = Builder.Insert(GEP->clone());
4020       for (unsigned Part = 0; Part < UF; ++Part) {
4021         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4022         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4023         addMetadata(EntryPart, GEP);
4024       }
4025     } else {
4026       // If the GEP has at least one loop-varying operand, we are sure to
4027       // produce a vector of pointers. But if we are only unrolling, we want
4028       // to produce a scalar GEP for each unroll part. Thus, the GEP we
4029       // produce with the code below will be scalar (if VF == 1) or vector
4030       // (otherwise). Note that for the unroll-only case, we still maintain
4031       // values in the vector mapping with initVector, as we do for other
4032       // instructions.
4033       for (unsigned Part = 0; Part < UF; ++Part) {
4034         // The pointer operand of the new GEP. If it's loop-invariant, we
4035         // won't broadcast it.
4036         auto *Ptr =
4037             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4038                 ? GEP->getPointerOperand()
4039                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4040 
4041         // Collect all the indices for the new GEP. If any index is
4042         // loop-invariant, we won't broadcast it.
4043         SmallVector<Value *, 4> Indices;
4044         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4045           if (OrigLoop->isLoopInvariant(U.get()))
4046             Indices.push_back(U.get());
4047           else
4048             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4049         }
4050 
4051         // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4052         // but it should be a vector, otherwise.
4053         auto *NewGEP =
4054             GEP->isInBounds()
4055                 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4056                                             Indices)
4057                 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4058         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4059                "NewGEP is not a pointer vector");
4060         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4061         addMetadata(NewGEP, GEP);
4062       }
4063     }
4064 
4065     break;
4066   }
4067   case Instruction::UDiv:
4068   case Instruction::SDiv:
4069   case Instruction::SRem:
4070   case Instruction::URem:
4071   case Instruction::Add:
4072   case Instruction::FAdd:
4073   case Instruction::Sub:
4074   case Instruction::FSub:
4075   case Instruction::FNeg:
4076   case Instruction::Mul:
4077   case Instruction::FMul:
4078   case Instruction::FDiv:
4079   case Instruction::FRem:
4080   case Instruction::Shl:
4081   case Instruction::LShr:
4082   case Instruction::AShr:
4083   case Instruction::And:
4084   case Instruction::Or:
4085   case Instruction::Xor: {
4086     // Just widen unops and binops.
4087     setDebugLocFromInst(Builder, &I);
4088 
4089     for (unsigned Part = 0; Part < UF; ++Part) {
4090       SmallVector<Value *, 2> Ops;
4091       for (Value *Op : I.operands())
4092         Ops.push_back(getOrCreateVectorValue(Op, Part));
4093 
4094       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4095 
4096       if (auto *VecOp = dyn_cast<Instruction>(V))
4097         VecOp->copyIRFlags(&I);
4098 
4099       // Use this vector value for all users of the original instruction.
4100       VectorLoopValueMap.setVectorValue(&I, Part, V);
4101       addMetadata(V, &I);
4102     }
4103 
4104     break;
4105   }
4106   case Instruction::Select: {
4107     // Widen selects.
4108     // If the selector is loop invariant we can create a select
4109     // instruction with a scalar condition. Otherwise, use vector-select.
4110     auto *SE = PSE.getSE();
4111     bool InvariantCond =
4112         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4113     setDebugLocFromInst(Builder, &I);
4114 
4115     // The condition can be loop invariant  but still defined inside the
4116     // loop. This means that we can't just use the original 'cond' value.
4117     // We have to take the 'vectorized' value and pick the first lane.
4118     // Instcombine will make this a no-op.
4119 
4120     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4121 
4122     for (unsigned Part = 0; Part < UF; ++Part) {
4123       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4124       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4125       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4126       Value *Sel =
4127           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4128       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4129       addMetadata(Sel, &I);
4130     }
4131 
4132     break;
4133   }
4134 
4135   case Instruction::ICmp:
4136   case Instruction::FCmp: {
4137     // Widen compares. Generate vector compares.
4138     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4139     auto *Cmp = dyn_cast<CmpInst>(&I);
4140     setDebugLocFromInst(Builder, Cmp);
4141     for (unsigned Part = 0; Part < UF; ++Part) {
4142       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4143       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4144       Value *C = nullptr;
4145       if (FCmp) {
4146         // Propagate fast math flags.
4147         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4148         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4149         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4150       } else {
4151         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4152       }
4153       VectorLoopValueMap.setVectorValue(&I, Part, C);
4154       addMetadata(C, &I);
4155     }
4156 
4157     break;
4158   }
4159 
4160   case Instruction::ZExt:
4161   case Instruction::SExt:
4162   case Instruction::FPToUI:
4163   case Instruction::FPToSI:
4164   case Instruction::FPExt:
4165   case Instruction::PtrToInt:
4166   case Instruction::IntToPtr:
4167   case Instruction::SIToFP:
4168   case Instruction::UIToFP:
4169   case Instruction::Trunc:
4170   case Instruction::FPTrunc:
4171   case Instruction::BitCast: {
4172     auto *CI = dyn_cast<CastInst>(&I);
4173     setDebugLocFromInst(Builder, CI);
4174 
4175     /// Vectorize casts.
4176     Type *DestTy =
4177         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4178 
4179     for (unsigned Part = 0; Part < UF; ++Part) {
4180       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4181       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4182       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4183       addMetadata(Cast, &I);
4184     }
4185     break;
4186   }
4187 
4188   case Instruction::Call: {
4189     // Ignore dbg intrinsics.
4190     if (isa<DbgInfoIntrinsic>(I))
4191       break;
4192     setDebugLocFromInst(Builder, &I);
4193 
4194     Module *M = I.getParent()->getParent()->getParent();
4195     auto *CI = cast<CallInst>(&I);
4196 
4197     StringRef FnName = CI->getCalledFunction()->getName();
4198     Function *F = CI->getCalledFunction();
4199     Type *RetTy = ToVectorTy(CI->getType(), VF);
4200     SmallVector<Type *, 4> Tys;
4201     for (Value *ArgOperand : CI->arg_operands())
4202       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4203 
4204     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4205 
4206     // The flag shows whether we use Intrinsic or a usual Call for vectorized
4207     // version of the instruction.
4208     // Is it beneficial to perform intrinsic call compared to lib call?
4209     bool NeedToScalarize;
4210     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4211     bool UseVectorIntrinsic =
4212         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4213     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4214            "Instruction should be scalarized elsewhere.");
4215 
4216     for (unsigned Part = 0; Part < UF; ++Part) {
4217       SmallVector<Value *, 4> Args;
4218       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4219         Value *Arg = CI->getArgOperand(i);
4220         // Some intrinsics have a scalar argument - don't replace it with a
4221         // vector.
4222         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4223           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4224         Args.push_back(Arg);
4225       }
4226 
4227       Function *VectorF;
4228       if (UseVectorIntrinsic) {
4229         // Use vector version of the intrinsic.
4230         Type *TysForDecl[] = {CI->getType()};
4231         if (VF > 1)
4232           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4233         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4234       } else {
4235         // Use vector version of the library call.
4236         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4237         assert(!VFnName.empty() && "Vector function name is empty.");
4238         VectorF = M->getFunction(VFnName);
4239         if (!VectorF) {
4240           // Generate a declaration
4241           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4242           VectorF =
4243               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4244           VectorF->copyAttributesFrom(F);
4245         }
4246       }
4247       assert(VectorF && "Can't create vector function.");
4248 
4249       SmallVector<OperandBundleDef, 1> OpBundles;
4250       CI->getOperandBundlesAsDefs(OpBundles);
4251       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4252 
4253       if (isa<FPMathOperator>(V))
4254         V->copyFastMathFlags(CI);
4255 
4256       VectorLoopValueMap.setVectorValue(&I, Part, V);
4257       addMetadata(V, &I);
4258     }
4259 
4260     break;
4261   }
4262 
4263   default:
4264     // This instruction is not vectorized by simple widening.
4265     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4266     llvm_unreachable("Unhandled instruction!");
4267   } // end of switch.
4268 }
4269 
// Bring the cached analyses back in sync with the transformed CFG: invalidate
// SCEV results for the original loop and patch the dominator tree to cover
// the newly created blocks.
void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  // SCEV caches trip counts and exit values for OrigLoop; they are stale now
  // that the loop has been rewritten.
  PSE.getSE()->forgetLoop(OrigLoop);

  // DT is not kept up-to-date for outer loop vectorization
  if (EnableVPlanNativePath)
    return;

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  // Middle block is dominated by the vector loop's latch; the scalar
  // preheader and the exit block are dominated by the first bypass block.
  // NOTE: addNewBlock/changeImmediateDominator calls must happen in this
  // order — each block's idom must already be present in the tree.
  DT->addNewBlock(LoopMiddleBlock,
                  LI->getLoopFor(LoopVectorBody)->getLoopLatch());
  DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
}
4289 
// Populate Scalars[VF] with the instructions that will remain scalar (i.e.
// will not be widened) when vectorizing with the given VF. This is a
// fixed-point worklist computation seeded with uniform instructions, scalar
// memory-access pointers, and pointer inductions, then expanded through
// loop-varying bitcasts/GEPs and induction-variable cycles.
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use
  // will be a scalar use, and the pointer is only used by memory accesses, we
  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar use,
  // and (3) pointer induction variables and their update instructions (we
  // currently only scalarize these).
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory accesses is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        // A store constrains both its pointer operand and its value operand.
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // Only pointers that were never seen in a possibly-vector context are
  // definitely scalar.
  for (auto *I : ScalarPtrs)
    if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // (3) Add to the worklist all pointer induction variables and their update
  // instructions.
  //
  // TODO: Once we are able to vectorize pointer induction variables we should
  //       no longer insert them into the worklist here.
  auto *Latch = TheLoop->getLoopLatch();
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
      continue;
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  // Note: Worklist may grow while iterating, so the loop bound is re-read.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    // Src becomes scalar only if every in-loop user is already scalar or is a
    // memory access that uses Src as a scalar operand.
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // We already considered pointer induction variables, so there's no reason
    // to look at their users again.
    //
    // TODO: Once we are able to vectorize pointer induction variables we
    //       should no longer skip over them here.
    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
4475 
4476 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4477   if (!blockNeedsPredication(I->getParent()))
4478     return false;
4479   switch(I->getOpcode()) {
4480   default:
4481     break;
4482   case Instruction::Load:
4483   case Instruction::Store: {
4484     if (!Legal->isMaskRequired(I))
4485       return false;
4486     auto *Ptr = getLoadStorePointerOperand(I);
4487     auto *Ty = getMemInstValueType(I);
4488     // We have already decided how to vectorize this instruction, get that
4489     // result.
4490     if (VF > 1) {
4491       InstWidening WideningDecision = getWideningDecision(I, VF);
4492       assert(WideningDecision != CM_Unknown &&
4493              "Widening decision should be ready at this moment");
4494       return WideningDecision == CM_Scalarize;
4495     }
4496     return isa<LoadInst>(I) ?
4497         !(isLegalMaskedLoad(Ty, Ptr)  || isLegalMaskedGather(Ty))
4498       : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4499   }
4500   case Instruction::UDiv:
4501   case Instruction::SDiv:
4502   case Instruction::SRem:
4503   case Instruction::URem:
4504     return mayDivideByZero(*I);
4505   }
4506   return false;
4507 }
4508 
4509 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4510                                                                unsigned VF) {
4511   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4512   assert(getWideningDecision(I, VF) == CM_Unknown &&
4513          "Decision should not be set yet.");
4514   auto *Group = getInterleavedAccessGroup(I);
4515   assert(Group && "Must have a group.");
4516 
4517   // If the instruction's allocated size doesn't equal it's type size, it
4518   // requires padding and will be scalarized.
4519   auto &DL = I->getModule()->getDataLayout();
4520   auto *ScalarTy = getMemInstValueType(I);
4521   if (hasIrregularType(ScalarTy, DL, VF))
4522     return false;
4523 
4524   // Check if masking is required.
4525   // A Group may need masking for one of two reasons: it resides in a block that
4526   // needs predication, or it was decided to use masking to deal with gaps.
4527   bool PredicatedAccessRequiresMasking =
4528       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4529   bool AccessWithGapsRequiresMasking =
4530       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4531   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4532     return true;
4533 
4534   // If masked interleaving is required, we expect that the user/target had
4535   // enabled it, because otherwise it either wouldn't have been created or
4536   // it should have been invalidated by the CostModel.
4537   assert(useMaskedInterleavedAccesses(TTI) &&
4538          "Masked interleave-groups for predicated accesses are not enabled.");
4539 
4540   auto *Ty = getMemInstValueType(I);
4541   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4542                           : TTI.isLegalMaskedStore(Ty);
4543 }
4544 
4545 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4546                                                                unsigned VF) {
4547   // Get and ensure we have a valid memory instruction.
4548   LoadInst *LI = dyn_cast<LoadInst>(I);
4549   StoreInst *SI = dyn_cast<StoreInst>(I);
4550   assert((LI || SI) && "Invalid memory instruction");
4551 
4552   auto *Ptr = getLoadStorePointerOperand(I);
4553 
4554   // In order to be widened, the pointer should be consecutive, first of all.
4555   if (!Legal->isConsecutivePtr(Ptr))
4556     return false;
4557 
4558   // If the instruction is a store located in a predicated block, it will be
4559   // scalarized.
4560   if (isScalarWithPredication(I))
4561     return false;
4562 
4563   // If the instruction's allocated size doesn't equal it's type size, it
4564   // requires padding and will be scalarized.
4565   auto &DL = I->getModule()->getDataLayout();
4566   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4567   if (hasIrregularType(ScalarTy, DL, VF))
4568     return false;
4569 
4570   return true;
4571 }
4572 
// Populate Uniforms[VF] with the instructions that produce the same value in
// every vector lane and therefore need only one scalar copy per unrolled
// part. Seeds with the latch branch condition and consecutive-like pointers,
// then expands backwards through operands and finally handles induction
// variable cycles.
void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
  // not analyze again.  Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
    Worklist.insert(Cmp);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
  }

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
  // vectorization. The pointer operands of interleaved accesses are an
  // example.
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

  // Holds pointer operands of instructions that are possibly non-uniform.
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

  // Returns true if the memory access will be executed as a non-scalarized
  // wide or interleaved operation, keeping its pointer operand uniform.
  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };
  // Iterate over the instructions in the loop, and collect all
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
  // that a consecutive-like pointer operand will be scalarized, we collect it
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
  // getelementptr instruction can be used by both vectorized and scalarized
  // memory instructions. For example, if a loop loads and stores from the same
  // location, but the store is conditional, the store will be scalarized, and
  // the getelementptr won't remain uniform.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (!Ptr)
        continue;

      // True if all users of Ptr are memory accesses that have Ptr as their
      // pointer operand.
      auto UsersAreMemAccesses =
          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
            return getLoadStorePointerOperand(U) == Ptr;
          });

      // Ensure the memory instruction will not be scalarized or used by
      // gather/scatter, making its pointer operand non-uniform. If the pointer
      // operand is used by any instruction other than a memory access, we
      // conservatively assume the pointer operand may be non-uniform.
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
        PossibleNonUniformPtrs.insert(Ptr);

      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like, or interleaving - the pointer operand should
      // remain uniform.
      else
        ConsecutiveLikePtrs.insert(Ptr);
    }

  // Add to the Worklist all consecutive and consecutive-like pointers that
  // aren't also identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
      Worklist.insert(V);
    }

  // Expand Worklist in topological order: whenever a new instruction
  // is added , its users should be already inside Worklist.  It ensures
  // a uniform instruction will only be used by uniform instructions.
  // Note: Worklist grows during iteration; the bound is re-read each pass.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFirstOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) ||
                   (OI == getLoadStorePointerOperand(J) &&
                    isUniformDecision(J, VF));
          })) {
        Worklist.insert(OI);
        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
      }
    }
  }

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : *Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
                      << "\n");
  }

  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
4749 
4750 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4751   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4752 
4753   if (Legal->getRuntimePointerChecking()->Need) {
4754     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4755         "runtime pointer checks needed. Enable vectorization of this "
4756         "loop with '#pragma clang loop vectorize(enable)' when "
4757         "compiling with -Os/-Oz",
4758         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4759     return true;
4760   }
4761 
4762   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4763     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4764         "runtime SCEV checks needed. Enable vectorization of this "
4765         "loop with '#pragma clang loop vectorize(enable)' when "
4766         "compiling with -Os/-Oz",
4767         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4768     return true;
4769   }
4770 
4771   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4772   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4773     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4774         "runtime stride == 1 checks needed. Enable vectorization of "
4775         "this loop with '#pragma clang loop vectorize(enable)' when "
4776         "compiling with -Os/-Oz",
4777         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4778     return true;
4779   }
4780 
4781   return false;
4782 }
4783 
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
  // Compute the maximum vectorization factor for this loop, or None when the
  // loop should not be vectorized under the current scalar-epilogue policy.
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to emit the runtime pointer checks anyway, since
    // the comparison is still likely to be dynamically uniform if the target
    // can skip the checked region cheaply.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  // TC is zero when the trip count is not a known small constant.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    // A scalar epilogue can absorb any remainder iterations, so no further
    // restrictions apply.
    return computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededPredicatePragma:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // A scalar epilogue is not allowed; try to fold the tail into the vector
  // body by masking instead.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI))
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();

  unsigned MaxVF = computeFeasibleMaxVF(TC);
  if (TC > 0 && TC % MaxVF == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->canFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  // Tail folding failed. Report the most specific reason we can: with an
  // unknown trip count we cannot even prove a tail exists.
  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}
4869 
unsigned
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
  // Compute the widest VF that is both safe (w.r.t. memory dependences) and
  // fits the target's vector registers. Note: this also computes and caches
  // MinBWs as a side effect.
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

  // Never use more bits per vector than the dependence distance allows.
  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

  // Number of lanes of the widest type that fit in the (clamped) register.
  unsigned MaxVectorSize = WidestRegister / WidestType;

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << WidestRegister << " bits.\n");

  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
                                 " into one vector!");
  if (MaxVectorSize == 0) {
    // The widest type does not fit in a register at all; treat the target as
    // scalar-only.
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
    return MaxVectorSize;
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
             isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    MaxVectorSize = ConstTripCount;
    return MaxVectorSize;
  }

  unsigned MaxVF = MaxVectorSize;
  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorSize).
    SmallVector<unsigned, 8> VFs;
    // Sizing by the smallest type packs the most lanes per register.
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
    for (int i = RUs.size() - 1; i >= 0; --i) {
      if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
        MaxVF = VFs[i];
        break;
      }
    }
    // The target may impose a floor on the VF (e.g. to reach its preferred
    // bandwidth); honor it even if the register-usage scan chose less.
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}
4940 
VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
  // Pick the most profitable VF in [1, MaxVF] by comparing per-lane costs:
  // the cost of one vector iteration divided by its width approximates the
  // cost per scalar iteration it replaces.
  float Cost = expectedCost(1).first;
  const float ScalarCost = Cost;
  unsigned Width = 1;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    Cost = std::numeric_limits<float>::max();
  }

  // Only power-of-two widths are considered.
  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(i);
    float VectorCost = C.first / (float)i;
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (int)VectorCost << ".\n");
    // C.second indicates whether any instruction would actually be vectorized
    // at this width; without that, a "vector" loop is pointless unless forced.
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    // Conditional-store vectorization has been disabled; fall back to scalar.
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    Width = 1;
    Cost = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  // Return the chosen width together with its total (un-normalized) cost.
  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
  return Factor;
}
4991 
4992 std::pair<unsigned, unsigned>
4993 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4994   unsigned MinWidth = -1U;
4995   unsigned MaxWidth = 8;
4996   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4997 
4998   // For each block.
4999   for (BasicBlock *BB : TheLoop->blocks()) {
5000     // For each instruction in the loop.
5001     for (Instruction &I : BB->instructionsWithoutDebug()) {
5002       Type *T = I.getType();
5003 
5004       // Skip ignored values.
5005       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5006         continue;
5007 
5008       // Only examine Loads, Stores and PHINodes.
5009       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5010         continue;
5011 
5012       // Examine PHI nodes that are reduction variables. Update the type to
5013       // account for the recurrence type.
5014       if (auto *PN = dyn_cast<PHINode>(&I)) {
5015         if (!Legal->isReductionVariable(PN))
5016           continue;
5017         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5018         T = RdxDesc.getRecurrenceType();
5019       }
5020 
5021       // Examine the stored values.
5022       if (auto *ST = dyn_cast<StoreInst>(&I))
5023         T = ST->getValueOperand()->getType();
5024 
5025       // Ignore loaded pointer types and stored pointer types that are not
5026       // vectorizable.
5027       //
5028       // FIXME: The check here attempts to predict whether a load or store will
5029       //        be vectorized. We only know this for certain after a VF has
5030       //        been selected. Here, we assume that if an access can be
5031       //        vectorized, it will be. We should also look at extending this
5032       //        optimization to non-pointer types.
5033       //
5034       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5035           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5036         continue;
5037 
5038       MinWidth = std::min(MinWidth,
5039                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5040       MaxWidth = std::max(MaxWidth,
5041                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5042     }
5043   }
5044 
5045   return {MinWidth, MaxWidth};
5046 }
5047 
unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // When the tail is folded (no scalar epilogue), interleaving is disabled.
  if (!isScalarEpilogueAllowed())
    return 1;

  // A finite max safe dependence distance was already used to clamp the VF
  // (see computeFeasibleMaxVF); do not additionally interleave such loops.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  // Do not interleave loops with a relatively small trip count.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
    return 1;

  unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
  LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                    << " registers\n");

  // Command-line overrides for the register count, split by scalar/vector.
  if (VF == 1) {
    if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumScalarRegs;
  } else {
    if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
      TargetNumRegisters = ForceTargetNumVectorRegs;
  }

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                              R.MaxLocalUsers);

  // Don't count the induction variable as interleaved.
  if (EnableIndVarRegisterHeur)
    IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
                       std::max(1U, (R.MaxLocalUsers - 1)));

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF == 1) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF).first;

  assert(LoopCost && "Non-zero loop cost expected");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else if (IC < 1)
    IC = 1;

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF > 1 && !Legal->getReductionVars()->empty()) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // Note that if we've already vectorized the loop we will have done the
  // runtime check and so interleaving won't require further checks.
  bool InterleavingRequiresRuntimePointerCheck =
      (VF == 1 && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default to 2, so the
    // critical path only gets increased by one reduction operation.
    if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  bool HasReductions = !Legal->getReductionVars()->empty();
  if (TTI.enableAggressiveInterleaving(HasReductions)) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
5200 
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(&I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points. Note that IdxToInstr.size() is one
        // past the index of the current user (I was already pushed), so an
        // operand's interval is still open while its last user is scanned.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;

  // Get the size of the widest register. As in computeFeasibleMaxVF, the
  // maximum safe dependence distance (in bits) caps the usable width.
  unsigned MaxSafeDepDist = -1U;
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
  unsigned WidestRegister =
      std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();

  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  // A lambda that gets the register usage for the given type and VF:
  // the number of registers a VF-wide vector of Ty occupies, at least one.
  auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
    // Token values never occupy a register.
    if (Ty->isTokenTy())
      return 0U;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
    return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
  };

  // Linear scan: close intervals ending here, sample the open-interval set
  // for every requested VF, then open the current instruction's interval.
  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(ToRemove);

    // Ignore instructions that are never used within the loop.
    if (Ends.find(I) == Ends.end())
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
      continue;

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      if (VFs[j] == 1) {
        // Scalar VF: each open interval costs exactly one register.
        MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
        continue;
      }
      collectUniformsAndScalars(VFs[j]);
      // Count the number of live intervals.
      unsigned RegUsage = 0;
      for (auto Inst : OpenIntervals) {
        // Skip ignored values for VF > 1.
        if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
            isScalarAfterVectorization(Inst, VFs[j]))
          continue;
        RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
      }
      MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  // Fill in the per-VF results, adding the loop-invariant register count.
  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    unsigned Invariant = 0;
    if (VFs[i] == 1)
      Invariant = LoopInvariants.size();
    else {
      for (auto Inst : LoopInvariants)
        Invariant += GetRegUsage(Inst->getType(), VFs[i]);
    }

    LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
    LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
    LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
                      << '\n');

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}
5359 
5360 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5361   // TODO: Cost model for emulated masked load/store is completely
5362   // broken. This hack guides the cost model to use an artificially
5363   // high enough value to practically disable vectorization with such
5364   // operations, except where previously deployed legality hack allowed
5365   // using very low cost values. This is to avoid regressions coming simply
5366   // from moving "masked load/store" check from legality to cost model.
5367   // Masked Load/Gather emulation was previously never allowed.
5368   // Limited number of Masked Store/Scatter emulation was allowed.
5369   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5370   return isa<LoadInst>(I) ||
5371          (isa<StoreInst>(I) &&
5372           NumPredStores > NumberOfStoresToPredicate);
5373 }
5374 
5375 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5376   // If we aren't vectorizing the loop, or if we've already collected the
5377   // instructions to scalarize, there's nothing to do. Collection may already
5378   // have occurred if we have a user-selected VF and are now computing the
5379   // expected cost for interleaving.
5380   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5381     return;
5382 
5383   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5384   // not profitable to scalarize any instructions, the presence of VF in the
5385   // map will indicate that we've analyzed it already.
5386   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5387 
5388   // Find all the instructions that are scalar with predication in the loop and
5389   // determine if it would be better to not if-convert the blocks they are in.
5390   // If so, we also record the instructions to scalarize.
5391   for (BasicBlock *BB : TheLoop->blocks()) {
5392     if (!blockNeedsPredication(BB))
5393       continue;
5394     for (Instruction &I : *BB)
5395       if (isScalarWithPredication(&I)) {
5396         ScalarCostsTy ScalarCosts;
5397         // Do not apply discount logic if hacked cost is needed
5398         // for emulated masked memrefs.
5399         if (!useEmulatedMaskMemRefHack(&I) &&
5400             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5401           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5402         // Remember that BB will remain after vectorization.
5403         PredicatedBBsAfterVectorization.insert(BB);
5404       }
5405   }
5406 }
5407 
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    unsigned VF) {
  // Estimate the cost saving (positive) or penalty (negative) of scalarizing
  // PredInst -- together with the single-use chain of in-block operands
  // feeding it -- instead of vectorizing at width VF. The per-instruction
  // scalar costs of everything visited are recorded in ScalarCosts.
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    unsigned VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    unsigned ScalarCost = VF * getInstructionCost(I, 1).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes (the scalarized values must be merged back into a vector
    // and flow through a phi since the block stays predicated).
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                                 true, false);
      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF))
          ScalarCost += TTI.getScalarizationOverhead(
                              ToVectorTy(J->getType(),VF), false, true);
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
5513 
5514 LoopVectorizationCostModel::VectorizationCostTy
5515 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5516   VectorizationCostTy Cost;
5517 
5518   // For each block.
5519   for (BasicBlock *BB : TheLoop->blocks()) {
5520     VectorizationCostTy BlockCost;
5521 
5522     // For each instruction in the old loop.
5523     for (Instruction &I : BB->instructionsWithoutDebug()) {
5524       // Skip ignored values.
5525       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5526           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5527         continue;
5528 
5529       VectorizationCostTy C = getInstructionCost(&I, VF);
5530 
5531       // Check if we should override the cost.
5532       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5533         C.first = ForceTargetInstructionCost;
5534 
5535       BlockCost.first += C.first;
5536       BlockCost.second |= C.second;
5537       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5538                         << " for VF " << VF << " For instruction: " << I
5539                         << '\n');
5540     }
5541 
5542     // If we are vectorizing a predicated block, it will have been
5543     // if-converted. This means that the block's instructions (aside from
5544     // stores and instructions that may divide by zero) will now be
5545     // unconditionally executed. For the scalar case, we may not always execute
5546     // the predicated block. Thus, scale the block's cost by the probability of
5547     // executing it.
5548     if (VF == 1 && blockNeedsPredication(BB))
5549       BlockCost.first /= getReciprocalPredBlockProb();
5550 
5551     Cost.first += BlockCost.first;
5552     Cost.second |= BlockCost.second;
5553   }
5554 
5555   return Cost;
5556 }
5557 
5558 /// Gets Address Access SCEV after verifying that the access pattern
5559 /// is loop invariant except the induction variable dependence.
5560 ///
5561 /// This SCEV can be sent to the Target in order to estimate the address
5562 /// calculation cost.
5563 static const SCEV *getAddressAccessSCEV(
5564               Value *Ptr,
5565               LoopVectorizationLegality *Legal,
5566               PredicatedScalarEvolution &PSE,
5567               const Loop *TheLoop) {
5568 
5569   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5570   if (!Gep)
5571     return nullptr;
5572 
5573   // We are looking for a gep with all loop invariant indices except for one
5574   // which should be an induction variable.
5575   auto SE = PSE.getSE();
5576   unsigned NumOperands = Gep->getNumOperands();
5577   for (unsigned i = 1; i < NumOperands; ++i) {
5578     Value *Opd = Gep->getOperand(i);
5579     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5580         !Legal->isInductionVariable(Opd))
5581       return nullptr;
5582   }
5583 
5584   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5585   return PSE.getSCEV(Ptr);
5586 }
5587 
5588 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5589   return Legal->hasStride(I->getOperand(0)) ||
5590          Legal->hasStride(I->getOperand(1));
5591 }
5592 
// Returns the cost of executing the memory instruction I as VF scalar
// operations (address computation, the scalar load/store itself, and the
// insert/extract overhead of moving values between vector and scalar form),
// scaled by block probability if the instruction is predicated.
unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                                 unsigned VF) {
  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
  Type *ValTy = getMemInstValueType(I);
  auto SE = PSE.getSE();

  unsigned Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(I);
  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  Cost += VF *
          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
                              AS);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated store, it may not be executed for each vector
  // lane. Scale the cost by the probability of executing the predicated
  // block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    if (useEmulatedMaskMemRefHack(I))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
5635 
5636 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5637                                                              unsigned VF) {
5638   Type *ValTy = getMemInstValueType(I);
5639   Type *VectorTy = ToVectorTy(ValTy, VF);
5640   unsigned Alignment = getLoadStoreAlignment(I);
5641   Value *Ptr = getLoadStorePointerOperand(I);
5642   unsigned AS = getLoadStoreAddressSpace(I);
5643   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5644 
5645   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5646          "Stride should be 1 or -1 for consecutive memory access");
5647   unsigned Cost = 0;
5648   if (Legal->isMaskRequired(I))
5649     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5650   else
5651     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5652 
5653   bool Reverse = ConsecutiveStride < 0;
5654   if (Reverse)
5655     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5656   return Cost;
5657 }
5658 
5659 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5660                                                          unsigned VF) {
5661   Type *ValTy = getMemInstValueType(I);
5662   Type *VectorTy = ToVectorTy(ValTy, VF);
5663   unsigned Alignment = getLoadStoreAlignment(I);
5664   unsigned AS = getLoadStoreAddressSpace(I);
5665   if (isa<LoadInst>(I)) {
5666     return TTI.getAddressComputationCost(ValTy) +
5667            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5668            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5669   }
5670   StoreInst *SI = cast<StoreInst>(I);
5671 
5672   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5673   return TTI.getAddressComputationCost(ValTy) +
5674          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5675          (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5676                                                Instruction::ExtractElement,
5677                                                VectorTy, VF - 1));
5678 }
5679 
5680 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5681                                                           unsigned VF) {
5682   Type *ValTy = getMemInstValueType(I);
5683   Type *VectorTy = ToVectorTy(ValTy, VF);
5684   unsigned Alignment = getLoadStoreAlignment(I);
5685   Value *Ptr = getLoadStorePointerOperand(I);
5686 
5687   return TTI.getAddressComputationCost(VectorTy) +
5688          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5689                                     Legal->isMaskRequired(I), Alignment);
5690 }
5691 
// Returns the cost of accessing I's entire interleave group as one wide
// memory operation followed by shuffles that (de)interleave the members.
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                            unsigned VF) {
  Type *ValTy = getMemInstValueType(I);
  Type *VectorTy = ToVectorTy(ValTy, VF);
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  // The wide vector spans all members of the group: VF lanes per member.
  unsigned InterleaveFactor = Group->getFactor();
  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group. Gaps must be masked
  // off when the group needs a scalar epilogue but one is not allowed.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  unsigned Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
      Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    // Each member needs its own reverse shuffle after deinterleaving.
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  }
  return Cost;
}
5729 
5730 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5731                                                               unsigned VF) {
5732   // Calculate scalar cost only. Vectorization cost should be ready at this
5733   // moment.
5734   if (VF == 1) {
5735     Type *ValTy = getMemInstValueType(I);
5736     unsigned Alignment = getLoadStoreAlignment(I);
5737     unsigned AS = getLoadStoreAddressSpace(I);
5738 
5739     return TTI.getAddressComputationCost(ValTy) +
5740            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5741   }
5742   return getWideningCost(I, VF);
5743 }
5744 
5745 LoopVectorizationCostModel::VectorizationCostTy
5746 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5747   // If we know that this instruction will remain uniform, check the cost of
5748   // the scalar version.
5749   if (isUniformAfterVectorization(I, VF))
5750     VF = 1;
5751 
5752   if (VF > 1 && isProfitableToScalarize(I, VF))
5753     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5754 
5755   // Forced scalars do not have any scalarization overhead.
5756   auto ForcedScalar = ForcedScalars.find(VF);
5757   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5758     auto InstSet = ForcedScalar->second;
5759     if (InstSet.find(I) != InstSet.end())
5760       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5761   }
5762 
5763   Type *VectorTy;
5764   unsigned C = getInstructionCost(I, VF, VectorTy);
5765 
5766   bool TypeNotScalarized =
5767       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5768   return VectorizationCostTy(C, TypeNotScalarized);
5769 }
5770 
5771 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5772                                                               unsigned VF) {
5773 
5774   if (VF == 1)
5775     return 0;
5776 
5777   unsigned Cost = 0;
5778   Type *RetTy = ToVectorTy(I->getType(), VF);
5779   if (!RetTy->isVoidTy() &&
5780       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5781     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5782 
5783   // Some targets keep addresses scalar.
5784   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5785     return Cost;
5786 
5787   // Some targets support efficient element stores.
5788   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5789     return Cost;
5790 
5791   // Collect operands to consider.
5792   CallInst *CI = dyn_cast<CallInst>(I);
5793   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5794 
5795   // Skip operands that do not require extraction/scalarization and do not incur
5796   // any overhead.
5797   return Cost + TTI.getOperandsScalarizationOverhead(
5798                     filterExtractingOperands(Ops, VF), VF);
5799 }
5800 
// For every memory instruction in the loop, decide how it should be widened
// at the given VF (widen, widen-reverse, interleave, gather/scatter, or
// scalarize) based on the cheapest computed cost, and record that decision.
// Afterwards, if the target prefers scalar addressing, force instructions
// feeding addresses to remain scalar.
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
  if (VF == 1)
    return;
  // Reset the predicated-store count for this VF before re-scanning.
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Only memory instructions (those with a pointer operand) are decided
      // here.
      Value *Ptr =  getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniform(Ptr) &&
          // Conditional loads and stores should be scalarized and predicated.
          // isScalarWithPredication cannot be used here since masked
          // gather/scatters are not considered scalar with predication.
          !Legal->blockNeedsPredication(I.getParent())) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
               Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      unsigned GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      unsigned Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  // Force every address-feeding instruction to stay scalar.
  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(I, VF, CM_Scalarize,
                            (VF * getMemoryInstructionCost(I, 1)));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(Member, VF, CM_Scalarize,
                                (VF * getMemoryInstructionCost(Member, 1)));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
5952 
// Computes the cost of instruction I at the given VF and reports (via
// VectorTy) the vector type the cost was computed for. Dispatches on the
// instruction's opcode to the appropriate TTI cost hook.
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        unsigned VF,
                                                        Type *&VectorTy) {
  // If the value can be represented in fewer bits, cost it at the narrowed
  // type.
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF > 1 && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
             PredicatedBBsAfterVectorization.end() ||
         PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
             PredicatedBBsAfterVectorization.end()))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      Type *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
              (TTI.getCFInstrCost(Instruction::Br) * VF));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br);
    else
      // This branch will be eliminated by if-conversion.
      return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                VectorTy, VF - 1, VectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));

    return TTI.getCFInstrCost(Instruction::PHI);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF > 1 && isScalarWithPredication(I)) {
      unsigned Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);

      // The cost of the non-predicated instruction.
      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    // If the instruction stays scalar, VF scalar copies are emitted.
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
  }
  case Instruction::FNeg: {
    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
                   I->getOperand(0));
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    // A loop-invariant condition stays scalar; otherwise it is widened too.
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Compare cost is based on the operand type, narrowed if possible.
    Type *ValTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    unsigned Width = VF;
    if (Width > 1) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      // A scalarized access is costed at the scalar type.
      if (Decision == CM_Scalarize)
        Width = 1;
    }
    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), Trunc);
    }

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (I->getOpcode() == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (I->getOpcode() == Instruction::ZExt ||
                 I->getOpcode() == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
    return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
  }
  case Instruction::Call: {
    // Use the cheaper of a widened call and a vector intrinsic, if available.
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
    if (getVectorIntrinsicIDForCall(CI, TLI))
      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
    return CallCost;
  }
  default:
    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
           getScalarizationOverhead(I, VF);
  } // end of switch.
}
6182 
// Pass identification, used by the legacy pass manager (address of ID serves
// as a unique type identifier for this pass).
char LoopVectorize::ID = 0;

// Human-readable pass name shown in pass-manager debug output.
static const char lv_name[] = "Loop Vectorization";

// Register the legacy pass and declare the analyses it depends on so the
// legacy pass manager schedules them before running the vectorizer.
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6202 
6203 namespace llvm {
6204 
6205 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6206 
6207 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6208                               bool VectorizeOnlyWhenForced) {
6209   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6210 }
6211 
6212 } // end namespace llvm
6213 
6214 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6215   // Check if the pointer operand of a load or store instruction is
6216   // consecutive.
6217   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6218     return Legal->isConsecutivePtr(Ptr);
6219   return false;
6220 }
6221 
6222 void LoopVectorizationCostModel::collectValuesToIgnore() {
6223   // Ignore ephemeral values.
6224   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6225 
6226   // Ignore type-promoting instructions we identified during reduction
6227   // detection.
6228   for (auto &Reduction : *Legal->getReductionVars()) {
6229     RecurrenceDescriptor &RedDes = Reduction.second;
6230     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6231     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6232   }
6233   // Ignore type-casting instructions we identified during induction
6234   // detection.
6235   for (auto &Induction : *Legal->getInductionVars()) {
6236     InductionDescriptor &IndDes = Induction.second;
6237     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6238     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6239   }
6240 }
6241 
6242 // TODO: we could return a pair of values that specify the max VF and
6243 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6244 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6245 // doesn't have a cost model that can choose which plan to execute if
6246 // more than one is generated.
6247 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6248                                  LoopVectorizationCostModel &CM) {
6249   unsigned WidestType;
6250   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6251   return WidestVectorRegBits / WidestType;
6252 }
6253 
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
  unsigned VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->empty()) {
    // OrigLoop has subloops, i.e. it is an outer loop.
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (!UserVF) {
      // Derive a VF from the widest vector register and the loop's widest
      // scalar type.
      VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && VF < 2) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = 4;
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
                      << " to build VPlans.\n");
    // Build a plan for this single VF only (range [VF, VF]).
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    // Second member (presumably the expected cost) is 0: this path has no
    // cost model yet.
    return {VF, 0};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
6293 
Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
  assert(OrigLoop->empty() && "Inner loop expected.");
  // A missing MaxVF means the cost model ruled out both vectorization and
  // interleaving entirely.
  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    CM.InterleaveInfo.reset();
  }

  if (UserVF) {
    // A user-specified VF is honored as-is; no VF search is performed.
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(UserVF);
    buildVPlansWithVPRecipes(UserVF, UserVF);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{UserVF, 0}};
  }

  unsigned MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF != 0 && "MaxVF is zero.");

  // Prime the cost model with per-VF analyses for every candidate power-of-2
  // VF before the plans are built.
  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF > 1)
      CM.collectInstsToScalarize(VF);
  }

  buildVPlansWithVPRecipes(1, MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
  if (MaxVF == 1)
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  return CM.selectVectorizationFactor(MaxVF);
}
6342 
6343 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6344   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6345                     << '\n');
6346   BestVF = VF;
6347   BestUF = UF;
6348 
6349   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6350     return !Plan->hasVF(VF);
6351   });
6352   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6353 }
6354 
void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPCallbackILV CallbackILV(ILV);

  // State carries everything VPlan execution needs: the chosen VF/UF,
  // analyses, the ILV's builder and value map, and the callback object.
  // NOTE: this is aggregate (brace) initialization — member order matters.
  VPTransformState State{BestVF, BestUF,      LI,
                         DT,     ILV.Builder, ILV.VectorLoopValueMap,
                         &ILV,   CallbackILV};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  // setBestPlan() is expected to have pruned VPlans to the single best plan.
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop();
}
6384 
6385 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6386     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6387   BasicBlock *Latch = OrigLoop->getLoopLatch();
6388 
6389   // We create new control-flow for the vectorized loop, so the original
6390   // condition will be dead after vectorization if it's only used by the
6391   // branch.
6392   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6393   if (Cmp && Cmp->hasOneUse())
6394     DeadInstructions.insert(Cmp);
6395 
6396   // We create new "steps" for induction variable updates to which the original
6397   // induction variables map. An original update instruction will be dead if
6398   // all its users except the induction variable are dead.
6399   for (auto &Induction : *Legal->getInductionVars()) {
6400     PHINode *Ind = Induction.first;
6401     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6402     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6403           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6404                                  DeadInstructions.end();
6405         }))
6406       DeadInstructions.insert(IndUpdate);
6407 
6408     // We record as "Dead" also the type-casting instructions we had identified
6409     // during induction analysis. We don't need any handling for them in the
6410     // vectorized loop because we have proven that, under a proper runtime
6411     // test guarding the vectorized loop, the value of the phi, and the casted
6412     // value of the phi, are the same. The last instruction in this casting chain
6413     // will get its scalar/vector/widened def from the scalar/vector/widened def
6414     // of the respective phi node. Any other casts in the induction def-use chain
6415     // have no other uses outside the phi update chain, and will be ignored.
6416     InductionDescriptor &IndDes = Induction.second;
6417     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6418     DeadInstructions.insert(Casts.begin(), Casts.end());
6419   }
6420 }
6421 
6422 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6423 
6424 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6425 
6426 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6427                                         Instruction::BinaryOps BinOp) {
6428   // When unrolling and the VF is 1, we only need to add a simple scalar.
6429   Type *Ty = Val->getType();
6430   assert(!Ty->isVectorTy() && "Val must be a scalar");
6431 
6432   if (Ty->isFloatingPointTy()) {
6433     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6434 
6435     // Floating point operations had to be 'fast' to enable the unrolling.
6436     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6437     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6438   }
6439   Constant *C = ConstantInt::get(Ty, StartIdx);
6440   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6441 }
6442 
6443 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6444   SmallVector<Metadata *, 4> MDs;
6445   // Reserve first location for self reference to the LoopID metadata node.
6446   MDs.push_back(nullptr);
6447   bool IsUnrollMetadata = false;
6448   MDNode *LoopID = L->getLoopID();
6449   if (LoopID) {
6450     // First find existing loop unrolling disable metadata.
6451     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6452       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6453       if (MD) {
6454         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6455         IsUnrollMetadata =
6456             S && S->getString().startswith("llvm.loop.unroll.disable");
6457       }
6458       MDs.push_back(LoopID->getOperand(i));
6459     }
6460   }
6461 
6462   if (!IsUnrollMetadata) {
6463     // Add runtime unroll disable metadata.
6464     LLVMContext &Context = L->getHeader()->getContext();
6465     SmallVector<Metadata *, 1> DisableOperands;
6466     DisableOperands.push_back(
6467         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6468     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6469     MDs.push_back(DisableNode);
6470     MDNode *NewLoopID = MDNode::get(Context, MDs);
6471     // Set operand 0 to refer to the loop id itself.
6472     NewLoopID->replaceOperandWith(0, NewLoopID);
6473     L->setLoopID(NewLoopID);
6474   }
6475 }
6476 
6477 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6478     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6479   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6480   bool PredicateAtRangeStart = Predicate(Range.Start);
6481 
6482   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6483     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6484       Range.End = TmpVF;
6485       break;
6486     }
6487 
6488   return PredicateAtRangeStart;
6489 }
6490 
6491 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6492 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6493 /// of VF's starting at a given VF and extending it as much as possible. Each
6494 /// vectorization decision can potentially shorten this sub-range during
6495 /// buildVPlan().
6496 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6497   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6498     VFRange SubRange = {VF, MaxVF + 1};
6499     VPlans.push_back(buildVPlan(SubRange));
6500     VF = SubRange.End;
6501   }
6502 }
6503 
6504 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6505                                          VPlanPtr &Plan) {
6506   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6507 
6508   // Look for cached value.
6509   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6510   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6511   if (ECEntryIt != EdgeMaskCache.end())
6512     return ECEntryIt->second;
6513 
6514   VPValue *SrcMask = createBlockInMask(Src, Plan);
6515 
6516   // The terminator has to be a branch inst!
6517   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6518   assert(BI && "Unexpected terminator found");
6519 
6520   if (!BI->isConditional())
6521     return EdgeMaskCache[Edge] = SrcMask;
6522 
6523   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6524   assert(EdgeMask && "No Edge Mask found for condition");
6525 
6526   if (BI->getSuccessor(0) != Dst)
6527     EdgeMask = Builder.createNot(EdgeMask);
6528 
6529   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6530     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6531 
6532   return EdgeMaskCache[Edge] = EdgeMask;
6533 }
6534 
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  // Returns the mask under which BB executes, where nullptr denotes an
  // all-one (unconditional) mask. Results are memoized in BlockMaskCache.
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}
6575 
6576 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6577                                                            VFRange &Range,
6578                                                            VPlanPtr &Plan) {
6579   const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6580   if (!IG)
6581     return nullptr;
6582 
6583   // Now check if IG is relevant for VF's in the given range.
6584   auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6585     return [=](unsigned VF) -> bool {
6586       return (VF >= 2 && // Query is illegal for VF == 1
6587               CM.getWideningDecision(I, VF) ==
6588                   LoopVectorizationCostModel::CM_Interleave);
6589     };
6590   };
6591   if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6592     return nullptr;
6593 
6594   // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
6595   // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
6596   // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
6597   assert(I == IG->getInsertPos() &&
6598          "Generating a recipe for an adjunct member of an interleave group");
6599 
6600   VPValue *Mask = nullptr;
6601   if (Legal->isMaskRequired(I))
6602     Mask = createBlockInMask(I->getParent(), Plan);
6603 
6604   return new VPInterleaveRecipe(IG, Mask);
6605 }
6606 
VPWidenMemoryInstructionRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
                                  VPlanPtr &Plan) {
  // Build a widening recipe for a load/store the cost model decided to widen
  // (or gather/scatter); returns nullptr otherwise. Clamps Range so that all
  // VF's in it share the same decision.
  if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
    return nullptr;

  auto willWiden = [&](unsigned VF) -> bool {
    // VF 1 and scalarized/scalar-profitable accesses are not widened.
    if (VF == 1)
      return false;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
           "Interleave memory opportunity should be caught earlier.");
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  // A predicated block requires the access to be masked.
  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  return new VPWidenMemoryInstructionRecipe(*I, Mask);
}
6637 
VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
  // Build an induction-widening recipe for an int/FP induction phi, or for a
  // truncation of one; returns nullptr if I is neither.
  if (PHINode *Phi = dyn_cast<PHINode>(I)) {
    // Check if this is an integer or fp induction. If so, build the recipe that
    // produces its scalar and vector values.
    InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
    if (II.getKind() == InductionDescriptor::IK_IntInduction ||
        II.getKind() == InductionDescriptor::IK_FpInduction)
      return new VPWidenIntOrFpInductionRecipe(Phi);

    // Pointer inductions / non-induction phis are handled elsewhere.
    return nullptr;
  }

  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(unsigned)> {
    return
        [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
  };

  // Range is clamped so all its VF's agree on the optimizability decision.
  if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
                               isOptimizableIVTruncate(I), Range))
    return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
                                             cast<TruncInst>(I));
  return nullptr;
}
6670 
6671 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6672   PHINode *Phi = dyn_cast<PHINode>(I);
6673   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6674     return nullptr;
6675 
6676   // We know that all PHIs in non-header blocks are converted into selects, so
6677   // we don't have to worry about the insertion order and we can just use the
6678   // builder. At this point we generate the predication tree. There may be
6679   // duplications since this is a simple recursive scan, but future
6680   // optimizations will clean it up.
6681 
6682   SmallVector<VPValue *, 2> Masks;
6683   unsigned NumIncoming = Phi->getNumIncomingValues();
6684   for (unsigned In = 0; In < NumIncoming; In++) {
6685     VPValue *EdgeMask =
6686       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6687     assert((EdgeMask || NumIncoming == 1) &&
6688            "Multiple predecessors with one having a full mask");
6689     if (EdgeMask)
6690       Masks.push_back(EdgeMask);
6691   }
6692   return new VPBlendRecipe(Phi, Masks);
6693 }
6694 
bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
                                 VFRange &Range) {
  // Attempt to append a general VPWidenRecipe for I to VPBB; returns true on
  // success. Range is clamped so all its VF's share the same decisions.

  // Instructions requiring scalarization-with-predication cannot be widened
  // here; they are handled by handleReplication().
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  if (IsPredicated)
    return false;

  // Whitelist of opcodes the general widening recipe knows how to handle.
  auto IsVectorizableOpcode = [](unsigned Opcode) {
    switch (Opcode) {
    case Instruction::Add:
    case Instruction::And:
    case Instruction::AShr:
    case Instruction::BitCast:
    case Instruction::Br:
    case Instruction::Call:
    case Instruction::FAdd:
    case Instruction::FCmp:
    case Instruction::FDiv:
    case Instruction::FMul:
    case Instruction::FNeg:
    case Instruction::FPExt:
    case Instruction::FPToSI:
    case Instruction::FPToUI:
    case Instruction::FPTrunc:
    case Instruction::FRem:
    case Instruction::FSub:
    case Instruction::GetElementPtr:
    case Instruction::ICmp:
    case Instruction::IntToPtr:
    case Instruction::Load:
    case Instruction::LShr:
    case Instruction::Mul:
    case Instruction::Or:
    case Instruction::PHI:
    case Instruction::PtrToInt:
    case Instruction::SDiv:
    case Instruction::Select:
    case Instruction::SExt:
    case Instruction::Shl:
    case Instruction::SIToFP:
    case Instruction::SRem:
    case Instruction::Store:
    case Instruction::Sub:
    case Instruction::Trunc:
    case Instruction::UDiv:
    case Instruction::UIToFP:
    case Instruction::URem:
    case Instruction::Xor:
    case Instruction::ZExt:
      return true;
    }
    return false;
  };

  if (!IsVectorizableOpcode(I->getOpcode()))
    return false;

  // Certain intrinsics are handled elsewhere or dropped entirely during
  // vectorization, so never widen them here.
  if (CallInst *CI = dyn_cast<CallInst>(I)) {
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
      return false;
  }

  auto willWiden = [&](unsigned VF) -> bool {
    // Instructions the cost model scalarizes are not widened.
    if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                             CM.isProfitableToScalarize(I, VF)))
      return false;
    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag shows whether we use Intrinsic or a usual Call for vectorized
      // version of the instruction.
      // Is it beneficial to perform intrinsic call compared to lib call?
      bool NeedToScalarize;
      unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
      bool UseVectorIntrinsic =
          ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
      return UseVectorIntrinsic || !NeedToScalarize;
    }
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      assert(CM.getWideningDecision(I, VF) ==
                 LoopVectorizationCostModel::CM_Scalarize &&
             "Memory widening decisions should have been taken care by now");
      return false;
    }
    return true;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return false;

  // Success: widen this instruction. We optimize the common case where
  // consecutive instructions can be represented by a single recipe.
  if (!VPBB->empty()) {
    VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
    if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
      return true;
  }

  VPBB->appendRecipe(new VPWidenRecipe(I));
  return true;
}
6800 
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
    VPlanPtr &Plan) {
  // Emit a replicating (per-lane scalar) recipe for I, predicated if needed.
  // Returns the VPBasicBlock to continue appending recipes to: VPBB itself
  // for the unpredicated case, or a fresh successor block after the
  // replicate region otherwise.
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);

  auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (auto &Op : I->operands())
    if (auto *PredInst = dyn_cast<Instruction>(Op))
      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
        PredInst2Recipe[PredInst]->setAlsoPack(false);

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  PredInst2Recipe[I] = Recipe;
  // Wrap the recipe in an if-then region and splice it after VPBB, followed
  // by a new empty block that becomes the insertion point for what follows.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}
6839 
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region:
  //   entry (branch-on-mask) -> if (PredRecipe) -> continue (optional phi)
  //   entry ------------------------------------> continue
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // A void instruction produces no value, so no phi is needed to merge the
  // predicated result back.
  auto *PHIRecipe =
      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}
6867 
6868 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6869                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
6870   VPRecipeBase *Recipe = nullptr;
6871   // Check if Instr should belong to an interleave memory recipe, or already
6872   // does. In the latter case Instr is irrelevant.
6873   if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6874     VPBB->appendRecipe(Recipe);
6875     return true;
6876   }
6877 
6878   // Check if Instr is a memory operation that should be widened.
6879   if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6880     VPBB->appendRecipe(Recipe);
6881     return true;
6882   }
6883 
6884   // Check if Instr should form some PHI recipe.
6885   if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6886     VPBB->appendRecipe(Recipe);
6887     return true;
6888   }
6889   if ((Recipe = tryToBlend(Instr, Plan))) {
6890     VPBB->appendRecipe(Recipe);
6891     return true;
6892   }
6893   if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6894     VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6895     return true;
6896   }
6897 
6898   // Check if Instr is to be widened by a general VPWidenRecipe, after
6899   // having first checked for specific widening recipes that deal with
6900   // Interleave Groups, Inductions and Phi nodes.
6901   if (tryToWiden(Instr, VPBB, Range))
6902     return true;
6903 
6904   return false;
6905 }
6906 
6907 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6908                                                         unsigned MaxVF) {
6909   assert(OrigLoop->empty() && "Inner loop expected.");
6910 
6911   // Collect conditions feeding internal conditional branches; they need to be
6912   // represented in VPlan for it to model masking.
6913   SmallPtrSet<Value *, 1> NeedDef;
6914 
6915   auto *Latch = OrigLoop->getLoopLatch();
6916   for (BasicBlock *BB : OrigLoop->blocks()) {
6917     if (BB == Latch)
6918       continue;
6919     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6920     if (Branch && Branch->isConditional())
6921       NeedDef.insert(Branch->getCondition());
6922   }
6923 
6924   // If the tail is to be folded by masking, the primary induction variable
6925   // needs to be represented in VPlan for it to model early-exit masking.
6926   if (CM.foldTailByMasking())
6927     NeedDef.insert(Legal->getPrimaryInduction());
6928 
6929   // Collect instructions from the original loop that will become trivially dead
6930   // in the vectorized loop. We don't need to vectorize these instructions. For
6931   // example, original induction update instructions can become dead because we
6932   // separately emit induction "steps" when generating code for the new loop.
6933   // Similarly, we create a new latch condition when setting up the structure
6934   // of the new loop, so the old one can become dead.
6935   SmallPtrSet<Instruction *, 4> DeadInstructions;
6936   collectTriviallyDeadInstructions(DeadInstructions);
6937 
6938   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6939     VFRange SubRange = {VF, MaxVF + 1};
6940     VPlans.push_back(
6941         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6942     VF = SubRange.End;
6943   }
6944 }
6945 
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {
  // Build a single VPlan covering the sub-range of VFs [Range.Start,
  // Range.End), turning the original loop's instructions ("ingredients")
  // into recipes.

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  // SinkAfter maps an instruction to the instruction it must be moved after
  // (for first-order recurrences); SinkAfterInverse is its reverse mapping,
  // built up below so the sunk instruction is emitted at its insertion point.
  DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  DenseMap<Instruction *, Instruction *> SinkAfterInverse;

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  auto Plan = llvm::make_unique<VPlan>(VPBB);

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    std::vector<Instruction *> Ingredients;

    // Organize the ingredients to vectorize from current basic block in the
    // right order.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) ||
          DeadInstructions.find(Instr) != DeadInstructions.end())
        continue;

      // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
      // member of the IG, do not construct any Recipe for it.
      const InterleaveGroup<Instruction> *IG =
          CM.getInterleavedAccessGroup(Instr);
      if (IG && Instr != IG->getInsertPos() &&
          Range.Start >= 2 && // Query is illegal for VF == 1
          CM.getWideningDecision(Instr, Range.Start) ==
              LoopVectorizationCostModel::CM_Interleave) {
        // Even though the adjunct member itself gets no recipe, an
        // instruction queued to be sunk after it must still be emitted here.
        auto SinkCandidate = SinkAfterInverse.find(Instr);
        if (SinkCandidate != SinkAfterInverse.end())
          Ingredients.push_back(SinkCandidate->second);
        continue;
      }

      // Move instructions to handle first-order recurrences, step 1: avoid
      // handling this instruction until after we've handled the instruction it
      // should follow.
      auto SAIt = SinkAfter.find(Instr);
      if (SAIt != SinkAfter.end()) {
        LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
                          << *SAIt->second
                          << " to vectorize a 1st order recurrence.\n");
        SinkAfterInverse[SAIt->second] = Instr;
        continue;
      }

      Ingredients.push_back(Instr);

      // Move instructions to handle first-order recurrences, step 2: push the
      // instruction to be sunk at its insertion point.
      auto SAInvIt = SinkAfterInverse.find(Instr);
      if (SAInvIt != SinkAfterInverse.end())
        Ingredients.push_back(SAInvIt->second);
    }

    // Introduce each ingredient into VPlan.
    for (Instruction *Instr : Ingredients) {
      if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
        continue;

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        // Number additional blocks created for the same original block.
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one VPBB, reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // Register every VF the plan covers (Range.Start and each power of two
  // below Range.End) and name the plan after them.
  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  unsigned VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; VF < Range.End; VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}
7069 
7070 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7071   // Outer loop handling: They may require CFG and instruction level
7072   // transformations before even evaluating whether vectorization is profitable.
7073   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7074   // the vectorization pipeline.
7075   assert(!OrigLoop->empty());
7076   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7077 
7078   // Create new empty VPlan
7079   auto Plan = llvm::make_unique<VPlan>();
7080 
7081   // Build hierarchical CFG
7082   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7083   HCFGBuilder.buildHierarchicalCFG();
7084 
7085   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7086     Plan->addVF(VF);
7087 
7088   if (EnableVPlanPredication) {
7089     VPlanPredicator VPP(*Plan);
7090     VPP.predicate();
7091 
7092     // Avoid running transformation to recipes until masked code generation in
7093     // VPlan-native path is in place.
7094     return Plan;
7095   }
7096 
7097   SmallPtrSet<Instruction *, 1> DeadInstructions;
7098   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7099       Plan, Legal->getInductionVars(), DeadInstructions);
7100 
7101   return Plan;
7102 }
7103 
7104 Value* LoopVectorizationPlanner::VPCallbackILV::
7105 getOrCreateVectorValues(Value *V, unsigned Part) {
7106       return ILV.getOrCreateVectorValue(V, Part);
7107 }
7108 
void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
  // Print the interleave-group header; the quoting and "\l" sequences look
  // like dot-graph escaping (left-justified line breaks) — consistent with
  // the other VPlan printing routines.
  O << " +\n"
    << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  // If the recipe is masked, print the mask (the User's operand 0, currently
  // its only operand).
  if (User) {
    O << ", ";
    User->getOperand(0)->printAsOperand(O);
  }
  O << "\\l\"";
  // Print each member of the group along with its index; gaps in the group
  // (null members) are skipped.
  for (unsigned i = 0; i < IG->getFactor(); ++i)
    if (Instruction *I = IG->getMember(i))
      O << " +\n"
        << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
}
7123 
7124 void VPWidenRecipe::execute(VPTransformState &State) {
7125   for (auto &Instr : make_range(Begin, End))
7126     State.ILV->widenInstruction(Instr);
7127 }
7128 
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  // Inductions are widened for the whole vector loop; they are never
  // generated per-instance during predicated replication.
  assert(!State.Instance && "Int or FP induction being replicated.");
  // Delegate widening of the induction IV (and its optional truncation
  // Trunc) to the InnerLoopVectorizer.
  State.ILV->widenIntOrFpInduction(IV, Trunc);
}
7133 
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  // Delegate widening of Phi to the ILV, passing the unroll factor (UF) and
  // vectorization factor (VF) chosen for this plan execution.
  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
}
7137 
void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(State.Builder, Phi);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = Phi->getNumIncomingValues();

  // A single incoming value needs no mask; with multiple incoming values the
  // recipe's VPUser supplies the per-edge masks used below.
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with predecessors having a full mask");
  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //      SELECT(Mask2, In2,
  //                   ( ...)))
  // Entry holds the partially-blended value so far, one slot per unroll part.
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 =
          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(User->getOperand(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  // Record the fully-blended result as the phi's vector value for each part.
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
}
7176 
7177 void VPInterleaveRecipe::execute(VPTransformState &State) {
7178   assert(!State.Instance && "Interleave group being replicated.");
7179   if (!User)
7180     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7181 
7182   // Last (and currently only) operand is a mask.
7183   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7184   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7185   for (unsigned Part = 0; Part < State.UF; ++Part)
7186     MaskValues[Part] = State.get(Mask, Part);
7187   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7188 }
7189 
void VPReplicateRecipe::execute(VPTransformState &State) {
  // Emit scalar copies of Ingredient: either one copy for the specific
  // (part, lane) instance being generated, or copies for all parts/lanes.
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF > 1) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
}
7214 
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  // Compute the scalar condition bit guarding this (part, lane) instance.
  Value *ConditionBit = nullptr;
  if (!User) // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();
  else {
    VPValue *BlockInMask = User->getOperand(0);
    ConditionBit = State.get(BlockInMask, Part);
    // If the mask is a vector, extract this instance's lane from it.
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  }

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  // BranchInst::Create needs a non-null first successor, so PrevBB serves as
  // a placeholder that is cleared immediately below.
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
7241 
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  // Find the scalar instance generated for PredInst, and the two blocks the
  // new phi must merge: the predicated block containing that instance, and
  // its single predecessor holding the branch-on-mask.
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    // The value is undefined along the edge on which the predicated block
    // was skipped.
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}
7272 
7273 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7274   if (!User)
7275     return State.ILV->vectorizeMemoryInstruction(&Instr);
7276 
7277   // Last (and currently only) operand is a mask.
7278   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7279   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7280   for (unsigned Part = 0; Part < State.UF; ++Part)
7281     MaskValues[Part] = State.get(Mask, Part);
7282   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7283 }
7284 
7285 static ScalarEpilogueLowering
7286 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7287                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
7288   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7289   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7290       (F->hasOptSize() ||
7291        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
7292     SEL = CM_ScalarEpilogueNotAllowedOptSize;
7293   else if (Hints.getPredicate())
7294     SEL = CM_ScalarEpilogueNotNeededPredicatePragma;
7295 
7296   return SEL;
7297 }
7298 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
  // Determine how the scalar epilogue should be lowered, based on hints and
  // size-optimization attributes/profile data.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  // The unroll (interleave) count is fixed to 1 in this path.
  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  // In assertion-enabled builds, verify the transformed function.
  LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
  return true;
}
7349 
7350 bool LoopVectorizePass::processLoop(Loop *L) {
7351   assert((EnableVPlanNativePath || L->empty()) &&
7352          "VPlan-native path is not enabled. Only process inner loops.");
7353 
7354 #ifndef NDEBUG
7355   const std::string DebugLocStr = getDebugLocString(L);
7356 #endif /* NDEBUG */
7357 
7358   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7359                     << L->getHeader()->getParent()->getName() << "\" from "
7360                     << DebugLocStr << "\n");
7361 
7362   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7363 
7364   LLVM_DEBUG(
7365       dbgs() << "LV: Loop hints:"
7366              << " force="
7367              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7368                      ? "disabled"
7369                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7370                             ? "enabled"
7371                             : "?"))
7372              << " width=" << Hints.getWidth()
7373              << " unroll=" << Hints.getInterleave() << "\n");
7374 
7375   // Function containing loop
7376   Function *F = L->getHeader()->getParent();
7377 
7378   // Looking at the diagnostic output is the only way to determine if a loop
7379   // was vectorized (other than looking at the IR or machine code), so it
7380   // is important to generate an optimization remark for each loop. Most of
7381   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7382   // generated as OptimizationRemark and OptimizationRemarkMissed are
7383   // less verbose reporting vectorized loops and unvectorized loops that may
7384   // benefit from vectorization, respectively.
7385 
7386   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7387     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7388     return false;
7389   }
7390 
7391   PredicatedScalarEvolution PSE(*SE, *L);
7392 
7393   // Check if it is legal to vectorize the loop.
7394   LoopVectorizationRequirements Requirements(*ORE);
7395   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7396                                 &Requirements, &Hints, DB, AC);
7397   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7398     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7399     Hints.emitRemarkWithHints();
7400     return false;
7401   }
7402 
7403   // Check the function attributes and profiles to find out if this function
7404   // should be optimized for size.
7405   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7406 
7407   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7408   // here. They may require CFG and instruction level transformations before
7409   // even evaluating whether vectorization is profitable. Since we cannot modify
7410   // the incoming IR, we need to build VPlan upfront in the vectorization
7411   // pipeline.
7412   if (!L->empty())
7413     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7414                                         ORE, BFI, PSI, Hints);
7415 
7416   assert(L->empty() && "Inner loop expected.");
7417   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7418   // count by optimizing for size, to minimize overheads.
7419   // Prefer constant trip counts over profile data, over upper bound estimate.
7420   unsigned ExpectedTC = 0;
7421   bool HasExpectedTC = false;
7422   if (const SCEVConstant *ConstExits =
7423       dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7424     const APInt &ExitsCount = ConstExits->getAPInt();
7425     // We are interested in small values for ExpectedTC. Skip over those that
7426     // can't fit an unsigned.
7427     if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7428       ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7429       HasExpectedTC = true;
7430     }
7431   }
7432   // ExpectedTC may be large because it's bound by a variable. Check
7433   // profiling information to validate we should vectorize.
7434   if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
7435     auto EstimatedTC = getLoopEstimatedTripCount(L);
7436     if (EstimatedTC) {
7437       ExpectedTC = *EstimatedTC;
7438       HasExpectedTC = true;
7439     }
7440   }
7441   if (!HasExpectedTC) {
7442     ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7443     HasExpectedTC = (ExpectedTC > 0);
7444   }
7445 
7446   if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7447     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7448                       << "This loop is worth vectorizing only if no scalar "
7449                       << "iteration overheads are incurred.");
7450     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7451       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7452     else {
7453       LLVM_DEBUG(dbgs() << "\n");
7454       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7455     }
7456   }
7457 
7458   // Check the function attributes to see if implicit floats are allowed.
7459   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7460   // an integer loop and the vector instructions selected are purely integer
7461   // vector instructions?
7462   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7463     reportVectorizationFailure(
7464         "Can't vectorize when the NoImplicitFloat attribute is used",
7465         "loop not vectorized due to NoImplicitFloat attribute",
7466         "NoImplicitFloat", ORE, L);
7467     Hints.emitRemarkWithHints();
7468     return false;
7469   }
7470 
7471   // Check if the target supports potentially unsafe FP vectorization.
7472   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7473   // for the target we're vectorizing for, to make sure none of the
7474   // additional fp-math flags can help.
7475   if (Hints.isPotentiallyUnsafe() &&
7476       TTI->isFPVectorizationPotentiallyUnsafe()) {
7477     reportVectorizationFailure(
7478         "Potentially unsafe FP op prevents vectorization",
7479         "loop not vectorized due to unsafe FP support.",
7480         "UnsafeFP", ORE, L);
7481     Hints.emitRemarkWithHints();
7482     return false;
7483   }
7484 
7485   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7486   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7487 
7488   // If an override option has been passed in for interleaved accesses, use it.
7489   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7490     UseInterleaved = EnableInterleavedMemAccesses;
7491 
7492   // Analyze interleaved memory accesses.
7493   if (UseInterleaved) {
7494     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7495   }
7496 
7497   // Use the cost model.
7498   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7499                                 F, &Hints, IAI);
7500   CM.collectValuesToIgnore();
7501 
7502   // Use the planner for vectorization.
7503   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7504 
7505   // Get user vectorization factor.
7506   unsigned UserVF = Hints.getWidth();
7507 
7508   // Plan how to best vectorize, return the best VF and its cost.
7509   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7510 
7511   VectorizationFactor VF = VectorizationFactor::Disabled();
7512   unsigned IC = 1;
7513   unsigned UserIC = Hints.getInterleave();
7514 
7515   if (MaybeVF) {
7516     VF = *MaybeVF;
7517     // Select the interleave count.
7518     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7519   }
7520 
7521   // Identify the diagnostic messages that should be produced.
7522   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7523   bool VectorizeLoop = true, InterleaveLoop = true;
7524   if (Requirements.doesNotMeet(F, L, Hints)) {
7525     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7526                          "requirements.\n");
7527     Hints.emitRemarkWithHints();
7528     return false;
7529   }
7530 
7531   if (VF.Width == 1) {
7532     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7533     VecDiagMsg = std::make_pair(
7534         "VectorizationNotBeneficial",
7535         "the cost-model indicates that vectorization is not beneficial");
7536     VectorizeLoop = false;
7537   }
7538 
7539   if (!MaybeVF && UserIC > 1) {
7540     // Tell the user interleaving was avoided up-front, despite being explicitly
7541     // requested.
7542     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7543                          "interleaving should be avoided up front\n");
7544     IntDiagMsg = std::make_pair(
7545         "InterleavingAvoided",
7546         "Ignoring UserIC, because interleaving was avoided up front");
7547     InterleaveLoop = false;
7548   } else if (IC == 1 && UserIC <= 1) {
7549     // Tell the user interleaving is not beneficial.
7550     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7551     IntDiagMsg = std::make_pair(
7552         "InterleavingNotBeneficial",
7553         "the cost-model indicates that interleaving is not beneficial");
7554     InterleaveLoop = false;
7555     if (UserIC == 1) {
7556       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7557       IntDiagMsg.second +=
7558           " and is explicitly disabled or interleave count is set to 1";
7559     }
7560   } else if (IC > 1 && UserIC == 1) {
7561     // Tell the user interleaving is beneficial, but it explicitly disabled.
7562     LLVM_DEBUG(
7563         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7564     IntDiagMsg = std::make_pair(
7565         "InterleavingBeneficialButDisabled",
7566         "the cost-model indicates that interleaving is beneficial "
7567         "but is explicitly disabled or interleave count is set to 1");
7568     InterleaveLoop = false;
7569   }
7570 
7571   // Override IC if user provided an interleave count.
7572   IC = UserIC > 0 ? UserIC : IC;
7573 
7574   // Emit diagnostic messages, if any.
7575   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7576   if (!VectorizeLoop && !InterleaveLoop) {
7577     // Do not vectorize or interleaving the loop.
7578     ORE->emit([&]() {
7579       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7580                                       L->getStartLoc(), L->getHeader())
7581              << VecDiagMsg.second;
7582     });
7583     ORE->emit([&]() {
7584       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7585                                       L->getStartLoc(), L->getHeader())
7586              << IntDiagMsg.second;
7587     });
7588     return false;
7589   } else if (!VectorizeLoop && InterleaveLoop) {
7590     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7591     ORE->emit([&]() {
7592       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7593                                         L->getStartLoc(), L->getHeader())
7594              << VecDiagMsg.second;
7595     });
7596   } else if (VectorizeLoop && !InterleaveLoop) {
7597     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7598                       << ") in " << DebugLocStr << '\n');
7599     ORE->emit([&]() {
7600       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7601                                         L->getStartLoc(), L->getHeader())
7602              << IntDiagMsg.second;
7603     });
7604   } else if (VectorizeLoop && InterleaveLoop) {
7605     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7606                       << ") in " << DebugLocStr << '\n');
7607     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7608   }
7609 
7610   LVP.setBestPlan(VF.Width, IC);
7611 
7612   using namespace ore;
7613   bool DisableRuntimeUnroll = false;
7614   MDNode *OrigLoopID = L->getLoopID();
7615 
7616   if (!VectorizeLoop) {
7617     assert(IC > 1 && "interleave count should not be 1 or 0");
7618     // If we decided that it is not legal to vectorize the loop, then
7619     // interleave it.
7620     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7621                                &CM);
7622     LVP.executePlan(Unroller, DT);
7623 
7624     ORE->emit([&]() {
7625       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7626                                 L->getHeader())
7627              << "interleaved loop (interleaved count: "
7628              << NV("InterleaveCount", IC) << ")";
7629     });
7630   } else {
7631     // If we decided that it is *legal* to vectorize the loop, then do it.
7632     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7633                            &LVL, &CM);
7634     LVP.executePlan(LB, DT);
7635     ++LoopsVectorized;
7636 
7637     // Add metadata to disable runtime unrolling a scalar loop when there are
7638     // no runtime checks about strides and memory. A scalar loop that is
7639     // rarely used is not worth unrolling.
7640     if (!LB.areSafetyChecksAdded())
7641       DisableRuntimeUnroll = true;
7642 
7643     // Report the vectorization decision.
7644     ORE->emit([&]() {
7645       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7646                                 L->getHeader())
7647              << "vectorized loop (vectorization width: "
7648              << NV("VectorizationFactor", VF.Width)
7649              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7650     });
7651   }
7652 
7653   Optional<MDNode *> RemainderLoopID =
7654       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7655                                       LLVMLoopVectorizeFollowupEpilogue});
7656   if (RemainderLoopID.hasValue()) {
7657     L->setLoopID(RemainderLoopID.getValue());
7658   } else {
7659     if (DisableRuntimeUnroll)
7660       AddRuntimeUnrollDisableMetaData(L);
7661 
7662     // Mark the loop as already vectorized to avoid vectorizing again.
7663     Hints.setAlreadyVectorized();
7664   }
7665 
7666   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7667   return true;
7668 }
7669 
7670 bool LoopVectorizePass::runImpl(
7671     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7672     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7673     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7674     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7675     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7676   SE = &SE_;
7677   LI = &LI_;
7678   TTI = &TTI_;
7679   DT = &DT_;
7680   BFI = &BFI_;
7681   TLI = TLI_;
7682   AA = &AA_;
7683   AC = &AC_;
7684   GetLAA = &GetLAA_;
7685   DB = &DB_;
7686   ORE = &ORE_;
7687   PSI = PSI_;
7688 
7689   // Don't attempt if
7690   // 1. the target claims to have no vector registers, and
7691   // 2. interleaving won't help ILP.
7692   //
7693   // The second condition is necessary because, even if the target has no
7694   // vector registers, loop vectorization may still enable scalar
7695   // interleaving.
7696   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7697     return false;
7698 
7699   bool Changed = false;
7700 
7701   // The vectorizer requires loops to be in simplified form.
7702   // Since simplification may add new inner loops, it has to run before the
7703   // legality and profitability checks. This means running the loop vectorizer
7704   // will simplify all loops, regardless of whether anything end up being
7705   // vectorized.
7706   for (auto &L : *LI)
7707     Changed |=
7708         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7709 
7710   // Build up a worklist of inner-loops to vectorize. This is necessary as
7711   // the act of vectorizing or partially unrolling a loop creates new loops
7712   // and can invalidate iterators across the loops.
7713   SmallVector<Loop *, 8> Worklist;
7714 
7715   for (Loop *L : *LI)
7716     collectSupportedLoops(*L, LI, ORE, Worklist);
7717 
7718   LoopsAnalyzed += Worklist.size();
7719 
7720   // Now walk the identified inner loops.
7721   while (!Worklist.empty()) {
7722     Loop *L = Worklist.pop_back_val();
7723 
7724     // For the inner loops we actually process, form LCSSA to simplify the
7725     // transform.
7726     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7727 
7728     Changed |= processLoop(L);
7729   }
7730 
7731   // Process each loop nest in the function.
7732   return Changed;
7733 }
7734 
7735 PreservedAnalyses LoopVectorizePass::run(Function &F,
7736                                          FunctionAnalysisManager &AM) {
7737     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7738     auto &LI = AM.getResult<LoopAnalysis>(F);
7739     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7740     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7741     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7742     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7743     auto &AA = AM.getResult<AAManager>(F);
7744     auto &AC = AM.getResult<AssumptionAnalysis>(F);
7745     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7746     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7747     MemorySSA *MSSA = EnableMSSALoopDependency
7748                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7749                           : nullptr;
7750 
7751     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7752     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7753         [&](Loop &L) -> const LoopAccessInfo & {
7754       LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7755       return LAM.getResult<LoopAccessAnalysis>(L, AR);
7756     };
7757     const ModuleAnalysisManager &MAM =
7758         AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7759     ProfileSummaryInfo *PSI =
7760         MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7761     bool Changed =
7762         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7763     if (!Changed)
7764       return PreservedAnalyses::all();
7765     PreservedAnalyses PA;
7766 
7767     // We currently do not preserve loopinfo/dominator analyses with outer loop
7768     // vectorization. Until this is addressed, mark these analyses as preserved
7769     // only for non-VPlan-native path.
7770     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7771     if (!EnableVPlanNativePath) {
7772       PA.preserve<LoopAnalysis>();
7773       PA.preserve<DominatorTreeAnalysis>();
7774     }
7775     PA.preserve<BasicAA>();
7776     PA.preserve<GlobalsAA>();
7777     return PA;
7778 }
7779