1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
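// For example (a conceptual sketch, not the actual emitted IR), with VF = 4 a
// loop such as
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + C[i];
// is rewritten so that every wide iteration processes four elements at once
// and the induction variable is advanced by 4:
//   for (int i = 0; i < n; i += 4)
//     A[i..i+3] = B[i..i+3] + C[i..i+3]; // one SIMD load/add/store per array
// with any leftover iterations handled by a scalar epilogue loop.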
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
180 static cl::opt<bool> MaximizeBandwidth(
181     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting the vectorization factor, "
             "which will be determined by the smallest type in the loop."));
184 
185 static cl::opt<bool> EnableInterleavedMemAccesses(
186     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
187     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
188 
189 /// An interleave-group may need masking if it resides in a block that needs
190 /// predication, or in order to mask away gaps.
191 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
192     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
193     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
194 
195 /// We don't interleave loops with a known constant trip count below this
196 /// number.
197 static const unsigned TinyTripCountInterleaveThreshold = 128;
198 
199 static cl::opt<unsigned> ForceTargetNumScalarRegs(
200     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
201     cl::desc("A flag that overrides the target's number of scalar registers."));
202 
203 static cl::opt<unsigned> ForceTargetNumVectorRegs(
204     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
205     cl::desc("A flag that overrides the target's number of vector registers."));
206 
207 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
208     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
209     cl::desc("A flag that overrides the target's max interleave factor for "
210              "scalar loops."));
211 
212 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
213     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
214     cl::desc("A flag that overrides the target's max interleave factor for "
215              "vectorized loops."));
216 
217 static cl::opt<unsigned> ForceTargetInstructionCost(
218     "force-target-instruction-cost", cl::init(0), cl::Hidden,
219     cl::desc("A flag that overrides the target's expected cost for "
220              "an instruction to a single constant value. Mostly "
221              "useful for getting consistent testing."));
222 
223 static cl::opt<unsigned> SmallLoopCost(
224     "small-loop-cost", cl::init(20), cl::Hidden,
225     cl::desc(
226         "The cost of a loop that is considered 'small' by the interleaver."));
227 
228 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
229     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
230     cl::desc("Enable the use of the block frequency analysis to access PGO "
231              "heuristics minimizing code growth in cold regions and being more "
232              "aggressive in hot regions."));
233 
234 // Runtime interleave loops for load/store throughput.
235 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
236     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
237     cl::desc(
238         "Enable runtime interleaving until load/store ports are saturated"));
239 
240 /// The number of stores in a loop that are allowed to need predication.
241 static cl::opt<unsigned> NumberOfStoresToPredicate(
242     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
243     cl::desc("Max number of stores to be predicated behind an if."));
244 
245 static cl::opt<bool> EnableIndVarRegisterHeur(
246     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
247     cl::desc("Count the induction variable only once when interleaving"));
248 
249 static cl::opt<bool> EnableCondStoresVectorization(
250     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));
252 
253 static cl::opt<unsigned> MaxNestedScalarReductionIC(
254     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
255     cl::desc("The maximum interleave count to use when interleaving a scalar "
256              "reduction in a nested loop."));
257 
258 cl::opt<bool> EnableVPlanNativePath(
259     "enable-vplan-native-path", cl::init(false), cl::Hidden,
260     cl::desc("Enable VPlan-native vectorization path with "
261              "support for outer loop vectorization."));
262 
263 // FIXME: Remove this switch once we have divergence analysis. Currently we
264 // assume divergent non-backedge branches when this switch is true.
265 cl::opt<bool> EnableVPlanPredication(
266     "enable-vplan-predication", cl::init(false), cl::Hidden,
267     cl::desc("Enable VPlan-native vectorization path predicator with "
268              "support for outer loop vectorization."));
269 
270 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
272 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
273 // verification of the H-CFGs built.
274 static cl::opt<bool> VPlanBuildStressTest(
275     "vplan-build-stress-test", cl::init(false), cl::Hidden,
276     cl::desc(
277         "Build VPlan for every supported loop nest in the function and bail "
278         "out right after the build (stress test the VPlan H-CFG construction "
279         "in the VPlan-native vectorization path)."));
280 
281 cl::opt<bool> llvm::EnableLoopInterleaving(
282     "interleave-loops", cl::init(true), cl::Hidden,
283     cl::desc("Enable loop interleaving in Loop vectorization passes"));
284 cl::opt<bool> llvm::EnableLoopVectorization(
285     "vectorize-loops", cl::init(true), cl::Hidden,
286     cl::desc("Run the Loop vectorization passes"));
287 
288 /// A helper function for converting Scalar types to vector types.
289 /// If the incoming type is void, we return void. If the VF is 1, we return
290 /// the scalar type.
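/// For example, given an i32 scalar type and VF = 4 this returns <4 x i32>;
/// with VF = 1 (or a void type) the input type is returned unchanged.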
291 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
292   if (Scalar->isVoidTy() || VF == 1)
293     return Scalar;
294   return VectorType::get(Scalar, VF);
295 }
296 
297 /// A helper function that returns the type of loaded or stored value.
298 static Type *getMemInstValueType(Value *I) {
299   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
300          "Expected Load or Store instruction");
301   if (auto *LI = dyn_cast<LoadInst>(I))
302     return LI->getType();
303   return cast<StoreInst>(I)->getValueOperand()->getType();
304 }
305 
306 /// A helper function that returns true if the given type is irregular. The
307 /// type is irregular if its allocated size doesn't equal the store size of an
308 /// element of the corresponding vector type at the given vectorization factor.
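/// For example, under a typical data layout i1 has an alloc size of one byte,
/// so for VF = 8 the scalar elements occupy 8 bytes while <8 x i1> has a store
/// size of only one byte; i1 is therefore irregular at that VF.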
309 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
310   // Determine if an array of VF elements of type Ty is "bitcast compatible"
311   // with a <VF x Ty> vector.
312   if (VF > 1) {
313     auto *VectorTy = VectorType::get(Ty, VF);
314     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
315   }
316 
317   // If the vectorization factor is one, we just check if an array of type Ty
318   // requires padding between elements.
319   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
320 }
321 
322 /// A helper function that returns the reciprocal of the block probability of
323 /// predicated blocks. If we return X, we are assuming the predicated block
324 /// will execute once for every X iterations of the loop header.
325 ///
326 /// TODO: We should use actual block probability here, if available. Currently,
327 ///       we always assume predicated blocks have a 50% chance of executing.
328 static unsigned getReciprocalPredBlockProb() { return 2; }
329 
330 /// A helper function that adds a 'fast' flag to floating-point operations.
331 static Value *addFastMathFlag(Value *V) {
332   if (isa<FPMathOperator>(V))
333     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
334   return V;
335 }
336 
337 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
338   if (isa<FPMathOperator>(V))
339     cast<Instruction>(V)->setFastMathFlags(FMF);
340   return V;
341 }
342 
343 /// A helper function that returns an integer or floating-point constant with
344 /// value C.
345 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
346   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
347                            : ConstantFP::get(Ty, C);
348 }
349 
350 namespace llvm {
351 
352 /// InnerLoopVectorizer vectorizes loops which contain only one basic
353 /// block to a specified vectorization factor (VF).
354 /// This class performs the widening of scalars into vectors, or multiple
355 /// scalars. This class also implements the following features:
356 /// * It inserts an epilogue loop for handling loops that don't have iteration
357 ///   counts that are known to be a multiple of the vectorization factor.
358 /// * It handles the code generation for reduction variables.
359 /// * Scalarization (implementation using scalars) of un-vectorizable
360 ///   instructions.
361 /// InnerLoopVectorizer does not perform any vectorization-legality
362 /// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found, for a given vectorization factor.
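///
/// A rough sketch of the block structure this class emits (which bypass
/// blocks appear depends on the runtime checks that are required):
///
///   bypass check blocks   (branch to the scalar preheader on failure)
///         |
///   vector preheader
///         |
///   vector body           (latch branches back to the body)
///         |
///   middle block          (branches to the exit block when no iterations
///         |                remain, otherwise to the scalar preheader)
///   scalar preheader
///         |
///   scalar epilogue loop
///         |
///   exit block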
366 class InnerLoopVectorizer {
367 public:
368   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
369                       LoopInfo *LI, DominatorTree *DT,
370                       const TargetLibraryInfo *TLI,
371                       const TargetTransformInfo *TTI, AssumptionCache *AC,
372                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
373                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
374                       LoopVectorizationCostModel *CM)
375       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
376         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
377         Builder(PSE.getSE()->getContext()),
378         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
379   virtual ~InnerLoopVectorizer() = default;
380 
381   /// Create a new empty loop. Unlink the old loop and connect the new one.
382   /// Return the pre-header block of the new loop.
383   BasicBlock *createVectorizedLoopSkeleton();
384 
385   /// Widen a single instruction within the innermost loop.
386   void widenInstruction(Instruction &I);
387 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
389   void fixVectorizedLoop();
390 
391   // Return true if any runtime check is added.
392   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
393 
394   /// A type for vectorized values in the new loop. Each value from the
395   /// original loop, when vectorized, is represented by UF vector values in the
396   /// new unrolled loop, where UF is the unroll factor.
397   using VectorParts = SmallVector<Value *, 2>;
398 
399   /// Vectorize a single PHINode in a block. This method handles the induction
400   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
401   /// arbitrary length vectors.
402   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
403 
404   /// A helper function to scalarize a single Instruction in the innermost loop.
405   /// Generates a sequence of scalar instances for each lane between \p MinLane
406   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive.
408   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
409                             bool IfPredicateInstr);
410 
411   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
412   /// is provided, the integer induction variable will first be truncated to
413   /// the corresponding type.
414   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
415 
416   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
417   /// vector or scalar value on-demand if one is not yet available. When
418   /// vectorizing a loop, we visit the definition of an instruction before its
419   /// uses. When visiting the definition, we either vectorize or scalarize the
420   /// instruction, creating an entry for it in the corresponding map. (In some
421   /// cases, such as induction variables, we will create both vector and scalar
422   /// entries.) Then, as we encounter uses of the definition, we derive values
423   /// for each scalar or vector use unless such a value is already available.
424   /// For example, if we scalarize a definition and one of its uses is vector,
425   /// we build the required vector on-demand with an insertelement sequence
426   /// when visiting the use. Otherwise, if the use is scalar, we can use the
427   /// existing scalar definition.
428   ///
429   /// Return a value in the new loop corresponding to \p V from the original
430   /// loop at unroll index \p Part. If the value has already been vectorized,
431   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
432   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
433   /// a new vector value on-demand by inserting the scalar values into a vector
434   /// with an insertelement sequence. If the value has been neither vectorized
435   /// nor scalarized, it must be loop invariant, so we simply broadcast the
436   /// value into a vector.
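  ///
  /// For example (an illustrative IR sketch with hypothetical names and
  /// types), if %x was scalarized into lanes %x.0 ... %x.3 and a vector use
  /// is later encountered, the vector value is assembled on demand:
  ///   %v.0 = insertelement <4 x i32> undef, i32 %x.0, i32 0
  ///   %v.1 = insertelement <4 x i32> %v.0,  i32 %x.1, i32 1
  ///   %v.2 = insertelement <4 x i32> %v.1,  i32 %x.2, i32 2
  ///   %v.3 = insertelement <4 x i32> %v.2,  i32 %x.3, i32 3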
437   Value *getOrCreateVectorValue(Value *V, unsigned Part);
438 
439   /// Return a value in the new loop corresponding to \p V from the original
440   /// loop at unroll and vector indices \p Instance. If the value has been
441   /// vectorized but not scalarized, the necessary extractelement instruction
442   /// will be generated.
443   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
444 
445   /// Construct the vector value of a scalarized value \p V one lane at a time.
446   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
447 
448   /// Try to vectorize the interleaved access group that \p Instr belongs to,
449   /// optionally masking the vector operations if \p BlockInMask is non-null.
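  ///
  /// For example (an illustrative sketch), loads of A[2*i] and A[2*i + 1]
  /// form an interleave group of factor 2; with VF = 4 they can be emitted as
  /// a single wide load of eight elements followed by shufflevectors that
  /// de-interleave the even and odd elements into the two result vectors.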
450   void vectorizeInterleaveGroup(Instruction *Instr,
451                                 VectorParts *BlockInMask = nullptr);
452 
453   /// Vectorize Load and Store instructions, optionally masking the vector
454   /// operations if \p BlockInMask is non-null.
455   void vectorizeMemoryInstruction(Instruction *Instr,
456                                   VectorParts *BlockInMask = nullptr);
457 
458   /// Set the debug location in the builder using the debug location in
459   /// the instruction.
460   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
461 
462   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
464 
465 protected:
466   friend class LoopVectorizationPlanner;
467 
468   /// A small list of PHINodes.
469   using PhiVector = SmallVector<PHINode *, 4>;
470 
471   /// A type for scalarized values in the new loop. Each value from the
472   /// original loop, when scalarized, is represented by UF x VF scalar values
473   /// in the new unrolled loop, where UF is the unroll factor and VF is the
474   /// vectorization factor.
475   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
476 
477   /// Set up the values of the IVs correctly when exiting the vector loop.
478   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
479                     Value *CountRoundDown, Value *EndValue,
480                     BasicBlock *MiddleBlock);
481 
482   /// Create a new induction variable inside L.
483   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
484                                    Value *Step, Instruction *DL);
485 
486   /// Handle all cross-iteration phis in the header.
487   void fixCrossIterationPHIs();
488 
489   /// Fix a first-order recurrence. This is the second phase of vectorizing
490   /// this phi node.
491   void fixFirstOrderRecurrence(PHINode *Phi);
492 
493   /// Fix a reduction cross-iteration phi. This is the second phase of
494   /// vectorizing this phi node.
495   void fixReduction(PHINode *Phi);
496 
497   /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing, we only handle real values
499   /// that were defined inside the loop and we should have one value for
500   /// each predecessor of its parent basic block. See PR14725.
501   void fixLCSSAPHIs();
502 
503   /// Iteratively sink the scalarized operands of a predicated instruction into
504   /// the block that was created for it.
505   void sinkScalarOperands(Instruction *PredInst);
506 
507   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
508   /// represented as.
509   void truncateToMinimalBitwidths();
510 
511   /// Insert the new loop to the loop hierarchy and pass manager
512   /// and update the analysis passes.
513   void updateAnalysis();
514 
515   /// Create a broadcast instruction. This method generates a broadcast
516   /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable, we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
519   /// element.
520   virtual Value *getBroadcastInstrs(Value *V);
521 
  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
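  /// For example, with an integer IV, StartIdx = 0, Step = 1 and VF = 4, a
  /// broadcast of %start becomes <%start, %start + 1, %start + 2, %start + 3>.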
525   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
526                                Instruction::BinaryOps Opcode =
527                                Instruction::BinaryOpsEnd);
528 
529   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
530   /// variable on which to base the steps, \p Step is the size of the step, and
531   /// \p EntryVal is the value from the original loop that maps to the steps.
532   /// Note that \p EntryVal doesn't have to be an induction variable - it
533   /// can also be a truncate instruction.
534   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
535                         const InductionDescriptor &ID);
536 
537   /// Create a vector induction phi node based on an existing scalar one. \p
538   /// EntryVal is the value from the original loop that maps to the vector phi
539   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
540   /// truncate instruction, instead of widening the original IV, we widen a
541   /// version of the IV truncated to \p EntryVal's type.
542   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
543                                        Value *Step, Instruction *EntryVal);
544 
545   /// Returns true if an instruction \p I should be scalarized instead of
546   /// vectorized for the chosen vectorization factor.
547   bool shouldScalarizeInstruction(Instruction *I) const;
548 
549   /// Returns true if we should generate a scalar version of \p IV.
550   bool needsScalarInduction(Instruction *IV) const;
551 
552   /// If there is a cast involved in the induction variable \p ID, which should
553   /// be ignored in the vectorized loop body, this function records the
554   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
555   /// cast. We had already proved that the casted Phi is equal to the uncasted
556   /// Phi in the vectorized loop (under a runtime guard), and therefore
557   /// there is no need to vectorize the cast - the same value can be used in the
558   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
561   ///
562   /// \p EntryVal is the value from the original loop that maps to the vector
563   /// phi node and is used to distinguish what is the IV currently being
564   /// processed - original one (if \p EntryVal is a phi corresponding to the
565   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
567   /// latter case \p EntryVal is a TruncInst and we must not record anything for
568   /// that IV, but it's error-prone to expect callers of this routine to care
569   /// about that, hence this explicit parameter.
570   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
571                                              const Instruction *EntryVal,
572                                              Value *VectorLoopValue,
573                                              unsigned Part,
574                                              unsigned Lane = UINT_MAX);
575 
576   /// Generate a shuffle sequence that will reverse the vector Vec.
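  /// For a 4-element vector this amounts to a shufflevector with the mask
  /// <3, 2, 1, 0> (the mask length follows the chosen VF).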
577   virtual Value *reverseVector(Value *Vec);
578 
579   /// Returns (and creates if needed) the original loop trip count.
580   Value *getOrCreateTripCount(Loop *NewLoop);
581 
582   /// Returns (and creates if needed) the trip count of the widened loop.
583   Value *getOrCreateVectorTripCount(Loop *NewLoop);
584 
585   /// Returns a bitcasted value to the requested vector type.
586   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
587   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
588                                 const DataLayout &DL);
589 
590   /// Emit a bypass check to see if the vector trip count is zero, including if
591   /// it overflows.
592   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
593 
594   /// Emit a bypass check to see if all of the SCEV assumptions we've
595   /// had to make are correct.
596   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
597 
598   /// Emit bypass checks to check any memory assumptions we may have made.
599   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
600 
601   /// Compute the transformed value of Index at offset StartValue using step
602   /// StepValue.
603   /// For integer induction, returns StartValue + Index * StepValue.
604   /// For pointer induction, returns StartValue[Index * StepValue].
605   /// FIXME: The newly created binary instructions should contain nsw/nuw
606   /// flags, which can be found from the original scalar operations.
607   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
608                               const DataLayout &DL,
609                               const InductionDescriptor &ID) const;
610 
611   /// Add additional metadata to \p To that was not present on \p Orig.
612   ///
613   /// Currently this is used to add the noalias annotations based on the
614   /// inserted memchecks.  Use this for instructions that are *cloned* into the
615   /// vector loop.
616   void addNewMetadata(Instruction *To, const Instruction *Orig);
617 
618   /// Add metadata from one instruction to another.
619   ///
620   /// This includes both the original MDs from \p From and additional ones (\see
621   /// addNewMetadata).  Use this for *newly created* instructions in the vector
622   /// loop.
623   void addMetadata(Instruction *To, Instruction *From);
624 
625   /// Similar to the previous function but it adds the metadata to a
626   /// vector of instructions.
627   void addMetadata(ArrayRef<Value *> To, Instruction *From);
628 
629   /// The original loop.
630   Loop *OrigLoop;
631 
632   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
633   /// dynamic knowledge to simplify SCEV expressions and converts them to a
634   /// more usable form.
635   PredicatedScalarEvolution &PSE;
636 
637   /// Loop Info.
638   LoopInfo *LI;
639 
640   /// Dominator Tree.
641   DominatorTree *DT;
642 
643   /// Alias Analysis.
644   AliasAnalysis *AA;
645 
646   /// Target Library Info.
647   const TargetLibraryInfo *TLI;
648 
649   /// Target Transform Info.
650   const TargetTransformInfo *TTI;
651 
652   /// Assumption Cache.
653   AssumptionCache *AC;
654 
655   /// Interface to emit optimization remarks.
656   OptimizationRemarkEmitter *ORE;
657 
658   /// LoopVersioning.  It's only set up (non-null) if memchecks were
659   /// used.
660   ///
661   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
663   std::unique_ptr<LoopVersioning> LVer;
664 
665   /// The vectorization SIMD factor to use. Each vector will have this many
666   /// vector elements.
667   unsigned VF;
668 
669   /// The vectorization unroll factor to use. Each scalar is vectorized to this
670   /// many different vector instructions.
671   unsigned UF;
672 
673   /// The builder that we use
674   IRBuilder<> Builder;
675 
676   // --- Vectorization state ---
677 
678   /// The vector-loop preheader.
679   BasicBlock *LoopVectorPreHeader;
680 
681   /// The scalar-loop preheader.
682   BasicBlock *LoopScalarPreHeader;
683 
684   /// Middle Block between the vector and the scalar.
685   BasicBlock *LoopMiddleBlock;
686 
687   /// The ExitBlock of the scalar loop.
688   BasicBlock *LoopExitBlock;
689 
690   /// The vector loop body.
691   BasicBlock *LoopVectorBody;
692 
693   /// The scalar loop body.
694   BasicBlock *LoopScalarBody;
695 
696   /// A list of all bypass blocks. The first block is the entry of the loop.
697   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
698 
699   /// The new Induction variable which was added to the new block.
700   PHINode *Induction = nullptr;
701 
702   /// The induction variable of the old basic block.
703   PHINode *OldInduction = nullptr;
704 
705   /// Maps values from the original loop to their corresponding values in the
706   /// vectorized loop. A key value can map to either vector values, scalar
707   /// values or both kinds of values, depending on whether the key was
708   /// vectorized and scalarized.
709   VectorizerValueMap VectorLoopValueMap;
710 
711   /// Store instructions that were predicated.
712   SmallVector<Instruction *, 4> PredicatedInstructions;
713 
714   /// Trip count of the original loop.
715   Value *TripCount = nullptr;
716 
717   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
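  /// (e.g. a trip count of 103 with VF = 4 and UF = 2 gives
  /// 103 - 103 % 8 = 96).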
718   Value *VectorTripCount = nullptr;
719 
720   /// The legality analysis.
721   LoopVectorizationLegality *Legal;
722 
  /// The profitability analysis.
724   LoopVectorizationCostModel *Cost;
725 
726   // Record whether runtime checks are added.
727   bool AddedSafetyChecks = false;
728 
729   // Holds the end values for each induction variable. We save the end values
730   // so we can later fix-up the external users of the induction variables.
731   DenseMap<PHINode *, Value *> IVEndValues;
732 
733   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
734   // fixed up at the end of vector code generation.
735   SmallVector<PHINode *, 8> OrigPHIsToFix;
736 };
737 
738 class InnerLoopUnroller : public InnerLoopVectorizer {
739 public:
740   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
741                     LoopInfo *LI, DominatorTree *DT,
742                     const TargetLibraryInfo *TLI,
743                     const TargetTransformInfo *TTI, AssumptionCache *AC,
744                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
745                     LoopVectorizationLegality *LVL,
746                     LoopVectorizationCostModel *CM)
747       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
748                             UnrollFactor, LVL, CM) {}
749 
750 private:
751   Value *getBroadcastInstrs(Value *V) override;
752   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
753                        Instruction::BinaryOps Opcode =
754                        Instruction::BinaryOpsEnd) override;
755   Value *reverseVector(Value *Vec) override;
756 };
757 
758 } // end namespace llvm
759 
/// Look for a meaningful debug location on the instruction or its
761 /// operands.
762 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
763   if (!I)
764     return I;
765 
766   DebugLoc Empty;
767   if (I->getDebugLoc() != Empty)
768     return I;
769 
770   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
771     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
772       if (OpInst->getDebugLoc() != Empty)
773         return OpInst;
774   }
775 
776   return I;
777 }
778 
779 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
780   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
781     const DILocation *DIL = Inst->getDebugLoc();
782     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
783         !isa<DbgInfoIntrinsic>(Inst)) {
784       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
785       if (NewDIL)
786         B.SetCurrentDebugLocation(NewDIL.getValue());
787       else
788         LLVM_DEBUG(dbgs()
789                    << "Failed to create new discriminator: "
790                    << DIL->getFilename() << " Line: " << DIL->getLine());
791     }
792     else
793       B.SetCurrentDebugLocation(DIL);
794   } else
795     B.SetCurrentDebugLocation(DebugLoc());
796 }
797 
798 /// Write a record \p DebugMsg about vectorization failure to the debug
799 /// output stream. If \p I is passed, it is an instruction that prevents
800 /// vectorization.
801 #ifndef NDEBUG
802 static void debugVectorizationFailure(const StringRef DebugMsg,
803     Instruction *I) {
804   dbgs() << "LV: Not vectorizing: " << DebugMsg;
805   if (I != nullptr)
806     dbgs() << " " << *I;
807   else
808     dbgs() << '.';
809   dbgs() << '\n';
810 }
811 #endif
812 
813 /// Create an analysis remark that explains why vectorization failed
814 ///
815 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
816 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
817 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
818 /// the location of the remark.  \return the remark object that can be
819 /// streamed to.
820 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
821     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
822   Value *CodeRegion = TheLoop->getHeader();
823   DebugLoc DL = TheLoop->getStartLoc();
824 
825   if (I) {
826     CodeRegion = I->getParent();
827     // If there is no debug location attached to the instruction, revert back to
828     // using the loop's.
829     if (I->getDebugLoc())
830       DL = I->getDebugLoc();
831   }
832 
833   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
834   R << "loop not vectorized: ";
835   return R;
836 }
837 
838 namespace llvm {
839 
840 void reportVectorizationFailure(const StringRef DebugMsg,
841     const StringRef OREMsg, const StringRef ORETag,
842     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
843   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
844   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
845   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
846                 ORETag, TheLoop, I) << OREMsg);
847 }
848 
849 } // end namespace llvm
850 
851 #ifndef NDEBUG
852 /// \return string containing a file name and a line # for the given loop.
853 static std::string getDebugLocString(const Loop *L) {
854   std::string Result;
855   if (L) {
856     raw_string_ostream OS(Result);
857     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
858       LoopDbgLoc.print(OS);
859     else
860       // Just print the module name.
861       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
862     OS.flush();
863   }
864   return Result;
865 }
866 #endif
867 
868 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
869                                          const Instruction *Orig) {
870   // If the loop was versioned with memchecks, add the corresponding no-alias
871   // metadata.
872   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
873     LVer->annotateInstWithNoAlias(To, Orig);
874 }
875 
876 void InnerLoopVectorizer::addMetadata(Instruction *To,
877                                       Instruction *From) {
878   propagateMetadata(To, From);
879   addNewMetadata(To, From);
880 }
881 
882 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
883                                       Instruction *From) {
884   for (Value *V : To) {
885     if (Instruction *I = dyn_cast<Instruction>(V))
886       addMetadata(I, From);
887   }
888 }
889 
890 namespace llvm {
891 
// Hints for the loop vectorization cost model describing how the scalar
// epilogue loop should be lowered.
894 enum ScalarEpilogueLowering {
895 
896   // The default: allowing scalar epilogues.
897   CM_ScalarEpilogueAllowed,
898 
899   // Vectorization with OptForSize: don't allow epilogues.
900   CM_ScalarEpilogueNotAllowedOptSize,
901 
  // A special case of vectorization with OptForSize: loops with a very small
903   // trip count are considered for vectorization under OptForSize, thereby
904   // making sure the cost of their loop body is dominant, free of runtime
905   // guards and scalar iteration overheads.
906   CM_ScalarEpilogueNotAllowedLowTripLoop,
907 
908   // Loop hint predicate indicating an epilogue is undesired.
909   CM_ScalarEpilogueNotNeededPredicatePragma
910 };
911 
912 /// LoopVectorizationCostModel - estimates the expected speedups due to
913 /// vectorization.
914 /// In many cases vectorization is not profitable. This can happen because of
915 /// a number of reasons. In this class we mainly attempt to predict the
916 /// expected speedup/slowdowns due to the supported instruction set. We use the
917 /// TargetTransformInfo to query the different backends for the cost of
918 /// different operations.
919 class LoopVectorizationCostModel {
920 public:
921   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
922                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
923                              LoopVectorizationLegality *Legal,
924                              const TargetTransformInfo &TTI,
925                              const TargetLibraryInfo *TLI, DemandedBits *DB,
926                              AssumptionCache *AC,
927                              OptimizationRemarkEmitter *ORE, const Function *F,
928                              const LoopVectorizeHints *Hints,
929                              InterleavedAccessInfo &IAI)
930       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
931         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
932         Hints(Hints), InterleaveInfo(IAI) {}
933 
934   /// \return An upper bound for the vectorization factor, or None if
935   /// vectorization and interleaving should be avoided up front.
936   Optional<unsigned> computeMaxVF();
937 
938   /// \return True if runtime checks are required for vectorization, and false
939   /// otherwise.
940   bool runtimeChecksRequired();
941 
942   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
944   /// then this vectorization factor will be selected if vectorization is
945   /// possible.
946   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
947 
948   /// Setup cost-based decisions for user vectorization factor.
949   void selectUserVectorizationFactor(unsigned UserVF) {
950     collectUniformsAndScalars(UserVF);
951     collectInstsToScalarize(UserVF);
952   }
953 
954   /// \return The size (in bits) of the smallest and widest types in the code
955   /// that needs to be vectorized. We ignore values that remain scalar such as
956   /// 64 bit loop indices.
957   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
958 
959   /// \return The desired interleave count.
960   /// If interleave count has been specified by metadata it will be returned.
961   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
962   /// are the selected vectorization factor and the cost of the selected VF.
963   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
964 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form it takes after vectorization is decided based on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is then used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
972   void setCostBasedWideningDecision(unsigned VF);
973 
974   /// A struct that represents some properties of the register usage
975   /// of a loop.
976   struct RegisterUsage {
977     /// Holds the number of loop invariant values that are used in the loop.
978     unsigned LoopInvariantRegs;
979 
980     /// Holds the maximum number of concurrent live intervals in the loop.
981     unsigned MaxLocalUsers;
982   };
983 
984   /// \return Returns information about the register usages of the loop for the
985   /// given vectorization factors.
986   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
987 
988   /// Collect values we want to ignore in the cost model.
989   void collectValuesToIgnore();
990 
991   /// \returns The smallest bitwidth each instruction can be represented with.
992   /// The vector equivalents of these instructions should be truncated to this
993   /// type.
994   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
995     return MinBWs;
996   }
997 
998   /// \returns True if it is more profitable to scalarize instruction \p I for
999   /// vectorization factor \p VF.
1000   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1001     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1002 
1003     // Cost model is not run in the VPlan-native path - return conservative
1004     // result until this changes.
1005     if (EnableVPlanNativePath)
1006       return false;
1007 
1008     auto Scalars = InstsToScalarize.find(VF);
1009     assert(Scalars != InstsToScalarize.end() &&
1010            "VF not yet analyzed for scalarization profitability");
1011     return Scalars->second.find(I) != Scalars->second.end();
1012   }
1013 
1014   /// Returns true if \p I is known to be uniform after vectorization.
1015   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1016     if (VF == 1)
1017       return true;
1018 
1019     // Cost model is not run in the VPlan-native path - return conservative
1020     // result until this changes.
1021     if (EnableVPlanNativePath)
1022       return false;
1023 
1024     auto UniformsPerVF = Uniforms.find(VF);
1025     assert(UniformsPerVF != Uniforms.end() &&
1026            "VF not yet analyzed for uniformity");
1027     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1028   }
1029 
1030   /// Returns true if \p I is known to be scalar after vectorization.
1031   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1032     if (VF == 1)
1033       return true;
1034 
1035     // Cost model is not run in the VPlan-native path - return conservative
1036     // result until this changes.
1037     if (EnableVPlanNativePath)
1038       return false;
1039 
1040     auto ScalarsPerVF = Scalars.find(VF);
1041     assert(ScalarsPerVF != Scalars.end() &&
1042            "Scalar values are not calculated for VF");
1043     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1044   }
1045 
1046   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1047   /// for vectorization factor \p VF.
1048   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1049     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1050            !isProfitableToScalarize(I, VF) &&
1051            !isScalarAfterVectorization(I, VF);
1052   }
1053 
  /// Decision that was taken during cost calculation for a memory instruction.
1055   enum InstWidening {
1056     CM_Unknown,
1057     CM_Widen,         // For consecutive accesses with stride +1.
1058     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1059     CM_Interleave,
1060     CM_GatherScatter,
1061     CM_Scalarize
1062   };
1063 
1064   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1065   /// instruction \p I and vector width \p VF.
1066   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1067                            unsigned Cost) {
1068     assert(VF >= 2 && "Expected VF >=2");
1069     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1070   }
1071 
1072   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1073   /// interleaving group \p Grp and vector width \p VF.
1074   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1075                            InstWidening W, unsigned Cost) {
1076     assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1079     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1080       if (auto *I = Grp->getMember(i)) {
1081         if (Grp->getInsertPos() == I)
1082           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1083         else
1084           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1085       }
1086     }
1087   }
1088 
1089   /// Return the cost model decision for the given instruction \p I and vector
1090   /// width \p VF. Return CM_Unknown if this instruction did not pass
1091   /// through the cost modeling.
1092   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1093     assert(VF >= 2 && "Expected VF >=2");
1094 
1095     // Cost model is not run in the VPlan-native path - return conservative
1096     // result until this changes.
1097     if (EnableVPlanNativePath)
1098       return CM_GatherScatter;
1099 
1100     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1101     auto Itr = WideningDecisions.find(InstOnVF);
1102     if (Itr == WideningDecisions.end())
1103       return CM_Unknown;
1104     return Itr->second.first;
1105   }
1106 
1107   /// Return the vectorization cost for the given instruction \p I and vector
1108   /// width \p VF.
1109   unsigned getWideningCost(Instruction *I, unsigned VF) {
1110     assert(VF >= 2 && "Expected VF >=2");
1111     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1112     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1113            "The cost is not calculated");
1114     return WideningDecisions[InstOnVF].second;
1115   }
1116 
1117   /// Return True if instruction \p I is an optimizable truncate whose operand
1118   /// is an induction variable. Such a truncate will be removed by adding a new
1119   /// induction variable with the destination type.
1120   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1121     // If the instruction is not a truncate, return false.
1122     auto *Trunc = dyn_cast<TruncInst>(I);
1123     if (!Trunc)
1124       return false;
1125 
1126     // Get the source and destination types of the truncate.
1127     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1128     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1129 
1130     // If the truncate is free for the given types, return false. Replacing a
1131     // free truncate with an induction variable would add an induction variable
1132     // update instruction to each iteration of the loop. We exclude from this
1133     // check the primary induction variable since it will need an update
1134     // instruction regardless.
1135     Value *Op = Trunc->getOperand(0);
1136     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1137       return false;
1138 
1139     // If the truncated value is not an induction variable, return false.
1140     return Legal->isInductionPhi(Op);
1141   }
1142 
1143   /// Collects the instructions to scalarize for each predicated instruction in
1144   /// the loop.
1145   void collectInstsToScalarize(unsigned VF);
1146 
1147   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decisions for Load/Store instructions, which
  /// may be vectorized as interleaved accesses, gather/scatter or scalarized.
1150   void collectUniformsAndScalars(unsigned VF) {
1151     // Do the analysis once.
1152     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1153       return;
1154     setCostBasedWideningDecision(VF);
1155     collectLoopUniforms(VF);
1156     collectLoopScalars(VF);
1157   }
1158 
1159   /// Returns true if the target machine supports masked store operation
1160   /// for the given \p DataType and kind of access to \p Ptr.
1161   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1162     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1163   }
1164 
1165   /// Returns true if the target machine supports masked load operation
1166   /// for the given \p DataType and kind of access to \p Ptr.
1167   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1168     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1169   }
1170 
1171   /// Returns true if the target machine supports masked scatter operation
1172   /// for the given \p DataType.
1173   bool isLegalMaskedScatter(Type *DataType) {
1174     return TTI.isLegalMaskedScatter(DataType);
1175   }
1176 
1177   /// Returns true if the target machine supports masked gather operation
1178   /// for the given \p DataType.
1179   bool isLegalMaskedGather(Type *DataType) {
1180     return TTI.isLegalMaskedGather(DataType);
1181   }
1182 
1183   /// Returns true if the target machine can represent \p V as a masked gather
1184   /// or scatter operation.
1185   bool isLegalGatherOrScatter(Value *V) {
1186     bool LI = isa<LoadInst>(V);
1187     bool SI = isa<StoreInst>(V);
1188     if (!LI && !SI)
1189       return false;
1190     auto *Ty = getMemInstValueType(V);
1191     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1192   }
1193 
1194   /// Returns true if \p I is an instruction that will be scalarized with
1195   /// predication. Such instructions include conditional stores and
1196   /// instructions that may divide by zero.
1197   /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1199   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1200 
1201   // Returns true if \p I is an instruction that will be predicated either
1202   // through scalar predication or masked load/store or masked gather/scatter.
1203   // Superset of instructions that return true for isScalarWithPredication.
1204   bool isPredicatedInst(Instruction *I) {
1205     if (!blockNeedsPredication(I->getParent()))
1206       return false;
1207     // Loads and stores that need some form of masked operation are predicated
1208     // instructions.
1209     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1210       return Legal->isMaskRequired(I);
1211     return isScalarWithPredication(I);
1212   }
1213 
1214   /// Returns true if \p I is a memory instruction with consecutive memory
1215   /// access that can be widened.
1216   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1217 
1218   /// Returns true if \p I is a memory instruction in an interleaved-group
1219   /// of memory accesses that can be vectorized with wide vector loads/stores
1220   /// and shuffles.
1221   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1222 
1223   /// Check if \p Instr belongs to any interleaved access group.
1224   bool isAccessInterleaved(Instruction *Instr) {
1225     return InterleaveInfo.isInterleaved(Instr);
1226   }
1227 
1228   /// Get the interleaved access group that \p Instr belongs to.
1229   const InterleaveGroup<Instruction> *
1230   getInterleavedAccessGroup(Instruction *Instr) {
1231     return InterleaveInfo.getInterleaveGroup(Instr);
1232   }
1233 
1234   /// Returns true if an interleaved group requires a scalar iteration
1235   /// to handle accesses with gaps, and there is nothing preventing us from
1236   /// creating a scalar epilogue.
1237   bool requiresScalarEpilogue() const {
1238     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1239   }
1240 
1241   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1242   /// loop hint annotation.
1243   bool isScalarEpilogueAllowed() const {
1244     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1245   }
1246 
1247   /// Returns true if all loop blocks should be masked to fold tail loop.
1248   bool foldTailByMasking() const { return FoldTailByMasking; }
1249 
1250   bool blockNeedsPredication(BasicBlock *BB) {
1251     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1252   }
1253 
1254   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1255   /// with factor VF.  Return the cost of the instruction, including
1256   /// scalarization overhead if it's needed.
1257   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1258 
1259   /// Estimate cost of a call instruction CI if it were vectorized with factor
1260   /// VF. Return the cost of the instruction, including scalarization overhead
1261   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available, or it is too
  /// expensive.
1264   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1265 
1266 private:
1267   unsigned NumPredStores = 0;
1268 
1269   /// \return An upper bound for the vectorization factor, larger than zero.
1270   /// One is returned if vectorization should best be avoided due to cost.
1271   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1272 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1280   using VectorizationCostTy = std::pair<unsigned, bool>;
1281 
1282   /// Returns the expected execution cost. The unit of the cost does
1283   /// not matter because we use the 'cost' units to compare different
1284   /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor.
1286   VectorizationCostTy expectedCost(unsigned VF);
1287 
1288   /// Returns the execution time cost of an instruction for a given vector
1289   /// width. Vector width of one means scalar.
1290   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1291 
1292   /// The cost-computation logic from getInstructionCost which provides
1293   /// the vector type as an output parameter.
1294   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1295 
1296   /// Calculate vectorization cost of memory instruction \p I.
1297   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1298 
1299   /// The cost computation for scalarized memory instruction.
1300   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1301 
1302   /// The cost computation for interleaving group of memory instructions.
1303   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1304 
1305   /// The cost computation for Gather/Scatter instruction.
1306   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1307 
1308   /// The cost computation for widening instruction \p I with consecutive
1309   /// memory access.
1310   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1311 
  /// The cost calculation for a Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of the
  /// last element).
1316   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1317 
1318   /// Estimate the overhead of scalarizing an instruction. This is a
1319   /// convenience wrapper for the type-based getScalarizationOverhead API.
1320   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1321 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1324   bool isConsecutiveLoadOrStore(Instruction *I);
1325 
1326   /// Returns true if an artificially high cost for emulated masked memrefs
1327   /// should be used.
1328   bool useEmulatedMaskMemRefHack(Instruction *I);
1329 
1330   /// Map of scalar integer values to the smallest bitwidth they can be legally
1331   /// represented as. The vector equivalents of these values should be truncated
1332   /// to this type.
1333   MapVector<Instruction *, uint64_t> MinBWs;
1334 
1335   /// A type representing the costs for instructions if they were to be
1336   /// scalarized rather than vectorized. The entries are Instruction-Cost
1337   /// pairs.
1338   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1339 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1342   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1343 
1344   /// Records whether it is allowed to have the original scalar loop execute at
1345   /// least once. This may be needed as a fallback loop in case runtime
1346   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1348   /// or as a peel-loop to handle gaps in interleave-groups.
1349   /// Under optsize and when the trip count is very small we don't allow any
1350   /// iterations to execute in the scalar loop.
1351   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1352 
1353   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1354   bool FoldTailByMasking = false;
1355 
1356   /// A map holding scalar costs for different vectorization factors. The
1357   /// presence of a cost for an instruction in the mapping indicates that the
1358   /// instruction will be scalarized when vectorizing with the associated
1359   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1360   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1361 
1362   /// Holds the instructions known to be uniform after vectorization.
1363   /// The data is collected per VF.
1364   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1365 
1366   /// Holds the instructions known to be scalar after vectorization.
1367   /// The data is collected per VF.
1368   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1369 
1370   /// Holds the instructions (address computations) that are forced to be
1371   /// scalarized.
1372   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1373 
1374   /// Returns the expected difference in cost from scalarizing the expression
1375   /// feeding a predicated instruction \p PredInst. The instructions to
1376   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1377   /// non-negative return value implies the expression will be scalarized.
1378   /// Currently, only single-use chains are considered for scalarization.
1379   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1380                               unsigned VF);
1381 
1382   /// Collect the instructions that are uniform after vectorization. An
1383   /// instruction is uniform if we represent it with a single scalar value in
1384   /// the vectorized loop corresponding to each vector iteration. Examples of
1385   /// uniform instructions include pointer operands of consecutive or
1386   /// interleaved memory accesses. Note that although uniformity implies an
1387   /// instruction will be scalar, the reverse is not true. In general, a
1388   /// scalarized instruction will be represented by VF scalar values in the
1389   /// vectorized loop, each corresponding to an iteration of the original
1390   /// scalar loop.
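  /// For example (illustrative): the pointer operand of a consecutive load is
  /// uniform, because a single scalar GEP per unroll part feeds the wide load,
  /// whereas a scalarized division in the loop body is represented by VF
  /// scalar copies, one per lane.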
1391   void collectLoopUniforms(unsigned VF);
1392 
1393   /// Collect the instructions that are scalar after vectorization. An
1394   /// instruction is scalar if it is known to be uniform or will be scalarized
1395   /// during vectorization. Non-uniform scalarized instructions will be
1396   /// represented by VF values in the vectorized loop, each corresponding to an
1397   /// iteration of the original scalar loop.
1398   void collectLoopScalars(unsigned VF);
1399 
1400   /// Keeps cost model vectorization decision and cost for instructions.
1401   /// Right now it is used for memory instructions only.
1402   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1403                                 std::pair<InstWidening, unsigned>>;
1404 
1405   DecisionList WideningDecisions;
1406 
1407   /// Returns true if \p V is expected to be vectorized and it needs to be
1408   /// extracted.
1409   bool needsExtract(Value *V, unsigned VF) const {
1410     Instruction *I = dyn_cast<Instruction>(V);
1411     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1412       return false;
1413 
1414     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1416     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1417     // the scalars are collected. That should be a safe assumption in most
1418     // cases, because we check if the operands have vectorizable types
1419     // beforehand in LoopVectorizationLegality.
1420     return Scalars.find(VF) == Scalars.end() ||
1421            !isScalarAfterVectorization(I, VF);
1422   };
1423 
1424   /// Returns a range containing only operands needing to be extracted.
1425   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1426                                                    unsigned VF) {
1427     return SmallVector<Value *, 4>(make_filter_range(
1428         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1429   }
1430 
1431 public:
1432   /// The loop that we evaluate.
1433   Loop *TheLoop;
1434 
1435   /// Predicated scalar evolution analysis.
1436   PredicatedScalarEvolution &PSE;
1437 
1438   /// Loop Info analysis.
1439   LoopInfo *LI;
1440 
1441   /// Vectorization legality.
1442   LoopVectorizationLegality *Legal;
1443 
1444   /// Vector target information.
1445   const TargetTransformInfo &TTI;
1446 
1447   /// Target Library Info.
1448   const TargetLibraryInfo *TLI;
1449 
1450   /// Demanded bits analysis.
1451   DemandedBits *DB;
1452 
1453   /// Assumption cache.
1454   AssumptionCache *AC;
1455 
1456   /// Interface to emit optimization remarks.
1457   OptimizationRemarkEmitter *ORE;
1458 
1459   const Function *TheFunction;
1460 
1461   /// Loop Vectorize Hint.
1462   const LoopVectorizeHints *Hints;
1463 
  /// The interleaved access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1466   InterleavedAccessInfo &InterleaveInfo;
1467 
1468   /// Values to ignore in the cost model.
1469   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1470 
1471   /// Values to ignore in the cost model when VF > 1.
1472   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1473 };
1474 
1475 } // end namespace llvm
1476 
1477 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1478 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1480 // vector length information is not provided, vectorization is not considered
1481 // explicit. Interleave hints are not allowed either. These limitations will be
1482 // relaxed in the future.
// Please note that we are currently forced to abuse the 'clang loop
// vectorize' pragma semantics. This pragma provides *auto-vectorization hints*
1485 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1486 // provides *explicit vectorization hints* (LV can bypass legal checks and
1487 // assume that vectorization is legal). However, both hints are implemented
1488 // using the same metadata (llvm.loop.vectorize, processed by
1489 // LoopVectorizeHints). This will be fixed in the future when the native IR
1490 // representation for pragma 'omp simd' is introduced.
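//
// For example, an outer loop of the following form (an illustrative sketch)
// is considered explicitly vectorizable with VF = 4:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (i = 0; i < N; ++i)     // outer loop to be vectorized
//     for (j = 0; j < M; ++j)   // inner loop
//       A[i][j] += B[i][j];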
1491 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1492                                    OptimizationRemarkEmitter *ORE) {
1493   assert(!OuterLp->empty() && "This is not an outer loop");
1494   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1495 
1496   // Only outer loops with an explicit vectorization hint are supported.
1497   // Unannotated outer loops are ignored.
1498   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1499     return false;
1500 
1501   Function *Fn = OuterLp->getHeader()->getParent();
1502   if (!Hints.allowVectorization(Fn, OuterLp,
1503                                 true /*VectorizeOnlyWhenForced*/)) {
1504     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1505     return false;
1506   }
1507 
1508   if (Hints.getInterleave() > 1) {
1509     // TODO: Interleave support is future work.
1510     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1511                          "outer loops.\n");
1512     Hints.emitRemarkWithHints();
1513     return false;
1514   }
1515 
1516   return true;
1517 }
1518 
1519 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1520                                   OptimizationRemarkEmitter *ORE,
1521                                   SmallVectorImpl<Loop *> &V) {
1522   // Collect inner loops and outer loops without irreducible control flow. For
1523   // now, only collect outer loops that have explicit vectorization hints. If we
1524   // are stress testing the VPlan H-CFG construction, we collect the outermost
1525   // loop of every loop nest.
1526   if (L.empty() || VPlanBuildStressTest ||
1527       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1528     LoopBlocksRPO RPOT(&L);
1529     RPOT.perform(LI);
1530     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1531       V.push_back(&L);
1532       // TODO: Collect inner loops inside marked outer loops in case
1533       // vectorization fails for the outer loop. Do not invoke
1534       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1535       // already known to be reducible. We can use an inherited attribute for
1536       // that.
1537       return;
1538     }
1539   }
1540   for (Loop *InnerL : L)
1541     collectSupportedLoops(*InnerL, LI, ORE, V);
1542 }
1543 
1544 namespace {
1545 
1546 /// The LoopVectorize Pass.
1547 struct LoopVectorize : public FunctionPass {
1548   /// Pass identification, replacement for typeid
1549   static char ID;
1550 
1551   LoopVectorizePass Impl;
1552 
1553   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1554                          bool VectorizeOnlyWhenForced = false)
1555       : FunctionPass(ID) {
1556     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1557     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1558     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1559   }
1560 
1561   bool runOnFunction(Function &F) override {
1562     if (skipFunction(F))
1563       return false;
1564 
1565     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1566     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1567     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1568     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1569     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1570     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1571     auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
1572     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1573     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1574     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1575     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1576     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1577     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1578 
1579     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1580         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1581 
1582     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1583                         GetLAA, *ORE, PSI);
1584   }
1585 
1586   void getAnalysisUsage(AnalysisUsage &AU) const override {
1587     AU.addRequired<AssumptionCacheTracker>();
1588     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1589     AU.addRequired<DominatorTreeWrapperPass>();
1590     AU.addRequired<LoopInfoWrapperPass>();
1591     AU.addRequired<ScalarEvolutionWrapperPass>();
1592     AU.addRequired<TargetTransformInfoWrapperPass>();
1593     AU.addRequired<AAResultsWrapperPass>();
1594     AU.addRequired<LoopAccessLegacyAnalysis>();
1595     AU.addRequired<DemandedBitsWrapperPass>();
1596     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1597 
1598     // We currently do not preserve loopinfo/dominator analyses with outer loop
1599     // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
1601     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1602     if (!EnableVPlanNativePath) {
1603       AU.addPreserved<LoopInfoWrapperPass>();
1604       AU.addPreserved<DominatorTreeWrapperPass>();
1605     }
1606 
1607     AU.addPreserved<BasicAAWrapperPass>();
1608     AU.addPreserved<GlobalsAAWrapperPass>();
1609     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1610   }
1611 };
1612 
1613 } // end anonymous namespace
1614 
1615 //===----------------------------------------------------------------------===//
1616 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1617 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1618 //===----------------------------------------------------------------------===//
1619 
1620 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1621   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1624   Instruction *Instr = dyn_cast<Instruction>(V);
1625   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1626                      (!Instr ||
1627                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1628   // Place the code for broadcasting invariant variables in the new preheader.
1629   IRBuilder<>::InsertPointGuard Guard(Builder);
1630   if (SafeToHoist)
1631     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1632 
1633   // Broadcast the scalar into all locations in the vector.
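  // For VF = 4 this becomes an insertelement into lane 0 followed by a
  // zero-mask shufflevector, roughly (illustrative IR, names are placeholders):
  //   %b.ins     = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %broadcast = shufflevector <4 x i32> %b.ins, <4 x i32> undef,
  //                              <4 x i32> zeroinitializer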
1634   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1635 
1636   return Shuf;
1637 }
1638 
1639 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1640     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1641   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1642          "Expected either an induction phi-node or a truncate of it!");
1643   Value *Start = II.getStartValue();
1644 
1645   // Construct the initial value of the vector IV in the vector loop preheader
1646   auto CurrIP = Builder.saveIP();
1647   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1648   if (isa<TruncInst>(EntryVal)) {
1649     assert(Start->getType()->isIntegerTy() &&
1650            "Truncation requires an integer type");
1651     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1652     Step = Builder.CreateTrunc(Step, TruncType);
1653     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1654   }
1655   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1656   Value *SteppedStart =
1657       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1658 
1659   // We create vector phi nodes for both integer and floating-point induction
1660   // variables. Here, we determine the kind of arithmetic we will perform.
1661   Instruction::BinaryOps AddOp;
1662   Instruction::BinaryOps MulOp;
1663   if (Step->getType()->isIntegerTy()) {
1664     AddOp = Instruction::Add;
1665     MulOp = Instruction::Mul;
1666   } else {
1667     AddOp = II.getInductionOpcode();
1668     MulOp = Instruction::FMul;
1669   }
1670 
1671   // Multiply the vectorization factor by the step using integer or
1672   // floating-point arithmetic as appropriate.
1673   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1674   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1675 
1676   // Create a vector splat to use in the induction update.
1677   //
1678   // FIXME: If the step is non-constant, we create the vector splat with
1679   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1680   //        handle a constant vector splat.
1681   Value *SplatVF = isa<Constant>(Mul)
1682                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1683                        : Builder.CreateVectorSplat(VF, Mul);
1684   Builder.restoreIP(CurrIP);
1685 
1686   // We may need to add the step a number of times, depending on the unroll
1687   // factor. The last of those goes into the PHI.
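  // For example, with a start of 0, a step of 1, VF = 4 and UF = 2, this
  // produces roughly (illustrative IR, names are placeholders):
  //   %vec.ind      = phi [ <0, 1, 2, 3>, %preheader ], [ %vec.ind.next, %latch ]
  //   %step.add     = add %vec.ind, <4, 4, 4, 4>    ; used by unroll part 1
  //   %vec.ind.next = add %step.add, <4, 4, 4, 4>   ; feeds the phi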
1688   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1689                                     &*LoopVectorBody->getFirstInsertionPt());
1690   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1691   Instruction *LastInduction = VecInd;
1692   for (unsigned Part = 0; Part < UF; ++Part) {
1693     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1694 
1695     if (isa<TruncInst>(EntryVal))
1696       addMetadata(LastInduction, EntryVal);
1697     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1698 
1699     LastInduction = cast<Instruction>(addFastMathFlag(
1700         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1701     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1702   }
1703 
1704   // Move the last step to the end of the latch block. This ensures consistent
1705   // placement of all induction updates.
1706   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1707   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1708   auto *ICmp = cast<Instruction>(Br->getCondition());
1709   LastInduction->moveBefore(ICmp);
1710   LastInduction->setName("vec.ind.next");
1711 
1712   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1713   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1714 }
1715 
1716 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1717   return Cost->isScalarAfterVectorization(I, VF) ||
1718          Cost->isProfitableToScalarize(I, VF);
1719 }
1720 
1721 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1722   if (shouldScalarizeInstruction(IV))
1723     return true;
1724   auto isScalarInst = [&](User *U) -> bool {
1725     auto *I = cast<Instruction>(U);
1726     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1727   };
1728   return llvm::any_of(IV->users(), isScalarInst);
1729 }
1730 
1731 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1732     const InductionDescriptor &ID, const Instruction *EntryVal,
1733     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1734   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1735          "Expected either an induction phi-node or a truncate of it!");
1736 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't
  // have to do any recording in this case - that is done when the original IV
  // is processed.
1743   if (isa<TruncInst>(EntryVal))
1744     return;
1745 
1746   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1747   if (Casts.empty())
1748     return;
1749   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
1751   // induction update chain itself.
1752   Instruction *CastInst = *Casts.begin();
1753   if (Lane < UINT_MAX)
1754     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1755   else
1756     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1757 }
1758 
1759 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1760   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1761          "Primary induction variable must have an integer type");
1762 
1763   auto II = Legal->getInductionVars()->find(IV);
1764   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1765 
1766   auto ID = II->second;
1767   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1768 
1769   // The scalar value to broadcast. This will be derived from the canonical
1770   // induction variable.
1771   Value *ScalarIV = nullptr;
1772 
1773   // The value from the original loop to which we are mapping the new induction
1774   // variable.
1775   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1776 
1777   // True if we have vectorized the induction variable.
1778   auto VectorizedIV = false;
1779 
1780   // Determine if we want a scalar version of the induction variable. This is
1781   // true if the induction variable itself is not widened, or if it has at
1782   // least one user in the loop that is not widened.
1783   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1784 
1785   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1787   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1788          "Induction step should be loop invariant");
1789   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1790   Value *Step = nullptr;
1791   if (PSE.getSE()->isSCEVable(IV->getType())) {
1792     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1793     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1794                              LoopVectorPreHeader->getTerminator());
1795   } else {
1796     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1797   }
1798 
1799   // Try to create a new independent vector induction variable. If we can't
1800   // create the phi node, we will splat the scalar induction variable in each
1801   // loop iteration.
1802   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1803     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1804     VectorizedIV = true;
1805   }
1806 
1807   // If we haven't yet vectorized the induction variable, or if we will create
1808   // a scalar one, we need to define the scalar induction variable and step
1809   // values. If we were given a truncation type, truncate the canonical
1810   // induction variable and step. Otherwise, derive these values from the
1811   // induction descriptor.
1812   if (!VectorizedIV || NeedsScalarIV) {
1813     ScalarIV = Induction;
1814     if (IV != OldInduction) {
1815       ScalarIV = IV->getType()->isIntegerTy()
1816                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1817                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1818                                           IV->getType());
1819       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1820       ScalarIV->setName("offset.idx");
1821     }
1822     if (Trunc) {
1823       auto *TruncType = cast<IntegerType>(Trunc->getType());
1824       assert(Step->getType()->isIntegerTy() &&
1825              "Truncation requires an integer step");
1826       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1827       Step = Builder.CreateTrunc(Step, TruncType);
1828     }
1829   }
1830 
1831   // If we haven't yet vectorized the induction variable, splat the scalar
1832   // induction variable, and build the necessary step vectors.
1833   // TODO: Don't do it unless the vectorized IV is really required.
1834   if (!VectorizedIV) {
1835     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1836     for (unsigned Part = 0; Part < UF; ++Part) {
1837       Value *EntryPart =
1838           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1839       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1840       if (Trunc)
1841         addMetadata(EntryPart, Trunc);
1842       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1843     }
1844   }
1845 
1846   // If an induction variable is only used for counting loop iterations or
1847   // calculating addresses, it doesn't need to be widened. Create scalar steps
1848   // that can be used by instructions we will later scalarize. Note that the
1849   // addition of the scalar steps will not increase the number of instructions
1850   // in the loop in the common case prior to InstCombine. We will be trading
1851   // one vector extract for each scalar step.
1852   if (NeedsScalarIV)
1853     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1854 }
1855 
1856 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1857                                           Instruction::BinaryOps BinOp) {
1858   // Create and check the types.
1859   assert(Val->getType()->isVectorTy() && "Must be a vector");
1860   int VLen = Val->getType()->getVectorNumElements();
1861 
1862   Type *STy = Val->getType()->getScalarType();
1863   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1864          "Induction Step must be an integer or FP");
1865   assert(Step->getType() == STy && "Step has wrong type");
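  // E.g., for VLen = 4, StartIdx = 4 and a scalar Step of 2, the step vector
  // added to Val below is <4, 5, 6, 7> * 2 = <8, 10, 12, 14> (illustrative).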
1866 
1867   SmallVector<Constant *, 8> Indices;
1868 
1869   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
1871     for (int i = 0; i < VLen; ++i)
1872       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1873 
1874     // Add the consecutive indices to the vector value.
1875     Constant *Cv = ConstantVector::get(Indices);
1876     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1877     Step = Builder.CreateVectorSplat(VLen, Step);
1878     assert(Step->getType() == Val->getType() && "Invalid step vec");
1879     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1880     // which can be found from the original scalar operations.
1881     Step = Builder.CreateMul(Cv, Step);
1882     return Builder.CreateAdd(Val, Step, "induction");
1883   }
1884 
1885   // Floating point induction.
1886   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1887          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
1889   for (int i = 0; i < VLen; ++i)
1890     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1891 
1892   // Add the consecutive indices to the vector value.
1893   Constant *Cv = ConstantVector::get(Indices);
1894 
1895   Step = Builder.CreateVectorSplat(VLen, Step);
1896 
1897   // Floating point operations had to be 'fast' to enable the induction.
1898   FastMathFlags Flags;
1899   Flags.setFast();
1900 
1901   Value *MulOp = Builder.CreateFMul(Cv, Step);
1902   if (isa<Instruction>(MulOp))
    // We have to check because MulOp may have been folded to a constant.
1904     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1905 
1906   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1907   if (isa<Instruction>(BOp))
1908     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1909   return BOp;
1910 }
1911 
1912 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1913                                            Instruction *EntryVal,
1914                                            const InductionDescriptor &ID) {
1915   // We shouldn't have to build scalar steps if we aren't vectorizing.
1916   assert(VF > 1 && "VF should be greater than one");
1917 
  // Get the scalar value type and ensure it and the step have the same type.
1919   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1920   assert(ScalarIVTy == Step->getType() &&
1921          "Val and Step should have the same type");
1922 
1923   // We build scalar steps for both integer and floating-point induction
1924   // variables. Here, we determine the kind of arithmetic we will perform.
1925   Instruction::BinaryOps AddOp;
1926   Instruction::BinaryOps MulOp;
1927   if (ScalarIVTy->isIntegerTy()) {
1928     AddOp = Instruction::Add;
1929     MulOp = Instruction::Mul;
1930   } else {
1931     AddOp = ID.getInductionOpcode();
1932     MulOp = Instruction::FMul;
1933   }
1934 
1935   // Determine the number of scalars we need to generate for each unroll
1936   // iteration. If EntryVal is uniform, we only need to generate the first
1937   // lane. Otherwise, we generate all VF values.
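  // E.g., for VF = 4 and UF = 2, a non-uniform EntryVal gets eight scalar
  // steps of the form ScalarIV + (4 * Part + Lane) * Step, with Part in {0, 1}
  // and Lane in {0, ..., 3} (illustrative).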
1938   unsigned Lanes =
1939       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1940                                                                          : VF;
1941   // Compute the scalar steps and save the results in VectorLoopValueMap.
1942   for (unsigned Part = 0; Part < UF; ++Part) {
1943     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1944       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1945       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1946       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1947       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1948       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1949     }
1950   }
1951 }
1952 
1953 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1954   assert(V != Induction && "The new induction variable should not be used.");
1955   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1956   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1957 
1958   // If we have a stride that is replaced by one, do it here. Defer this for
1959   // the VPlan-native path until we start running Legal checks in that path.
1960   if (!EnableVPlanNativePath && Legal->hasStride(V))
1961     V = ConstantInt::get(V->getType(), 1);
1962 
1963   // If we have a vector mapped to this value, return it.
1964   if (VectorLoopValueMap.hasVectorValue(V, Part))
1965     return VectorLoopValueMap.getVectorValue(V, Part);
1966 
1967   // If the value has not been vectorized, check if it has been scalarized
1968   // instead. If it has been scalarized, and we actually need the value in
1969   // vector form, we will construct the vector values on demand.
1970   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1971     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1972 
1973     // If we've scalarized a value, that value should be an instruction.
1974     auto *I = cast<Instruction>(V);
1975 
1976     // If we aren't vectorizing, we can just copy the scalar map values over to
1977     // the vector map.
1978     if (VF == 1) {
1979       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1980       return ScalarValue;
1981     }
1982 
1983     // Get the last scalar instruction we generated for V and Part. If the value
1984     // is known to be uniform after vectorization, this corresponds to lane zero
1985     // of the Part unroll iteration. Otherwise, the last instruction is the one
1986     // we created for the last vector lane of the Part unroll iteration.
1987     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1988     auto *LastInst = cast<Instruction>(
1989         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1990 
1991     // Set the insert point after the last scalarized instruction. This ensures
1992     // the insertelement sequence will directly follow the scalar definitions.
1993     auto OldIP = Builder.saveIP();
1994     auto NewIP = std::next(BasicBlock::iterator(LastInst));
1995     Builder.SetInsertPoint(&*NewIP);
1996 
1997     // However, if we are vectorizing, we need to construct the vector values.
1998     // If the value is known to be uniform after vectorization, we can just
1999     // broadcast the scalar value corresponding to lane zero for each unroll
2000     // iteration. Otherwise, we construct the vector values using insertelement
2001     // instructions. Since the resulting vectors are stored in
2002     // VectorLoopValueMap, we will only generate the insertelements once.
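    // E.g., for a scalarized value with VF = 4, the packing of one unroll part
    // looks roughly like (illustrative IR, names are placeholders):
    //   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
    //   %v1 = insertelement <4 x i32> %v0,   i32 %s1, i32 1
    //   %v2 = insertelement <4 x i32> %v1,   i32 %s2, i32 2
    //   %v3 = insertelement <4 x i32> %v2,   i32 %s3, i32 3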
2003     Value *VectorValue = nullptr;
2004     if (Cost->isUniformAfterVectorization(I, VF)) {
2005       VectorValue = getBroadcastInstrs(ScalarValue);
2006       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2007     } else {
2008       // Initialize packing with insertelements to start from undef.
2009       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2010       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2011       for (unsigned Lane = 0; Lane < VF; ++Lane)
2012         packScalarIntoVectorValue(V, {Part, Lane});
2013       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2014     }
2015     Builder.restoreIP(OldIP);
2016     return VectorValue;
2017   }
2018 
2019   // If this scalar is unknown, assume that it is a constant or that it is
2020   // loop invariant. Broadcast V and save the value for future uses.
2021   Value *B = getBroadcastInstrs(V);
2022   VectorLoopValueMap.setVectorValue(V, Part, B);
2023   return B;
2024 }
2025 
2026 Value *
2027 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2028                                             const VPIteration &Instance) {
2029   // If the value is not an instruction contained in the loop, it should
2030   // already be scalar.
2031   if (OrigLoop->isLoopInvariant(V))
2032     return V;
2033 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2037 
2038   // If the value from the original loop has not been vectorized, it is
2039   // represented by UF x VF scalar values in the new loop. Return the requested
2040   // scalar value.
2041   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2042     return VectorLoopValueMap.getScalarValue(V, Instance);
2043 
2044   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2045   // for the given unroll part. If this entry is not a vector type (i.e., the
2046   // vectorization factor is one), there is no need to generate an
2047   // extractelement instruction.
2048   auto *U = getOrCreateVectorValue(V, Instance.Part);
2049   if (!U->getType()->isVectorTy()) {
2050     assert(VF == 1 && "Value not scalarized has non-vector type");
2051     return U;
2052   }
2053 
2054   // Otherwise, the value from the original loop has been vectorized and is
2055   // represented by UF vector values. Extract and return the requested scalar
2056   // value from the appropriate vector lane.
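  // E.g., for VF = 4, lane 2 of unroll part 1 becomes roughly
  // 'extractelement <4 x i32> %vec.part1, i32 2' (illustrative, names are
  // placeholders).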
2057   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2058 }
2059 
2060 void InnerLoopVectorizer::packScalarIntoVectorValue(
2061     Value *V, const VPIteration &Instance) {
2062   assert(V != Induction && "The new induction variable should not be used.");
2063   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2064   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2065 
2066   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2067   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2068   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2069                                             Builder.getInt32(Instance.Lane));
2070   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2071 }
2072 
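// Generate a shuffle that reverses the lanes of \p Vec; e.g., for VF = 4 the
// shuffle mask is <3, 2, 1, 0>.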
2073 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2074   assert(Vec->getType()->isVectorTy() && "Invalid type");
2075   SmallVector<Constant *, 8> ShuffleMask;
2076   for (unsigned i = 0; i < VF; ++i)
2077     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2078 
2079   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2080                                      ConstantVector::get(ShuffleMask),
2081                                      "reverse");
2082 }
2083 
2084 // Return whether we allow using masked interleave-groups (for dealing with
2085 // strided loads/stores that reside in predicated blocks, or for dealing
2086 // with gaps).
2087 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2088   // If an override option has been passed in for interleaved accesses, use it.
2089   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2090     return EnableMaskedInterleavedMemAccesses;
2091 
2092   return TTI.enableMaskedInterleavedAccessVectorization();
2093 }
2094 
2095 // Try to vectorize the interleave group that \p Instr belongs to.
2096 //
2097 // E.g. Translate following interleaved load group (factor = 3):
2098 //   for (i = 0; i < N; i+=3) {
2099 //     R = Pic[i];             // Member of index 0
2100 //     G = Pic[i+1];           // Member of index 1
2101 //     B = Pic[i+2];           // Member of index 2
2102 //     ... // do something to R, G, B
2103 //   }
2104 // To:
2105 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2106 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2107 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2108 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2109 //
2110 // Or translate following interleaved store group (factor = 3):
2111 //   for (i = 0; i < N; i+=3) {
2112 //     ... do something to R, G, B
2113 //     Pic[i]   = R;           // Member of index 0
2114 //     Pic[i+1] = G;           // Member of index 1
2115 //     Pic[i+2] = B;           // Member of index 2
2116 //   }
2117 // To:
2118 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2119 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2120 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2121 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2122 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2123 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2124                                                    VectorParts *BlockInMask) {
2125   const InterleaveGroup<Instruction> *Group =
2126       Cost->getInterleavedAccessGroup(Instr);
2127   assert(Group && "Fail to get an interleaved access group.");
2128 
2129   // Skip if current instruction is not the insert position.
2130   if (Instr != Group->getInsertPos())
2131     return;
2132 
2133   const DataLayout &DL = Instr->getModule()->getDataLayout();
2134   Value *Ptr = getLoadStorePointerOperand(Instr);
2135 
2136   // Prepare for the vector type of the interleaved load/store.
2137   Type *ScalarTy = getMemInstValueType(Instr);
2138   unsigned InterleaveFactor = Group->getFactor();
2139   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2140   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2141 
2142   // Prepare for the new pointers.
2143   setDebugLocFromInst(Builder, Ptr);
2144   SmallVector<Value *, 2> NewPtrs;
2145   unsigned Index = Group->getIndex(Instr);
2146 
2147   VectorParts Mask;
2148   bool IsMaskForCondRequired = BlockInMask;
2149   if (IsMaskForCondRequired) {
2150     Mask = *BlockInMask;
2151     // TODO: extend the masked interleaved-group support to reversed access.
2152     assert(!Group->isReverse() && "Reversed masked interleave-group "
2153                                   "not supported.");
2154   }
2155 
2156   // If the group is reverse, adjust the index to refer to the last vector lane
2157   // instead of the first. We adjust the index from the first vector lane,
2158   // rather than directly getting the pointer for lane VF - 1, because the
2159   // pointer operand of the interleaved access is supposed to be uniform. For
2160   // uniform instructions, we're only required to generate a value for the
2161   // first vector lane in each unroll iteration.
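  // E.g., for VF = 4 and an interleave factor of 2, the index of the lane-0
  // pointer is moved forward by (4 - 1) * 2 = 6 members (illustrative).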
2162   if (Group->isReverse())
2163     Index += (VF - 1) * Group->getFactor();
2164 
2165   bool InBounds = false;
2166   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2167     InBounds = gep->isInBounds();
2168 
2169   for (unsigned Part = 0; Part < UF; Part++) {
2170     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2171 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2183     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2184     if (InBounds)
2185       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2186 
2187     // Cast to the vector pointer type.
2188     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2189   }
2190 
2191   setDebugLocFromInst(Builder, Instr);
2192   Value *UndefVec = UndefValue::get(VecTy);
2193 
2194   Value *MaskForGaps = nullptr;
2195   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2196     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2197     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2198   }
2199 
2200   // Vectorize the interleaved load group.
2201   if (isa<LoadInst>(Instr)) {
2202     // For each unroll part, create a wide load for the group.
2203     SmallVector<Value *, 2> NewLoads;
2204     for (unsigned Part = 0; Part < UF; Part++) {
2205       Instruction *NewLoad;
2206       if (IsMaskForCondRequired || MaskForGaps) {
2207         assert(useMaskedInterleavedAccesses(*TTI) &&
2208                "masked interleaved groups are not allowed.");
2209         Value *GroupMask = MaskForGaps;
2210         if (IsMaskForCondRequired) {
2211           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2212           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2213           Value *ShuffledMask = Builder.CreateShuffleVector(
2214               Mask[Part], Undefs, RepMask, "interleaved.mask");
2215           GroupMask = MaskForGaps
2216                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2217                                                 MaskForGaps)
2218                           : ShuffledMask;
2219         }
2220         NewLoad =
2221             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2222                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2225         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2226                                             Group->getAlignment(), "wide.vec");
2227       Group->addMetadata(NewLoad);
2228       NewLoads.push_back(NewLoad);
2229     }
2230 
2231     // For each member in the group, shuffle out the appropriate data from the
2232     // wide loads.
2233     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2234       Instruction *Member = Group->getMember(I);
2235 
2236       // Skip the gaps in the group.
2237       if (!Member)
2238         continue;
2239 
2240       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2241       for (unsigned Part = 0; Part < UF; Part++) {
2242         Value *StridedVec = Builder.CreateShuffleVector(
2243             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2244 
        // If this member has a different type, cast the result to that type.
2246         if (Member->getType() != ScalarTy) {
2247           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2248           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2249         }
2250 
2251         if (Group->isReverse())
2252           StridedVec = reverseVector(StridedVec);
2253 
2254         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2255       }
2256     }
2257     return;
2258   }
2259 
  // The subvector type for the current instruction.
2261   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2262 
2263   // Vectorize the interleaved store group.
2264   for (unsigned Part = 0; Part < UF; Part++) {
2265     // Collect the stored vector from each member.
2266     SmallVector<Value *, 4> StoredVecs;
2267     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
2269       Instruction *Member = Group->getMember(i);
2270       assert(Member && "Fail to get a member from an interleaved store group");
2271 
2272       Value *StoredVec = getOrCreateVectorValue(
2273           cast<StoreInst>(Member)->getValueOperand(), Part);
2274       if (Group->isReverse())
2275         StoredVec = reverseVector(StoredVec);
2276 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2280         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2281 
2282       StoredVecs.push_back(StoredVec);
2283     }
2284 
2285     // Concatenate all vectors into a wide vector.
2286     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2287 
2288     // Interleave the elements in the wide vector.
2289     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2290     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2291                                               "interleaved.vec");
2292 
2293     Instruction *NewStoreInstr;
2294     if (IsMaskForCondRequired) {
2295       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2296       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2297       Value *ShuffledMask = Builder.CreateShuffleVector(
2298           Mask[Part], Undefs, RepMask, "interleaved.mask");
2299       NewStoreInstr = Builder.CreateMaskedStore(
2300           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
    } else
2303       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2304         Group->getAlignment());
2305 
2306     Group->addMetadata(NewStoreInstr);
2307   }
2308 }
2309 
2310 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2311                                                      VectorParts *BlockInMask) {
2312   // Attempt to issue a wide load.
2313   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2314   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2315 
2316   assert((LI || SI) && "Invalid Load/Store instruction");
2317 
2318   LoopVectorizationCostModel::InstWidening Decision =
2319       Cost->getWideningDecision(Instr, VF);
2320   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2321          "CM decision should be taken at this point");
2322   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2323     return vectorizeInterleaveGroup(Instr);
2324 
2325   Type *ScalarDataTy = getMemInstValueType(Instr);
2326   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2327   Value *Ptr = getLoadStorePointerOperand(Instr);
2328   unsigned Alignment = getLoadStoreAlignment(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2331   const DataLayout &DL = Instr->getModule()->getDataLayout();
2332   if (!Alignment)
2333     Alignment = DL.getABITypeAlignment(ScalarDataTy);
2334   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2335 
2336   // Determine if the pointer operand of the access is either consecutive or
2337   // reverse consecutive.
2338   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2339   bool ConsecutiveStride =
2340       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2341   bool CreateGatherScatter =
2342       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2343 
2344   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2345   // gather/scatter. Otherwise Decision should have been to Scalarize.
2346   assert((ConsecutiveStride || CreateGatherScatter) &&
2347          "The instruction should be scalarized");
2348 
2349   // Handle consecutive loads/stores.
2350   if (ConsecutiveStride)
2351     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2352 
2353   VectorParts Mask;
2354   bool isMaskRequired = BlockInMask;
2355   if (isMaskRequired)
2356     Mask = *BlockInMask;
2357 
2358   bool InBounds = false;
2359   if (auto *gep = dyn_cast<GetElementPtrInst>(
2360           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2361     InBounds = gep->isInBounds();
2362 
2363   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2364     // Calculate the pointer for the specific unroll-part.
2365     GetElementPtrInst *PartPtr = nullptr;
2366 
2367     if (Reverse) {
2368       // If the address is consecutive but reversed, then the
2369       // wide store needs to start at the last vector element.
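      // E.g., for Part = 1 and VF = 4, the two GEPs below offset Ptr by -4 and
      // then by 1 - 4 = -3 elements, so the wide access covers Ptr[-7 .. -4]
      // and its data is reversed afterwards (illustrative).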
2370       PartPtr = cast<GetElementPtrInst>(
2371           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2372       PartPtr->setIsInBounds(InBounds);
2373       PartPtr = cast<GetElementPtrInst>(
2374           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2375       PartPtr->setIsInBounds(InBounds);
2376       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2377         Mask[Part] = reverseVector(Mask[Part]);
2378     } else {
2379       PartPtr = cast<GetElementPtrInst>(
2380           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2381       PartPtr->setIsInBounds(InBounds);
2382     }
2383 
2384     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2385   };
2386 
2387   // Handle Stores:
2388   if (SI) {
2389     setDebugLocFromInst(Builder, SI);
2390 
2391     for (unsigned Part = 0; Part < UF; ++Part) {
2392       Instruction *NewSI = nullptr;
2393       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2394       if (CreateGatherScatter) {
2395         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2396         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2397         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2398                                             MaskPart);
2399       } else {
2400         if (Reverse) {
2401           // If we store to reverse consecutive memory locations, then we need
2402           // to reverse the order of elements in the stored value.
2403           StoredVal = reverseVector(StoredVal);
2404           // We don't want to update the value in the map as it might be used in
2405           // another expression. So don't call resetVectorValue(StoredVal).
2406         }
2407         auto *VecPtr = CreateVecPtr(Part, Ptr);
2408         if (isMaskRequired)
2409           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2410                                             Mask[Part]);
2411         else
2412           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2413       }
2414       addMetadata(NewSI, SI);
2415     }
2416     return;
2417   }
2418 
2419   // Handle loads.
2420   assert(LI && "Must have a load instruction");
2421   setDebugLocFromInst(Builder, LI);
2422   for (unsigned Part = 0; Part < UF; ++Part) {
2423     Value *NewLI;
2424     if (CreateGatherScatter) {
2425       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2426       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2427       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2428                                          nullptr, "wide.masked.gather");
2429       addMetadata(NewLI, LI);
2430     } else {
2431       auto *VecPtr = CreateVecPtr(Part, Ptr);
2432       if (isMaskRequired)
2433         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2434                                          UndefValue::get(DataTy),
2435                                          "wide.masked.load");
2436       else
2437         NewLI =
2438             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2439 
2440       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2441       addMetadata(NewLI, LI);
2442       if (Reverse)
2443         NewLI = reverseVector(NewLI);
2444     }
2445     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2446   }
2447 }
2448 
2449 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2450                                                const VPIteration &Instance,
2451                                                bool IfPredicateInstr) {
2452   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2453 
2454   setDebugLocFromInst(Builder, Instr);
2455 
2456   // Does this instruction return a value?
2457   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2458 
2459   Instruction *Cloned = Instr->clone();
2460   if (!IsVoidRetTy)
2461     Cloned->setName(Instr->getName() + ".cloned");
2462 
2463   // Replace the operands of the cloned instructions with their scalar
2464   // equivalents in the new loop.
2465   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2466     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2467     Cloned->setOperand(op, NewOp);
2468   }
2469   addNewMetadata(Cloned, Instr);
2470 
2471   // Place the cloned scalar in the new loop.
2472   Builder.Insert(Cloned);
2473 
2474   // Add the cloned scalar to the scalar map entry.
2475   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2476 
2477   // If we just cloned a new assumption, add it to the assumption cache.
2478   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2479     if (II->getIntrinsicID() == Intrinsic::assume)
2480       AC->registerAssumption(II);
2481 
2482   // End if-block.
2483   if (IfPredicateInstr)
2484     PredicatedInstructions.push_back(Cloned);
2485 }
2486 
2487 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2488                                                       Value *End, Value *Step,
2489                                                       Instruction *DL) {
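  // In shorthand IR, this creates (a sketch, for the common case where the new
  // loop is a single block and the header is also the latch):
  //
  //   %index      = phi [ %Start, %preheader ], [ %index.next, %latch ]
  //   ...
  //   %index.next = add %index, %Step
  //   %cmp        = icmp eq %index.next, %End
  //   br i1 %cmp, label %exit, label %header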
2490   BasicBlock *Header = L->getHeader();
2491   BasicBlock *Latch = L->getLoopLatch();
2492   // As we're just creating this loop, it's possible no latch exists
2493   // yet. If so, use the header as this will be a single block loop.
2494   if (!Latch)
2495     Latch = Header;
2496 
2497   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2498   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2499   setDebugLocFromInst(Builder, OldInst);
2500   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2501 
2502   Builder.SetInsertPoint(Latch->getTerminator());
2503   setDebugLocFromInst(Builder, OldInst);
2504 
2505   // Create i+1 and fill the PHINode.
2506   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2507   Induction->addIncoming(Start, L->getLoopPreheader());
2508   Induction->addIncoming(Next, Latch);
2509   // Create the compare.
2510   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2511   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2512 
2513   // Now we have two terminators. Remove the old one from the block.
2514   Latch->getTerminator()->eraseFromParent();
2515 
2516   return Induction;
2517 }
2518 
2519 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2520   if (TripCount)
2521     return TripCount;
2522 
2523   assert(L && "Create Trip Count for null loop.");
2524   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2525   // Find the loop boundaries.
2526   ScalarEvolution *SE = PSE.getSE();
2527   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2528   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2529          "Invalid loop count");
2530 
2531   Type *IdxTy = Legal->getWidestInductionType();
2532   assert(IdxTy && "No type for induction");
2533 
2534   // The exit count might have the type of i64 while the phi is i32. This can
2535   // happen if we have an induction variable that is sign extended before the
2536   // compare. The only way we can get a backedge-taken count in that case is if
2537   // the induction variable was signed and as such will not overflow, so the
2538   // truncation is legal.
2539   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2540       IdxTy->getPrimitiveSizeInBits())
2541     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2542   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2543 
2544   // Get the total trip count from the backedge-taken count by adding 1.
2545   const SCEV *ExitCount = SE->getAddExpr(
2546       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2547 
2548   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2549 
2550   // Expand the trip count and place the new instructions in the preheader.
2551   // Notice that the pre-header does not change, only the loop body.
2552   SCEVExpander Exp(*SE, DL, "induction");
2553 
2554   // Count holds the overall loop count (N).
2555   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2556                                 L->getLoopPreheader()->getTerminator());
2557 
2558   if (TripCount->getType()->isPointerTy())
2559     TripCount =
2560         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2561                                     L->getLoopPreheader()->getTerminator());
2562 
2563   return TripCount;
2564 }
2565 
2566 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2567   if (VectorTripCount)
2568     return VectorTripCount;
2569 
2570   Value *TC = getOrCreateTripCount(L);
2571   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2572 
2573   Type *Ty = TC->getType();
2574   Constant *Step = ConstantInt::get(Ty, VF * UF);
2575 
2576   // If the tail is to be folded by masking, round the number of iterations N
2577   // up to a multiple of Step instead of rounding down. This is done by first
2578   // adding Step-1 and then rounding down. Note that it's ok if this addition
2579   // overflows: the vector induction variable will eventually wrap to zero given
2580   // that it starts at zero and its Step is a power of two; the loop will then
2581   // exit, with the last early-exit vector comparison also producing all-true.
2582   if (Cost->foldTailByMasking()) {
2583     assert(isPowerOf2_32(VF * UF) &&
2584            "VF*UF must be a power of 2 when folding tail by masking");
2585     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2586   }
2587 
2588   // Now we need to generate the expression for the part of the loop that the
2589   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2590   // iterations are not required for correctness, or N - Step, otherwise. Step
2591   // is equal to the vectorization factor (number of SIMD elements) times the
2592   // unroll factor (number of SIMD instructions).
2593   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2594 
2595   // If there is a non-reversed interleaved group that may speculatively access
2596   // memory out-of-bounds, we need to ensure that there will be at least one
2597   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2598   // the trip count, we set the remainder to be equal to the step. If the step
2599   // does not evenly divide the trip count, no adjustment is necessary since
2600   // there will already be scalar iterations. Note that the minimum iterations
2601   // check ensures that N >= Step.
2602   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2603     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2604     R = Builder.CreateSelect(IsZero, Step, R);
2605   }
2606 
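  // E.g. (a sketch): with a trip count of 21 and Step = VF * UF = 8, R is 5 and
  // the vector trip count below is 16. If a scalar epilogue is required and the
  // trip count is 24, R is bumped from 0 to 8, so the epilogue still runs.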
2607   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2608 
2609   return VectorTripCount;
2610 }
2611 
2612 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2613                                                    const DataLayout &DL) {
2614   // Verify that V is a vector type with the same number of elements as DstVTy.
2615   unsigned VF = DstVTy->getNumElements();
2616   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2617   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2618   Type *SrcElemTy = SrcVecTy->getElementType();
2619   Type *DstElemTy = DstVTy->getElementType();
2620   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2621          "Vector elements must have same size");
2622 
2623   // Do a direct cast if element types are castable.
2624   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2625     return Builder.CreateBitOrPointerCast(V, DstVTy);
2626   }
2627   // V cannot be directly cast to the desired vector type. This may happen when
2628   // V is a floating point vector but DstVTy is a vector of pointers, or
2629   // vice-versa. Handle this with a two-step bitcast through an intermediate
2630   // integer type, i.e. Ptr <-> Int <-> Float.
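  // E.g. (a sketch, assuming 64-bit pointers):
  //   <4 x double> <-> <4 x i64> <-> <4 x i8*>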
2631   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2632          "Only one type should be a pointer type");
2633   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2634          "Only one type should be a floating point type");
2635   Type *IntTy =
2636       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2637   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2638   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2639   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2640 }
2641 
2642 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2643                                                          BasicBlock *Bypass) {
2644   Value *Count = getOrCreateTripCount(L);
2645   BasicBlock *BB = L->getLoopPreheader();
2646   IRBuilder<> Builder(BB->getTerminator());
2647 
2648   // Generate code to check if the loop's trip count is less than VF * UF, or
2649   // equal to it in case a scalar epilogue is required; this implies that the
2650   // vector trip count is zero. This check also covers the case where adding one
2651   // to the backedge-taken count overflowed leading to an incorrect trip count
2652   // of zero. In this case we will also jump to the scalar loop.
2653   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2654                                           : ICmpInst::ICMP_ULT;
2655 
2656   // If tail is to be folded, vector loop takes care of all iterations.
2657   Value *CheckMinIters = Builder.getFalse();
2658   if (!Cost->foldTailByMasking())
2659     CheckMinIters = Builder.CreateICmp(
2660         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2661         "min.iters.check");
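  // E.g. (a sketch): with VF = 4 and UF = 2 this compares the trip count
  // against 8, using ult (or ule when a scalar epilogue is required).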
2662 
2663   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2664   // Update dominator tree immediately if the generated block is a
2665   // LoopBypassBlock because SCEV expansions to generate loop bypass
2666   // checks may query it before the current function is finished.
2667   DT->addNewBlock(NewBB, BB);
2668   if (L->getParentLoop())
2669     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2670   ReplaceInstWithInst(BB->getTerminator(),
2671                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2672   LoopBypassBlocks.push_back(BB);
2673 }
2674 
2675 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2676   BasicBlock *BB = L->getLoopPreheader();
2677 
2678   // Generate the code to check the SCEV assumptions that we made.
2679   // We want the new basic block to start at the first instruction in a
2680   // sequence of instructions that form a check.
2681   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2682                    "scev.check");
2683   Value *SCEVCheck =
2684       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2685 
2686   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2687     if (C->isZero())
2688       return;
2689 
2690   assert(!Cost->foldTailByMasking() &&
2691          "Cannot SCEV check stride or overflow when folding tail");
2692   // Create a new block containing the stride check.
2693   BB->setName("vector.scevcheck");
2694   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2695   // Update dominator tree immediately if the generated block is a
2696   // LoopBypassBlock because SCEV expansions to generate loop bypass
2697   // checks may query it before the current function is finished.
2698   DT->addNewBlock(NewBB, BB);
2699   if (L->getParentLoop())
2700     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2701   ReplaceInstWithInst(BB->getTerminator(),
2702                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2703   LoopBypassBlocks.push_back(BB);
2704   AddedSafetyChecks = true;
2705 }
2706 
2707 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2708   // VPlan-native path does not do any analysis for runtime checks currently.
2709   if (EnableVPlanNativePath)
2710     return;
2711 
2712   BasicBlock *BB = L->getLoopPreheader();
2713 
2714   // Generate the code that checks at runtime whether the arrays overlap. We
2715   // put the checks into a separate block to make the more common case of few
2716   // elements faster.
2717   Instruction *FirstCheckInst;
2718   Instruction *MemRuntimeCheck;
2719   std::tie(FirstCheckInst, MemRuntimeCheck) =
2720       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2721   if (!MemRuntimeCheck)
2722     return;
2723 
2724   assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
2725   // Create a new block containing the memory check.
2726   BB->setName("vector.memcheck");
2727   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2728   // Update dominator tree immediately if the generated block is a
2729   // LoopBypassBlock because SCEV expansions to generate loop bypass
2730   // checks may query it before the current function is finished.
2731   DT->addNewBlock(NewBB, BB);
2732   if (L->getParentLoop())
2733     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2734   ReplaceInstWithInst(BB->getTerminator(),
2735                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2736   LoopBypassBlocks.push_back(BB);
2737   AddedSafetyChecks = true;
2738 
2739   // We currently don't use LoopVersioning for the actual loop cloning but we
2740   // still use it to add the noalias metadata.
2741   LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2742                                            PSE.getSE());
2743   LVer->prepareNoAliasMetadata();
2744 }
2745 
2746 Value *InnerLoopVectorizer::emitTransformedIndex(
2747     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2748     const InductionDescriptor &ID) const {
2749 
2750   SCEVExpander Exp(*SE, DL, "induction");
2751   auto Step = ID.getStep();
2752   auto StartValue = ID.getStartValue();
2753   assert(Index->getType() == Step->getType() &&
2754          "Index type does not match StepValue type");
2755 
2756   // Note: the IR at this point is broken. We cannot use SE to create any new
2757   // SCEV and then expand it, hoping that SCEV's simplification will give us
2758   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2759   // lead to various SCEV crashes. So all we can do is use the builder and rely
2760   // on InstCombine for future simplifications. Here we handle only some trivial
2761   // cases.
2762   auto CreateAdd = [&B](Value *X, Value *Y) {
2763     assert(X->getType() == Y->getType() && "Types don't match!");
2764     if (auto *CX = dyn_cast<ConstantInt>(X))
2765       if (CX->isZero())
2766         return Y;
2767     if (auto *CY = dyn_cast<ConstantInt>(Y))
2768       if (CY->isZero())
2769         return X;
2770     return B.CreateAdd(X, Y);
2771   };
2772 
2773   auto CreateMul = [&B](Value *X, Value *Y) {
2774     assert(X->getType() == Y->getType() && "Types don't match!");
2775     if (auto *CX = dyn_cast<ConstantInt>(X))
2776       if (CX->isOne())
2777         return Y;
2778     if (auto *CY = dyn_cast<ConstantInt>(Y))
2779       if (CY->isOne())
2780         return X;
2781     return B.CreateMul(X, Y);
2782   };
2783 
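  // Each case below conceptually computes StartValue + Index * Step (or
  // StartValue - Index * Step for an FSub induction): as an integer mul/add, as
  // a GEP for pointer inductions, or as an FMul feeding the original FAdd/FSub
  // for floating-point inductions.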
2784   switch (ID.getKind()) {
2785   case InductionDescriptor::IK_IntInduction: {
2786     assert(Index->getType() == StartValue->getType() &&
2787            "Index type does not match StartValue type");
2788     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2789       return B.CreateSub(StartValue, Index);
2790     auto *Offset = CreateMul(
2791         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2792     return CreateAdd(StartValue, Offset);
2793   }
2794   case InductionDescriptor::IK_PtrInduction: {
2795     assert(isa<SCEVConstant>(Step) &&
2796            "Expected constant step for pointer induction");
2797     return B.CreateGEP(
2798         StartValue->getType()->getPointerElementType(), StartValue,
2799         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2800                                            &*B.GetInsertPoint())));
2801   }
2802   case InductionDescriptor::IK_FpInduction: {
2803     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2804     auto InductionBinOp = ID.getInductionBinOp();
2805     assert(InductionBinOp &&
2806            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2807             InductionBinOp->getOpcode() == Instruction::FSub) &&
2808            "Original bin op should be defined for FP induction");
2809 
2810     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2811 
2812     // Floating point operations had to be 'fast' to enable the induction.
2813     FastMathFlags Flags;
2814     Flags.setFast();
2815 
2816     Value *MulExp = B.CreateFMul(StepValue, Index);
2817     if (isa<Instruction>(MulExp))
2818       // We have to check because MulExp may be a constant.
2819       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2820 
2821     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2822                                "induction");
2823     if (isa<Instruction>(BOp))
2824       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2825 
2826     return BOp;
2827   }
2828   case InductionDescriptor::IK_NoInduction:
2829     return nullptr;
2830   }
2831   llvm_unreachable("invalid enum");
2832 }
2833 
2834 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2835   /*
2836    In this function we generate a new loop. The new loop will contain
2837    the vectorized instructions while the old loop will continue to run the
2838    scalar remainder.
2839 
2840        [ ] <-- loop iteration number check.
2841     /   |
2842    /    v
2843   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2844   |  /  |
2845   | /   v
2846   ||   [ ]     <-- vector pre header.
2847   |/    |
2848   |     v
2849   |    [  ] \
2850   |    [  ]_|   <-- vector loop.
2851   |     |
2852   |     v
2853   |   -[ ]   <--- middle-block.
2854   |  /  |
2855   | /   v
2856   -|- >[ ]     <--- new preheader.
2857    |    |
2858    |    v
2859    |   [ ] \
2860    |   [ ]_|   <-- old scalar loop to handle remainder.
2861     \   |
2862      \  v
2863       >[ ]     <-- exit block.
2864    ...
2865    */
2866 
2867   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2868   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2869   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2870   MDNode *OrigLoopID = OrigLoop->getLoopID();
2871   assert(VectorPH && "Invalid loop structure");
2872   assert(ExitBlock && "Must have an exit block");
2873 
2874   // Some loops have a single integer induction variable, while other loops
2875   // don't. One example is C++ iterators, which often have multiple pointer
2876   // induction variables. In the code below we also support the case where we
2877   // don't have a single induction variable.
2878   //
2879   // We try as hard as possible to obtain an induction variable from the
2880   // original loop. However, if we don't find one that:
2881   //   - is an integer
2882   //   - counts from zero, stepping by one
2883   //   - is the size of the widest induction variable type
2884   // then we create a new one.
2885   OldInduction = Legal->getPrimaryInduction();
2886   Type *IdxTy = Legal->getWidestInductionType();
2887 
2888   // Split the single block loop into the two loop structure described above.
2889   BasicBlock *VecBody =
2890       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2891   BasicBlock *MiddleBlock =
2892       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2893   BasicBlock *ScalarPH =
2894       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2895 
2896   // Create and register the new vector loop.
2897   Loop *Lp = LI->AllocateLoop();
2898   Loop *ParentLoop = OrigLoop->getParentLoop();
2899 
2900   // Insert the new loop into the loop nest and register the new basic blocks
2901   // before calling any utilities such as SCEV that require valid LoopInfo.
2902   if (ParentLoop) {
2903     ParentLoop->addChildLoop(Lp);
2904     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2905     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2906   } else {
2907     LI->addTopLevelLoop(Lp);
2908   }
2909   Lp->addBasicBlockToLoop(VecBody, *LI);
2910 
2911   // Find the loop boundaries.
2912   Value *Count = getOrCreateTripCount(Lp);
2913 
2914   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2915 
2916   // Now, compare the new count to zero. If it is zero skip the vector loop and
2917   // jump to the scalar loop. This check also covers the case where the
2918   // backedge-taken count is uint##_max: adding one to it will overflow leading
2919   // to an incorrect trip count of zero. In this (rare) case we will also jump
2920   // to the scalar loop.
2921   emitMinimumIterationCountCheck(Lp, ScalarPH);
2922 
2923   // Generate the code to check any assumptions that we've made for SCEV
2924   // expressions.
2925   emitSCEVChecks(Lp, ScalarPH);
2926 
2927   // Generate the code that checks at runtime whether the arrays overlap. We
2928   // put the checks into a separate block to make the more common case of few
2929   // elements faster.
2930   emitMemRuntimeChecks(Lp, ScalarPH);
2931 
2932   // Generate the induction variable.
2933   // The loop step is equal to the vectorization factor (num of SIMD elements)
2934   // times the unroll factor (num of SIMD instructions).
2935   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2936   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2937   Induction =
2938       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2939                               getDebugLocFromInstOrOperands(OldInduction));
2940 
2941   // We are going to resume the execution of the scalar loop.
2942   // Go over all of the induction variables that we found and fix the
2943   // PHIs that are left in the scalar version of the loop.
2944   // The starting values of PHI nodes depend on the counter of the last
2945   // iteration in the vectorized loop.
2946   // If we come from a bypass edge then we need to start from the original
2947   // start value.
2948 
2949   // This variable saves the new starting index for the scalar loop. It is used
2950   // to test if there are any tail iterations left once the vector loop has
2951   // completed.
2952   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2953   for (auto &InductionEntry : *List) {
2954     PHINode *OrigPhi = InductionEntry.first;
2955     InductionDescriptor II = InductionEntry.second;
2956 
2957     // Create phi nodes to merge from the backedge-taken check block.
2958     PHINode *BCResumeVal = PHINode::Create(
2959         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2960     // Copy original phi DL over to the new one.
2961     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2962     Value *&EndValue = IVEndValues[OrigPhi];
2963     if (OrigPhi == OldInduction) {
2964       // We know what the end value is.
2965       EndValue = CountRoundDown;
2966     } else {
2967       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2968       Type *StepType = II.getStep()->getType();
2969       Instruction::CastOps CastOp =
2970         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2971       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2972       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2973       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2974       EndValue->setName("ind.end");
2975     }
2976 
2977     // The new PHI merges the original incoming value, in case of a bypass,
2978     // or the value at the end of the vectorized loop.
2979     BCResumeVal->addIncoming(EndValue, MiddleBlock);
2980 
2981     // Fix the scalar body counter (PHI node).
2982     // The old induction's phi node in the scalar body needs the truncated
2983     // value.
2984     for (BasicBlock *BB : LoopBypassBlocks)
2985       BCResumeVal->addIncoming(II.getStartValue(), BB);
2986     OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
2987   }
2988 
2989   // We need the OrigLoop (scalar loop part) latch terminator to help
2990   // produce correct debug info for the middle block BB instructions.
2991   // The legality check stage guarantees that the loop will have a single
2992   // latch.
2993   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
2994          "Scalar loop latch terminator isn't a branch");
2995   BranchInst *ScalarLatchBr =
2996       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
2997 
2998   // Add a check in the middle block to see if we have completed
2999   // all of the iterations in the first vector loop.
3000   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3001   // If tail is to be folded, we know we don't need to run the remainder.
3002   Value *CmpN = Builder.getTrue();
3003   if (!Cost->foldTailByMasking()) {
3004     CmpN =
3005         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3006                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3007 
3008     // Here we use the same DebugLoc as the scalar loop latch branch instead
3009     // of the corresponding compare because they may have ended up with
3010     // different line numbers and we want to avoid awkward line stepping while
3011     // debugging. E.g. if the compare got a line number inside the loop.
3012     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3013   }
3014 
3015   BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3016   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3017   ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3018 
3019   // Get ready to start creating new instructions into the vectorized body.
3020   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3021 
3022   // Save the state.
3023   LoopVectorPreHeader = Lp->getLoopPreheader();
3024   LoopScalarPreHeader = ScalarPH;
3025   LoopMiddleBlock = MiddleBlock;
3026   LoopExitBlock = ExitBlock;
3027   LoopVectorBody = VecBody;
3028   LoopScalarBody = OldBasicBlock;
3029 
3030   Optional<MDNode *> VectorizedLoopID =
3031       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3032                                       LLVMLoopVectorizeFollowupVectorized});
3033   if (VectorizedLoopID.hasValue()) {
3034     Lp->setLoopID(VectorizedLoopID.getValue());
3035 
3036     // Do not setAlreadyVectorized if loop attributes have been defined
3037     // explicitly.
3038     return LoopVectorPreHeader;
3039   }
3040 
3041   // Keep all loop hints from the original loop on the vector loop (we'll
3042   // replace the vectorizer-specific hints below).
3043   if (MDNode *LID = OrigLoop->getLoopID())
3044     Lp->setLoopID(LID);
3045 
3046   LoopVectorizeHints Hints(Lp, true, *ORE);
3047   Hints.setAlreadyVectorized();
3048 
3049   return LoopVectorPreHeader;
3050 }
3051 
3052 // Fix up external users of the induction variable. At this point, we are
3053 // in LCSSA form, with all external PHIs that use the IV having one input value,
3054 // coming from the remainder loop. We need those PHIs to also have a correct
3055 // value for the IV when arriving directly from the middle block.
3056 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3057                                        const InductionDescriptor &II,
3058                                        Value *CountRoundDown, Value *EndValue,
3059                                        BasicBlock *MiddleBlock) {
3060   // There are two kinds of external IV usages - those that use the value
3061   // computed in the last iteration (the PHI) and those that use the penultimate
3062   // value (the value that feeds into the phi from the loop latch).
3063   // We allow both, but they, obviously, have different values.
3064 
3065   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3066 
3067   DenseMap<Value *, Value *> MissingVals;
3068 
3069   // An external user of the last iteration's value should see the value that
3070   // the remainder loop uses to initialize its own IV.
3071   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3072   for (User *U : PostInc->users()) {
3073     Instruction *UI = cast<Instruction>(U);
3074     if (!OrigLoop->contains(UI)) {
3075       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3076       MissingVals[UI] = EndValue;
3077     }
3078   }
3079 
3080   // An external user of the penultimate value needs to see EndValue - Step.
3081   // The simplest way to get this is to recompute it from the constituent SCEVs,
3082   // that is Start + (Step * (CRD - 1)).
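  // E.g. (a sketch): for an IV starting at 0 with step 1 and CountRoundDown of
  // 8, such a user sees 0 + 1 * (8 - 1) = 7.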
3083   for (User *U : OrigPhi->users()) {
3084     auto *UI = cast<Instruction>(U);
3085     if (!OrigLoop->contains(UI)) {
3086       const DataLayout &DL =
3087           OrigLoop->getHeader()->getModule()->getDataLayout();
3088       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3089 
3090       IRBuilder<> B(MiddleBlock->getTerminator());
3091       Value *CountMinusOne = B.CreateSub(
3092           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3093       Value *CMO =
3094           !II.getStep()->getType()->isIntegerTy()
3095               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3096                              II.getStep()->getType())
3097               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3098       CMO->setName("cast.cmo");
3099       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3100       Escape->setName("ind.escape");
3101       MissingVals[UI] = Escape;
3102     }
3103   }
3104 
3105   for (auto &I : MissingVals) {
3106     PHINode *PHI = cast<PHINode>(I.first);
3107     // One corner case we have to handle is two IVs "chasing" each other,
3108     // that is %IV2 = phi [...], [ %IV1, %latch ]
3109     // In this case, if IV1 has an external use, we need to avoid adding both
3110     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3111     // don't already have an incoming value for the middle block.
3112     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3113       PHI->addIncoming(I.second, MiddleBlock);
3114   }
3115 }
3116 
3117 namespace {
3118 
3119 struct CSEDenseMapInfo {
3120   static bool canHandle(const Instruction *I) {
3121     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3122            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3123   }
3124 
3125   static inline Instruction *getEmptyKey() {
3126     return DenseMapInfo<Instruction *>::getEmptyKey();
3127   }
3128 
3129   static inline Instruction *getTombstoneKey() {
3130     return DenseMapInfo<Instruction *>::getTombstoneKey();
3131   }
3132 
3133   static unsigned getHashValue(const Instruction *I) {
3134     assert(canHandle(I) && "Unknown instruction!");
3135     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3136                                                            I->value_op_end()));
3137   }
3138 
3139   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3140     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3141         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3142       return LHS == RHS;
3143     return LHS->isIdenticalTo(RHS);
3144   }
3145 };
3146 
3147 } // end anonymous namespace
3148 
3149 /// Perform CSE of induction variable instructions.
3150 static void cse(BasicBlock *BB) {
3151   // Perform simple cse.
3152   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3153   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3154     Instruction *In = &*I++;
3155 
3156     if (!CSEDenseMapInfo::canHandle(In))
3157       continue;
3158 
3159     // Check if we can replace this instruction with any of the
3160     // visited instructions.
3161     if (Instruction *V = CSEMap.lookup(In)) {
3162       In->replaceAllUsesWith(V);
3163       In->eraseFromParent();
3164       continue;
3165     }
3166 
3167     CSEMap[In] = In;
3168   }
3169 }
3170 
3171 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3172                                                        unsigned VF,
3173                                                        bool &NeedToScalarize) {
3174   Function *F = CI->getCalledFunction();
3175   StringRef FnName = CI->getCalledFunction()->getName();
3176   Type *ScalarRetTy = CI->getType();
3177   SmallVector<Type *, 4> Tys, ScalarTys;
3178   for (auto &ArgOp : CI->arg_operands())
3179     ScalarTys.push_back(ArgOp->getType());
3180 
3181   // Estimate cost of scalarized vector call. The source operands are assumed
3182   // to be vectors, so we need to extract individual elements from there,
3183   // execute VF scalar calls, and then gather the result into the vector return
3184   // value.
3185   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3186   if (VF == 1)
3187     return ScalarCallCost;
3188 
3189   // Compute corresponding vector type for return value and arguments.
3190   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3191   for (Type *ScalarTy : ScalarTys)
3192     Tys.push_back(ToVectorTy(ScalarTy, VF));
3193 
3194   // Compute costs of unpacking argument values for the scalar calls and
3195   // packing the return values to a vector.
3196   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3197 
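  // E.g. (a sketch): for VF = 4, the estimate below is 4 * ScalarCallCost plus
  // the cost of extracting the scalar arguments and inserting the scalar
  // results back into a vector.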
3198   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3199 
3200   // If we can't emit a vector call for this function, then the currently found
3201   // cost is the cost we need to return.
3202   NeedToScalarize = true;
3203   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3204     return Cost;
3205 
3206   // If the corresponding vector cost is cheaper, return its cost.
3207   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3208   if (VectorCallCost < Cost) {
3209     NeedToScalarize = false;
3210     return VectorCallCost;
3211   }
3212   return Cost;
3213 }
3214 
3215 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3216                                                             unsigned VF) {
3217   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3218   assert(ID && "Expected intrinsic call!");
3219 
3220   FastMathFlags FMF;
3221   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3222     FMF = FPMO->getFastMathFlags();
3223 
3224   SmallVector<Value *, 4> Operands(CI->arg_operands());
3225   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3226 }
3227 
3228 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3229   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3230   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3231   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3232 }
3233 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3234   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3235   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3236   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3237 }
3238 
3239 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3240   // For every instruction `I` in MinBWs, truncate the operands, create a
3241   // truncated version of `I` and reextend its result. InstCombine runs
3242   // later and will remove any ext/trunc pairs.
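  //
  // E.g. (a sketch): a <4 x i32> add whose result is known to need only 8 bits
  // becomes
  //   %a8 = trunc <4 x i32> %a to <4 x i8>
  //   %b8 = trunc <4 x i32> %b to <4 x i8>
  //   %r8 = add <4 x i8> %a8, %b8
  //   %r  = zext <4 x i8> %r8 to <4 x i32>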
3243   SmallPtrSet<Value *, 4> Erased;
3244   for (const auto &KV : Cost->getMinimalBitwidths()) {
3245     // If the value wasn't vectorized, we must maintain the original scalar
3246     // type. The absence of the value from VectorLoopValueMap indicates that it
3247     // wasn't vectorized.
3248     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3249       continue;
3250     for (unsigned Part = 0; Part < UF; ++Part) {
3251       Value *I = getOrCreateVectorValue(KV.first, Part);
3252       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3253           !isa<Instruction>(I))
3254         continue;
3255       Type *OriginalTy = I->getType();
3256       Type *ScalarTruncatedTy =
3257           IntegerType::get(OriginalTy->getContext(), KV.second);
3258       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3259                                           OriginalTy->getVectorNumElements());
3260       if (TruncatedTy == OriginalTy)
3261         continue;
3262 
3263       IRBuilder<> B(cast<Instruction>(I));
3264       auto ShrinkOperand = [&](Value *V) -> Value * {
3265         if (auto *ZI = dyn_cast<ZExtInst>(V))
3266           if (ZI->getSrcTy() == TruncatedTy)
3267             return ZI->getOperand(0);
3268         return B.CreateZExtOrTrunc(V, TruncatedTy);
3269       };
3270 
3271       // The actual instruction modification depends on the instruction type,
3272       // unfortunately.
3273       Value *NewI = nullptr;
3274       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3275         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3276                              ShrinkOperand(BO->getOperand(1)));
3277 
3278         // Any wrapping introduced by shrinking this operation shouldn't be
3279         // considered undefined behavior. So, we can't unconditionally copy
3280         // arithmetic wrapping flags to NewI.
3281         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3282       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3283         NewI =
3284             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3285                          ShrinkOperand(CI->getOperand(1)));
3286       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3287         NewI = B.CreateSelect(SI->getCondition(),
3288                               ShrinkOperand(SI->getTrueValue()),
3289                               ShrinkOperand(SI->getFalseValue()));
3290       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3291         switch (CI->getOpcode()) {
3292         default:
3293           llvm_unreachable("Unhandled cast!");
3294         case Instruction::Trunc:
3295           NewI = ShrinkOperand(CI->getOperand(0));
3296           break;
3297         case Instruction::SExt:
3298           NewI = B.CreateSExtOrTrunc(
3299               CI->getOperand(0),
3300               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3301           break;
3302         case Instruction::ZExt:
3303           NewI = B.CreateZExtOrTrunc(
3304               CI->getOperand(0),
3305               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3306           break;
3307         }
3308       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3309         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3310         auto *O0 = B.CreateZExtOrTrunc(
3311             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3312         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3313         auto *O1 = B.CreateZExtOrTrunc(
3314             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3315 
3316         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3317       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3318         // Don't do anything with the operands, just extend the result.
3319         continue;
3320       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3321         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3322         auto *O0 = B.CreateZExtOrTrunc(
3323             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3324         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3325         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3326       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3327         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3328         auto *O0 = B.CreateZExtOrTrunc(
3329             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3330         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3331       } else {
3332         // If we don't know what to do, be conservative and don't do anything.
3333         continue;
3334       }
3335 
3336       // Lastly, extend the result.
3337       NewI->takeName(cast<Instruction>(I));
3338       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3339       I->replaceAllUsesWith(Res);
3340       cast<Instruction>(I)->eraseFromParent();
3341       Erased.insert(I);
3342       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3343     }
3344   }
3345 
3346   // We'll have created a bunch of ZExts that are now dead. Clean them up.
3347   for (const auto &KV : Cost->getMinimalBitwidths()) {
3348     // If the value wasn't vectorized, we must maintain the original scalar
3349     // type. The absence of the value from VectorLoopValueMap indicates that it
3350     // wasn't vectorized.
3351     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3352       continue;
3353     for (unsigned Part = 0; Part < UF; ++Part) {
3354       Value *I = getOrCreateVectorValue(KV.first, Part);
3355       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3356       if (Inst && Inst->use_empty()) {
3357         Value *NewI = Inst->getOperand(0);
3358         Inst->eraseFromParent();
3359         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3360       }
3361     }
3362   }
3363 }
3364 
3365 void InnerLoopVectorizer::fixVectorizedLoop() {
3366   // Insert truncates and extends for any truncated instructions as hints to
3367   // InstCombine.
3368   if (VF > 1)
3369     truncateToMinimalBitwidths();
3370 
3371   // Fix widened non-induction PHIs by setting up the PHI operands.
3372   if (OrigPHIsToFix.size()) {
3373     assert(EnableVPlanNativePath &&
3374            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3375     fixNonInductionPHIs();
3376   }
3377 
3378   // At this point every instruction in the original loop is widened to a
3379   // vector form. Now we need to fix the recurrences in the loop. These PHI
3380   // nodes are currently empty because we did not want to introduce cycles.
3381   // This is the second stage of vectorizing recurrences.
3382   fixCrossIterationPHIs();
3383 
3384   // Update the dominator tree.
3385   //
3386   // FIXME: After creating the structure of the new loop, the dominator tree is
3387   //        no longer up-to-date, and it remains that way until we update it
3388   //        here. An out-of-date dominator tree is problematic for SCEV,
3389   //        because SCEVExpander uses it to guide code generation. The
3390   //        vectorizer uses SCEVExpanders in several places. Instead, we should
3391   //        keep the dominator tree up-to-date as we go.
3392   updateAnalysis();
3393 
3394   // Fix-up external users of the induction variables.
3395   for (auto &Entry : *Legal->getInductionVars())
3396     fixupIVUsers(Entry.first, Entry.second,
3397                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3398                  IVEndValues[Entry.first], LoopMiddleBlock);
3399 
3400   fixLCSSAPHIs();
3401   for (Instruction *PI : PredicatedInstructions)
3402     sinkScalarOperands(&*PI);
3403 
3404   // Remove redundant induction instructions.
3405   cse(LoopVectorBody);
3406 }
3407 
3408 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3409   // In order to support recurrences we need to be able to vectorize Phi nodes.
3410   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3411   // stage #2: We now need to fix the recurrences by adding incoming edges to
3412   // the currently empty PHI nodes. At this point every instruction in the
3413   // original loop is widened to a vector form so we can use them to construct
3414   // the incoming edges.
3415   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3416     // Handle first-order recurrences and reductions that need to be fixed.
3417     if (Legal->isFirstOrderRecurrence(&Phi))
3418       fixFirstOrderRecurrence(&Phi);
3419     else if (Legal->isReductionVariable(&Phi))
3420       fixReduction(&Phi);
3421   }
3422 }
3423 
3424 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3425   // This is the second phase of vectorizing first-order recurrences. An
3426   // overview of the transformation is described below. Suppose we have the
3427   // following loop.
3428   //
3429   //   for (int i = 0; i < n; ++i)
3430   //     b[i] = a[i] - a[i - 1];
3431   //
3432   // There is a first-order recurrence on "a". For this loop, the shorthand
3433   // scalar IR looks like:
3434   //
3435   //   scalar.ph:
3436   //     s_init = a[-1]
3437   //     br scalar.body
3438   //
3439   //   scalar.body:
3440   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3441   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3442   //     s2 = a[i]
3443   //     b[i] = s2 - s1
3444   //     br cond, scalar.body, ...
3445   //
3446   // In this example, s1 is a recurrence because its value depends on the
3447   // previous iteration. In the first phase of vectorization, we created a
3448   // temporary value for s1. We now complete the vectorization and produce the
3449   // shorthand vector IR shown below (for VF = 4, UF = 1).
3450   //
3451   //   vector.ph:
3452   //     v_init = vector(..., ..., ..., a[-1])
3453   //     br vector.body
3454   //
3455   //   vector.body
3456   //     i = phi [0, vector.ph], [i+4, vector.body]
3457   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3458   //     v2 = a[i, i+1, i+2, i+3];
3459   //     v3 = vector(v1(3), v2(0, 1, 2))
3460   //     b[i, i+1, i+2, i+3] = v2 - v3
3461   //     br cond, vector.body, middle.block
3462   //
3463   //   middle.block:
3464   //     x = v2(3)
3465   //     br scalar.ph
3466   //
3467   //   scalar.ph:
3468   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3469   //     br scalar.body
3470   //
3471   // After the vector loop completes execution, we extract the next value of
3472   // the recurrence (x) to use as the initial value in the scalar loop.
3473 
3474   // Get the original loop preheader and single loop latch.
3475   auto *Preheader = OrigLoop->getLoopPreheader();
3476   auto *Latch = OrigLoop->getLoopLatch();
3477 
3478   // Get the initial and previous values of the scalar recurrence.
3479   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3480   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3481 
3482   // Create a vector from the initial value.
3483   auto *VectorInit = ScalarInit;
3484   if (VF > 1) {
3485     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3486     VectorInit = Builder.CreateInsertElement(
3487         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3488         Builder.getInt32(VF - 1), "vector.recur.init");
3489   }
3490 
3491   // We constructed a temporary phi node in the first phase of vectorization.
3492   // This phi node will eventually be deleted.
3493   Builder.SetInsertPoint(
3494       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3495 
3496   // Create a phi node for the new recurrence. The current value will either be
3497   // the initial value inserted into a vector or loop-varying vector value.
3498   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3499   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3500 
3501   // Get the vectorized previous value of the last part UF - 1. It appears last
3502   // among all unrolled iterations, due to the order of their construction.
3503   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3504 
3505   // Set the insertion point after the previous value if it is an instruction.
3506   // Note that the previous value may have been constant-folded so it is not
3507   // guaranteed to be an instruction in the vector loop. Also, if the previous
3508   // value is a phi node, we should insert after all the phi nodes to avoid
3509   // breaking basic block verification.
3510   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3511       isa<PHINode>(PreviousLastPart))
3512     Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3513   else
3514     Builder.SetInsertPoint(
3515         &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3516 
3517   // We will construct a vector for the recurrence by combining the values for
3518   // the current and previous iterations. This is the required shuffle mask.
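  // E.g. (a sketch, VF = 4): the mask is <3, 4, 5, 6>, i.e. the last element of
  // the first shuffle operand followed by the first three elements of the second.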
3519   SmallVector<Constant *, 8> ShuffleMask(VF);
3520   ShuffleMask[0] = Builder.getInt32(VF - 1);
3521   for (unsigned I = 1; I < VF; ++I)
3522     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3523 
3524   // The vector from which to take the initial value for the current iteration
3525   // (actual or unrolled). Initially, this is the vector phi node.
3526   Value *Incoming = VecPhi;
3527 
3528   // Shuffle the current and previous vector and update the vector parts.
3529   for (unsigned Part = 0; Part < UF; ++Part) {
3530     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3531     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3532     auto *Shuffle =
3533         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3534                                              ConstantVector::get(ShuffleMask))
3535                : Incoming;
3536     PhiPart->replaceAllUsesWith(Shuffle);
3537     cast<Instruction>(PhiPart)->eraseFromParent();
3538     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3539     Incoming = PreviousPart;
3540   }
3541 
3542   // Fix the latch value of the new recurrence in the vector loop.
3543   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3544 
3545   // Extract the last vector element in the middle block. This will be the
3546   // initial value for the recurrence when jumping to the scalar loop.
3547   auto *ExtractForScalar = Incoming;
3548   if (VF > 1) {
3549     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3550     ExtractForScalar = Builder.CreateExtractElement(
3551         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3552   }
3553   // Extract the second-to-last element in the middle block if the
3554   // Phi is used outside the loop. We need to extract the phi itself
3555   // and not the last element (the phi update in the current iteration). This
3556   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3557   // when the scalar loop is not run at all.
3558   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3559   if (VF > 1)
3560     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3561         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3562   // When the loop is unrolled without vectorizing, initialize
3563   // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
3564   // value of `Incoming`. This is analogous to the vectorized case above:
3565   // extracting the second-to-last element when VF > 1.
3566   else if (UF > 1)
3567     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3568 
3569   // Fix the initial value of the original recurrence in the scalar loop.
3570   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3571   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3572   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3573     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3574     Start->addIncoming(Incoming, BB);
3575   }
3576 
3577   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3578   Phi->setName("scalar.recur");
3579 
3580   // Finally, fix users of the recurrence outside the loop. The users will need
3581   // either the last value of the scalar recurrence or the last value of the
3582   // vector recurrence we extracted in the middle block. Since the loop is in
3583   // LCSSA form, we just need to find all the phi nodes for the original scalar
3584   // recurrence in the exit block, and then add an edge for the middle block.
3585   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3586     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3587       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3588     }
3589   }
3590 }
3591 
3592 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3593   Constant *Zero = Builder.getInt32(0);
3594 
3595   // Get its reduction variable descriptor.
3596   assert(Legal->isReductionVariable(Phi) &&
3597          "Unable to find the reduction variable");
3598   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3599 
3600   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3601   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3602   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3603   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3604     RdxDesc.getMinMaxRecurrenceKind();
3605   setDebugLocFromInst(Builder, ReductionStartValue);
3606 
3607   // We need to generate a reduction vector from the incoming scalar.
3608   // To do so, we need to generate the 'identity' vector and override
3609   // one of the elements with the incoming scalar reduction. We need
3610   // to do it in the vector-loop preheader.
3611   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3612 
3613   // This is the vector-clone of the value that leaves the loop.
3614   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3615 
3616   // Find the reduction identity value: zero for addition, or, and xor; one for
3617   // multiplication; -1 for and.
3618   Value *Identity;
3619   Value *VectorStart;
3620   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3621       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3622     // MinMax reductions have the start value as their identity.
3623     if (VF == 1) {
3624       VectorStart = Identity = ReductionStartValue;
3625     } else {
3626       VectorStart = Identity =
3627         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3628     }
3629   } else {
3630     // Handle other reduction kinds:
3631     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3632         RK, VecTy->getScalarType());
3633     if (VF == 1) {
3634       Identity = Iden;
      // When VF == 1 there is no identity vector; the start value is simply
      // the incoming scalar reduction.
3637       VectorStart = ReductionStartValue;
3638     } else {
3639       Identity = ConstantVector::getSplat(VF, Iden);
3640 
3641       // This vector is the Identity vector where the first element is the
3642       // incoming scalar reduction.
3643       VectorStart =
3644         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3645     }
3646   }
3647 
3648   // Fix the vector-loop phi.
3649 
3650   // Reductions do not have to start at zero. They can start with
3651   // any loop invariant values.
3652   BasicBlock *Latch = OrigLoop->getLoopLatch();
3653   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3654   for (unsigned Part = 0; Part < UF; ++Part) {
3655     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3656     Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
3659     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3660     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3661     cast<PHINode>(VecRdxPhi)
3662       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3663   }
3664 
  // Move the insertion point to just after the PHIs of the middle block, so
  // that we can emit both the PHI nodes and the extractelement
  // instructions there.
3669   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3670 
3671   setDebugLocFromInst(Builder, LoopExitInst);
3672 
3673   // If the vector reduction can be performed in a smaller type, we truncate
3674   // then extend the loop exit value to enable InstCombine to evaluate the
3675   // entire expression in the smaller type.
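  // For example, if an i32 reduction phi was computed to only need i8, each
  // unrolled part is truncated to <VF x i8> and then sign- or zero-extended
  // back, exposing the narrower computation to InstCombine.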
3676   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3677     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3678     Builder.SetInsertPoint(
3679         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3680     VectorParts RdxParts(UF);
3681     for (unsigned Part = 0; Part < UF; ++Part) {
3682       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3683       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3684       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3685                                         : Builder.CreateZExt(Trunc, VecTy);
3686       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3687            UI != RdxParts[Part]->user_end();)
3688         if (*UI != Trunc) {
3689           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3690           RdxParts[Part] = Extnd;
3691         } else {
3692           ++UI;
3693         }
3694     }
3695     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3696     for (unsigned Part = 0; Part < UF; ++Part) {
3697       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3698       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3699     }
3700   }
3701 
3702   // Reduce all of the unrolled parts into a single vector.
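  // For example, with UF == 2 and an add reduction, this emits
  // "bin.rdx = add part1, part0"; min/max reductions combine the parts with
  // createMinMaxOp (a compare plus select) instead.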
3703   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3704   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3705 
3706   // The middle block terminator has already been assigned a DebugLoc here (the
3707   // OrigLoop's single latch terminator). We want the whole middle block to
3708   // appear to execute on this line because: (a) it is all compiler generated,
3709   // (b) these instructions are always executed after evaluating the latch
3710   // conditional branch, and (c) other passes may add new predecessors which
3711   // terminate on this line. This is the easiest way to ensure we don't
3712   // accidentally cause an extra step back into the loop while debugging.
3713   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3714   for (unsigned Part = 1; Part < UF; ++Part) {
3715     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3716     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3717       // Floating point operations had to be 'fast' to enable the reduction.
3718       ReducedPartRdx = addFastMathFlag(
3719           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3720                               ReducedPartRdx, "bin.rdx"),
3721           RdxDesc.getFastMathFlags());
3722     else
3723       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3724                                       RdxPart);
3725   }
3726 
3727   if (VF > 1) {
3728     bool NoNaN = Legal->hasFunNoNaNAttr();
3729     ReducedPartRdx =
3730         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3731     // If the reduction can be performed in a smaller type, we need to extend
3732     // the reduction to the wider type before we branch to the original loop.
3733     if (Phi->getType() != RdxDesc.getRecurrenceType())
3734       ReducedPartRdx =
3735         RdxDesc.isSigned()
3736         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3737         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3738   }
3739 
3740   // Create a phi node that merges control-flow from the backedge-taken check
3741   // block and the middle block.
3742   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3743                                         LoopScalarPreHeader->getTerminator());
3744   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3745     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3746   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3747 
3748   // Now, we need to fix the users of the reduction variable
3749   // inside and outside of the scalar remainder loop.
3750   // We know that the loop is in LCSSA form. We need to update the
3751   // PHI nodes in the exit blocks.
3752   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3753     // All PHINodes need to have a single entry edge, or two if
3754     // we already fixed them.
3755     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3756 
3757     // We found a reduction value exit-PHI. Update it with the
3758     // incoming bypass edge.
3759     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3760       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3761   } // end of the LCSSA phi scan.
3762 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3765   int IncomingEdgeBlockIdx =
3766     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3767   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3768   // Pick the other block.
3769   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3770   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3771   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3772 }
3773 
3774 void InnerLoopVectorizer::fixLCSSAPHIs() {
3775   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3776     if (LCSSAPhi.getNumIncomingValues() == 1) {
3777       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values are uniform, so only lane zero is
      // needed.
3779       unsigned LastLane = 0;
      if (isa<Instruction>(IncomingValue))
        LastLane = Cost->isUniformAfterVectorization(
                       cast<Instruction>(IncomingValue), VF)
                       ? 0
                       : VF - 1;
3785       // Can be a loop invariant incoming value or the last scalar value to be
3786       // extracted from the vectorized loop.
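      // For example, with UF == 2 and VF == 4, a non-uniform instruction's
      // live-out is the scalar value for {Part 1, Lane 3}; a uniform one uses
      // {Part 1, Lane 0}.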
3787       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
      Value *LastIncomingValue =
          getOrCreateScalarValue(IncomingValue, {UF - 1, LastLane});
      LCSSAPhi.addIncoming(LastIncomingValue, LoopMiddleBlock);
3791     }
3792   }
3793 }
3794 
3795 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3796   // The basic block and loop containing the predicated instruction.
3797   auto *PredBB = PredInst->getParent();
3798   auto *VectorLoop = LI->getLoopFor(PredBB);
3799 
3800   // Initialize a worklist with the operands of the predicated instruction.
3801   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3802 
3803   // Holds instructions that we need to analyze again. An instruction may be
3804   // reanalyzed if we don't yet know if we can sink it or not.
3805   SmallVector<Instruction *, 8> InstsToReanalyze;
3806 
3807   // Returns true if a given use occurs in the predicated block. Phi nodes use
3808   // their operands in their corresponding predecessor blocks.
3809   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3810     auto *I = cast<Instruction>(U.getUser());
3811     BasicBlock *BB = I->getParent();
3812     if (auto *Phi = dyn_cast<PHINode>(I))
3813       BB = Phi->getIncomingBlock(
3814           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3815     return BB == PredBB;
3816   };
3817 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are added to the worklist. The algorithm ends once a full pass
  // over the worklist sinks no instructions.
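  // For example, the address computation of a scalarized, predicated store is
  // typically used only inside the predicated block, so it can be sunk there;
  // that in turn may allow the instructions producing its operands to be sunk
  // on a later pass over the worklist.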
3822   bool Changed;
3823   do {
3824     // Add the instructions that need to be reanalyzed to the worklist, and
3825     // reset the changed indicator.
3826     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3827     InstsToReanalyze.clear();
3828     Changed = false;
3829 
3830     while (!Worklist.empty()) {
3831       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3832 
3833       // We can't sink an instruction if it is a phi node, is already in the
3834       // predicated block, is not in the loop, or may have side effects.
3835       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3836           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3837         continue;
3838 
3839       // It's legal to sink the instruction if all its uses occur in the
3840       // predicated block. Otherwise, there's nothing to do yet, and we may
3841       // need to reanalyze the instruction.
3842       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3843         InstsToReanalyze.push_back(I);
3844         continue;
3845       }
3846 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
3849       I->moveBefore(&*PredBB->getFirstInsertionPt());
3850       Worklist.insert(I->op_begin(), I->op_end());
3851 
3852       // The sinking may have enabled other instructions to be sunk, so we will
3853       // need to iterate.
3854       Changed = true;
3855     }
3856   } while (Changed);
3857 }
3858 
3859 void InnerLoopVectorizer::fixNonInductionPHIs() {
3860   for (PHINode *OrigPhi : OrigPHIsToFix) {
3861     PHINode *NewPhi =
3862         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3863     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3864 
3865     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3866         predecessors(OrigPhi->getParent()));
3867     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3868         predecessors(NewPhi->getParent()));
3869     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3870            "Scalar and Vector BB should have the same number of predecessors");
3871 
3872     // The insertion point in Builder may be invalidated by the time we get
3873     // here. Force the Builder insertion point to something valid so that we do
3874     // not run into issues during insertion point restore in
3875     // getOrCreateVectorValue calls below.
3876     Builder.SetInsertPoint(NewPhi);
3877 
3878     // The predecessor order is preserved and we can rely on mapping between
3879     // scalar and vector block predecessors.
3880     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3881       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3882 
3883       // When looking up the new scalar/vector values to fix up, use incoming
3884       // values from original phi.
3885       Value *ScIncV =
3886           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3887 
      // The scalar incoming value may need a broadcast.
3889       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3890       NewPhi->addIncoming(NewIncV, NewPredBB);
3891     }
3892   }
3893 }
3894 
3895 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3896                                               unsigned VF) {
3897   PHINode *P = cast<PHINode>(PN);
3898   if (EnableVPlanNativePath) {
3899     // Currently we enter here in the VPlan-native path for non-induction
3900     // PHIs where all control flow is uniform. We simply widen these PHIs.
3901     // Create a vector phi with no operands - the vector phi operands will be
3902     // set at the end of vector code generation.
3903     Type *VecTy =
3904         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3905     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3906     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3907     OrigPHIsToFix.push_back(P);
3908 
3909     return;
3910   }
3911 
3912   assert(PN->getParent() == OrigLoop->getHeader() &&
3913          "Non-header phis should have been handled elsewhere");
3914 
3915   // In order to support recurrences we need to be able to vectorize Phi nodes.
3916   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3917   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3918   // this value when we vectorize all of the instructions that use the PHI.
3919   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3920     for (unsigned Part = 0; Part < UF; ++Part) {
3921       // This is phase one of vectorizing PHIs.
3922       Type *VecTy =
3923           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3924       Value *EntryPart = PHINode::Create(
3925           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3926       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3927     }
3928     return;
3929   }
3930 
3931   setDebugLocFromInst(Builder, P);
3932 
3933   // This PHINode must be an induction variable.
3934   // Make sure that we know about it.
3935   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3936 
3937   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3938   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3939 
3940   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3941   // which can be found from the original scalar operations.
3942   switch (II.getKind()) {
3943   case InductionDescriptor::IK_NoInduction:
3944     llvm_unreachable("Unknown induction");
3945   case InductionDescriptor::IK_IntInduction:
3946   case InductionDescriptor::IK_FpInduction:
3947     llvm_unreachable("Integer/fp induction is handled elsewhere.");
3948   case InductionDescriptor::IK_PtrInduction: {
3949     // Handle the pointer induction variable case.
3950     assert(P->getType()->isPointerTy() && "Unexpected type.");
3951     // This is the normalized GEP that starts counting at zero.
3952     Value *PtrInd = Induction;
3953     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3954     // Determine the number of scalars we need to generate for each unroll
3955     // iteration. If the instruction is uniform, we only need to generate the
3956     // first lane. Otherwise, we generate all VF values.
3957     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
3958     // These are the scalar results. Notice that we don't generate vector GEPs
3959     // because scalar GEPs result in better code.
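    // For example, with UF == 2 and VF == 4 (and a non-uniform pointer), this
    // emits eight scalar "next.gep" values for offsets PtrInd + 0 .. PtrInd + 7.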
3960     for (unsigned Part = 0; Part < UF; ++Part) {
3961       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
3962         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
3963         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
3964         Value *SclrGep =
3965             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
3966         SclrGep->setName("next.gep");
3967         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
3968       }
3969     }
3970     return;
3971   }
3972   }
3973 }
3974 
3975 /// A helper function for checking whether an integer division-related
3976 /// instruction may divide by zero (in which case it must be predicated if
3977 /// executed conditionally in the scalar code).
3978 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into a multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
3982 static bool mayDivideByZero(Instruction &I) {
3983   assert((I.getOpcode() == Instruction::UDiv ||
3984           I.getOpcode() == Instruction::SDiv ||
3985           I.getOpcode() == Instruction::URem ||
3986           I.getOpcode() == Instruction::SRem) &&
3987          "Unexpected instruction");
3988   Value *Divisor = I.getOperand(1);
3989   auto *CInt = dyn_cast<ConstantInt>(Divisor);
3990   return !CInt || CInt->isZero();
3991 }
3992 
3993 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
3994   switch (I.getOpcode()) {
3995   case Instruction::Br:
3996   case Instruction::PHI:
3997     llvm_unreachable("This instruction is handled by a different recipe.");
3998   case Instruction::GetElementPtr: {
3999     // Construct a vector GEP by widening the operands of the scalar GEP as
4000     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4001     // results in a vector of pointers when at least one operand of the GEP
4002     // is vector-typed. Thus, to keep the representation compact, we only use
4003     // vector-typed operands for loop-varying values.
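    // For example, a GEP whose only loop-varying operand is the induction
    // variable keeps its base pointer scalar and takes the widened (vector)
    // index, yielding a single vector-of-pointers GEP per unroll part.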
4004     auto *GEP = cast<GetElementPtrInst>(&I);
4005 
4006     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4007       // If we are vectorizing, but the GEP has only loop-invariant operands,
4008       // the GEP we build (by only using vector-typed operands for
4009       // loop-varying values) would be a scalar pointer. Thus, to ensure we
4010       // produce a vector of pointers, we need to either arbitrarily pick an
4011       // operand to broadcast, or broadcast a clone of the original GEP.
4012       // Here, we broadcast a clone of the original.
4013       //
4014       // TODO: If at some point we decide to scalarize instructions having
4015       //       loop-invariant operands, this special case will no longer be
4016       //       required. We would add the scalarization decision to
4017       //       collectLoopScalars() and teach getVectorValue() to broadcast
4018       //       the lane-zero scalar value.
4019       auto *Clone = Builder.Insert(GEP->clone());
4020       for (unsigned Part = 0; Part < UF; ++Part) {
4021         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4022         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4023         addMetadata(EntryPart, GEP);
4024       }
4025     } else {
4026       // If the GEP has at least one loop-varying operand, we are sure to
4027       // produce a vector of pointers. But if we are only unrolling, we want
4028       // to produce a scalar GEP for each unroll part. Thus, the GEP we
4029       // produce with the code below will be scalar (if VF == 1) or vector
4030       // (otherwise). Note that for the unroll-only case, we still maintain
4031       // values in the vector mapping with initVector, as we do for other
4032       // instructions.
4033       for (unsigned Part = 0; Part < UF; ++Part) {
4034         // The pointer operand of the new GEP. If it's loop-invariant, we
4035         // won't broadcast it.
4036         auto *Ptr =
4037             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4038                 ? GEP->getPointerOperand()
4039                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4040 
4041         // Collect all the indices for the new GEP. If any index is
4042         // loop-invariant, we won't broadcast it.
4043         SmallVector<Value *, 4> Indices;
4044         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4045           if (OrigLoop->isLoopInvariant(U.get()))
4046             Indices.push_back(U.get());
4047           else
4048             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4049         }
4050 
4051         // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4052         // but it should be a vector, otherwise.
4053         auto *NewGEP =
4054             GEP->isInBounds()
4055                 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4056                                             Indices)
4057                 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4058         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4059                "NewGEP is not a pointer vector");
4060         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4061         addMetadata(NewGEP, GEP);
4062       }
4063     }
4064 
4065     break;
4066   }
4067   case Instruction::UDiv:
4068   case Instruction::SDiv:
4069   case Instruction::SRem:
4070   case Instruction::URem:
4071   case Instruction::Add:
4072   case Instruction::FAdd:
4073   case Instruction::Sub:
4074   case Instruction::FSub:
4075   case Instruction::FNeg:
4076   case Instruction::Mul:
4077   case Instruction::FMul:
4078   case Instruction::FDiv:
4079   case Instruction::FRem:
4080   case Instruction::Shl:
4081   case Instruction::LShr:
4082   case Instruction::AShr:
4083   case Instruction::And:
4084   case Instruction::Or:
4085   case Instruction::Xor: {
4086     // Just widen unops and binops.
4087     setDebugLocFromInst(Builder, &I);
4088 
4089     for (unsigned Part = 0; Part < UF; ++Part) {
4090       SmallVector<Value *, 2> Ops;
4091       for (Value *Op : I.operands())
4092         Ops.push_back(getOrCreateVectorValue(Op, Part));
4093 
4094       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4095 
4096       if (auto *VecOp = dyn_cast<Instruction>(V))
4097         VecOp->copyIRFlags(&I);
4098 
4099       // Use this vector value for all users of the original instruction.
4100       VectorLoopValueMap.setVectorValue(&I, Part, V);
4101       addMetadata(V, &I);
4102     }
4103 
4104     break;
4105   }
4106   case Instruction::Select: {
4107     // Widen selects.
4108     // If the selector is loop invariant we can create a select
4109     // instruction with a scalar condition. Otherwise, use vector-select.
4110     auto *SE = PSE.getSE();
4111     bool InvariantCond =
4112         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4113     setDebugLocFromInst(Builder, &I);
4114 
    // The condition can be loop invariant but still defined inside the
4116     // loop. This means that we can't just use the original 'cond' value.
4117     // We have to take the 'vectorized' value and pick the first lane.
4118     // Instcombine will make this a no-op.
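    // For example, if the condition is a comparison computed inside the loop
    // but invariant in value, lane zero of its widened form is used as the
    // scalar select condition.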
4119 
4120     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4121 
4122     for (unsigned Part = 0; Part < UF; ++Part) {
4123       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4124       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4125       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4126       Value *Sel =
4127           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4128       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4129       addMetadata(Sel, &I);
4130     }
4131 
4132     break;
4133   }
4134 
4135   case Instruction::ICmp:
4136   case Instruction::FCmp: {
4137     // Widen compares. Generate vector compares.
4138     bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
4140     setDebugLocFromInst(Builder, Cmp);
4141     for (unsigned Part = 0; Part < UF; ++Part) {
4142       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4143       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4144       Value *C = nullptr;
4145       if (FCmp) {
4146         // Propagate fast math flags.
4147         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4148         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4149         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4150       } else {
4151         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4152       }
4153       VectorLoopValueMap.setVectorValue(&I, Part, C);
4154       addMetadata(C, &I);
4155     }
4156 
4157     break;
4158   }
4159 
4160   case Instruction::ZExt:
4161   case Instruction::SExt:
4162   case Instruction::FPToUI:
4163   case Instruction::FPToSI:
4164   case Instruction::FPExt:
4165   case Instruction::PtrToInt:
4166   case Instruction::IntToPtr:
4167   case Instruction::SIToFP:
4168   case Instruction::UIToFP:
4169   case Instruction::Trunc:
4170   case Instruction::FPTrunc:
4171   case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
4173     setDebugLocFromInst(Builder, CI);
4174 
4175     /// Vectorize casts.
4176     Type *DestTy =
4177         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4178 
4179     for (unsigned Part = 0; Part < UF; ++Part) {
4180       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4181       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4182       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4183       addMetadata(Cast, &I);
4184     }
4185     break;
4186   }
4187 
4188   case Instruction::Call: {
4189     // Ignore dbg intrinsics.
4190     if (isa<DbgInfoIntrinsic>(I))
4191       break;
4192     setDebugLocFromInst(Builder, &I);
4193 
4194     Module *M = I.getParent()->getParent()->getParent();
4195     auto *CI = cast<CallInst>(&I);
4196 
4197     StringRef FnName = CI->getCalledFunction()->getName();
4198     Function *F = CI->getCalledFunction();
4199     Type *RetTy = ToVectorTy(CI->getType(), VF);
4200     SmallVector<Type *, 4> Tys;
4201     for (Value *ArgOperand : CI->arg_operands())
4202       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4203 
4204     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4205 
    // The flag indicates whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction, i.e. whether an intrinsic call is
    // more beneficial than a library call.
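    // For example, a call that maps to a vector intrinsic overload (e.g.
    // llvm.fabs on <VF x float>) is emitted as that intrinsic; otherwise a
    // declaration of the TLI-provided vector library routine is looked up or
    // created and called instead.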
4209     bool NeedToScalarize;
4210     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4211     bool UseVectorIntrinsic =
4212         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4213     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4214            "Instruction should be scalarized elsewhere.");
4215 
4216     for (unsigned Part = 0; Part < UF; ++Part) {
4217       SmallVector<Value *, 4> Args;
4218       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4219         Value *Arg = CI->getArgOperand(i);
4220         // Some intrinsics have a scalar argument - don't replace it with a
4221         // vector.
4222         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4223           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4224         Args.push_back(Arg);
4225       }
4226 
4227       Function *VectorF;
4228       if (UseVectorIntrinsic) {
4229         // Use vector version of the intrinsic.
4230         Type *TysForDecl[] = {CI->getType()};
4231         if (VF > 1)
4232           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4233         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4234       } else {
4235         // Use vector version of the library call.
4236         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4237         assert(!VFnName.empty() && "Vector function name is empty.");
4238         VectorF = M->getFunction(VFnName);
4239         if (!VectorF) {
4240           // Generate a declaration
4241           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4242           VectorF =
4243               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4244           VectorF->copyAttributesFrom(F);
4245         }
4246       }
4247       assert(VectorF && "Can't create vector function.");
4248 
4249       SmallVector<OperandBundleDef, 1> OpBundles;
4250       CI->getOperandBundlesAsDefs(OpBundles);
4251       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4252 
4253       if (isa<FPMathOperator>(V))
4254         V->copyFastMathFlags(CI);
4255 
4256       VectorLoopValueMap.setVectorValue(&I, Part, V);
4257       addMetadata(V, &I);
4258     }
4259 
4260     break;
4261   }
4262 
4263   default:
4264     // This instruction is not vectorized by simple widening.
4265     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4266     llvm_unreachable("Unhandled instruction!");
4267   } // end of switch.
4268 }
4269 
4270 void InnerLoopVectorizer::updateAnalysis() {
4271   // Forget the original basic block.
4272   PSE.getSE()->forgetLoop(OrigLoop);
4273 
4274   // DT is not kept up-to-date for outer loop vectorization
4275   if (EnableVPlanNativePath)
4276     return;
4277 
4278   // Update the dominator tree information.
4279   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4280          "Entry does not dominate exit.");
4281 
4282   DT->addNewBlock(LoopMiddleBlock,
4283                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4284   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4285   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4286   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4287   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4288 }
4289 
4290 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4291   // We should not collect Scalars more than once per VF. Right now, this
4292   // function is called from collectUniformsAndScalars(), which already does
4293   // this check. Collecting Scalars for VF=1 does not make any sense.
4294   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4295          "This function should not be visited twice for the same VF");
4296 
4297   SmallSetVector<Instruction *, 8> Worklist;
4298 
4299   // These sets are used to seed the analysis with pointers used by memory
4300   // accesses that will remain scalar.
4301   SmallSetVector<Instruction *, 8> ScalarPtrs;
4302   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4303 
4304   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4305   // The pointer operands of loads and stores will be scalar as long as the
4306   // memory access is not a gather or scatter operation. The value operand of a
4307   // store will remain scalar if the store is scalarized.
4308   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4309     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4310     assert(WideningDecision != CM_Unknown &&
4311            "Widening decision should be ready at this moment");
4312     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4313       if (Ptr == Store->getValueOperand())
4314         return WideningDecision == CM_Scalarize;
4315     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4316            "Ptr is neither a value or pointer operand");
4317     return WideningDecision != CM_GatherScatter;
4318   };
4319 
4320   // A helper that returns true if the given value is a bitcast or
4321   // getelementptr instruction contained in the loop.
4322   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4323     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4324             isa<GetElementPtrInst>(V)) &&
4325            !TheLoop->isLoopInvariant(V);
4326   };
4327 
4328   // A helper that evaluates a memory access's use of a pointer. If the use
4329   // will be a scalar use, and the pointer is only used by memory accesses, we
4330   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4331   // PossibleNonScalarPtrs.
4332   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4333     // We only care about bitcast and getelementptr instructions contained in
4334     // the loop.
4335     if (!isLoopVaryingBitCastOrGEP(Ptr))
4336       return;
4337 
4338     // If the pointer has already been identified as scalar (e.g., if it was
4339     // also identified as uniform), there's nothing to do.
4340     auto *I = cast<Instruction>(Ptr);
4341     if (Worklist.count(I))
4342       return;
4343 
4344     // If the use of the pointer will be a scalar use, and all users of the
4345     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4346     // place the pointer in PossibleNonScalarPtrs.
4347     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4348           return isa<LoadInst>(U) || isa<StoreInst>(U);
4349         }))
4350       ScalarPtrs.insert(I);
4351     else
4352       PossibleNonScalarPtrs.insert(I);
4353   };
4354 
4355   // We seed the scalars analysis with three classes of instructions: (1)
4356   // instructions marked uniform-after-vectorization, (2) bitcast and
4357   // getelementptr instructions used by memory accesses requiring a scalar use,
4358   // and (3) pointer induction variables and their update instructions (we
4359   // currently only scalarize these).
4360   //
4361   // (1) Add to the worklist all instructions that have been identified as
4362   // uniform-after-vectorization.
4363   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4364 
4365   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4366   // memory accesses requiring a scalar use. The pointer operands of loads and
4367   // stores will be scalar as long as the memory accesses is not a gather or
4368   // scatter operation. The value operand of a store will remain scalar if the
4369   // store is scalarized.
4370   for (auto *BB : TheLoop->blocks())
4371     for (auto &I : *BB) {
4372       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4373         evaluatePtrUse(Load, Load->getPointerOperand());
4374       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4375         evaluatePtrUse(Store, Store->getPointerOperand());
4376         evaluatePtrUse(Store, Store->getValueOperand());
4377       }
4378     }
4379   for (auto *I : ScalarPtrs)
4380     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4381       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4382       Worklist.insert(I);
4383     }
4384 
4385   // (3) Add to the worklist all pointer induction variables and their update
4386   // instructions.
4387   //
4388   // TODO: Once we are able to vectorize pointer induction variables we should
4389   //       no longer insert them into the worklist here.
4390   auto *Latch = TheLoop->getLoopLatch();
4391   for (auto &Induction : *Legal->getInductionVars()) {
4392     auto *Ind = Induction.first;
4393     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4394     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4395       continue;
4396     Worklist.insert(Ind);
4397     Worklist.insert(IndUpdate);
4398     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4399     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4400                       << "\n");
4401   }
4402 
4403   // Insert the forced scalars.
4404   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4405   // induction variable when the PHI user is scalarized.
4406   auto ForcedScalar = ForcedScalars.find(VF);
4407   if (ForcedScalar != ForcedScalars.end())
4408     for (auto *I : ForcedScalar->second)
4409       Worklist.insert(I);
4410 
4411   // Expand the worklist by looking through any bitcasts and getelementptr
4412   // instructions we've already identified as scalar. This is similar to the
4413   // expansion step in collectLoopUniforms(); however, here we're only
4414   // expanding to include additional bitcasts and getelementptr instructions.
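  // For example, if a getelementptr feeds only a bitcast that was already
  // identified as a scalar pointer of a scalarized access, the getelementptr
  // itself is added to the worklist here.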
4415   unsigned Idx = 0;
4416   while (Idx != Worklist.size()) {
4417     Instruction *Dst = Worklist[Idx++];
4418     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4419       continue;
4420     auto *Src = cast<Instruction>(Dst->getOperand(0));
4421     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4422           auto *J = cast<Instruction>(U);
4423           return !TheLoop->contains(J) || Worklist.count(J) ||
4424                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4425                   isScalarUse(J, Src));
4426         })) {
4427       Worklist.insert(Src);
4428       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4429     }
4430   }
4431 
4432   // An induction variable will remain scalar if all users of the induction
4433   // variable and induction variable update remain scalar.
4434   for (auto &Induction : *Legal->getInductionVars()) {
4435     auto *Ind = Induction.first;
4436     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4437 
4438     // We already considered pointer induction variables, so there's no reason
4439     // to look at their users again.
4440     //
4441     // TODO: Once we are able to vectorize pointer induction variables we
4442     //       should no longer skip over them here.
4443     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4444       continue;
4445 
4446     // Determine if all users of the induction variable are scalar after
4447     // vectorization.
4448     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4449       auto *I = cast<Instruction>(U);
4450       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4451     });
4452     if (!ScalarInd)
4453       continue;
4454 
4455     // Determine if all users of the induction variable update instruction are
4456     // scalar after vectorization.
4457     auto ScalarIndUpdate =
4458         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4459           auto *I = cast<Instruction>(U);
4460           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4461         });
4462     if (!ScalarIndUpdate)
4463       continue;
4464 
4465     // The induction variable and its update instruction will remain scalar.
4466     Worklist.insert(Ind);
4467     Worklist.insert(IndUpdate);
4468     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4469     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4470                       << "\n");
4471   }
4472 
4473   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4474 }
4475 
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
                                                         unsigned VF) {
4477   if (!blockNeedsPredication(I->getParent()))
4478     return false;
4479   switch(I->getOpcode()) {
4480   default:
4481     break;
4482   case Instruction::Load:
4483   case Instruction::Store: {
4484     if (!Legal->isMaskRequired(I))
4485       return false;
4486     auto *Ptr = getLoadStorePointerOperand(I);
4487     auto *Ty = getMemInstValueType(I);
4488     // We have already decided how to vectorize this instruction, get that
4489     // result.
4490     if (VF > 1) {
4491       InstWidening WideningDecision = getWideningDecision(I, VF);
4492       assert(WideningDecision != CM_Unknown &&
4493              "Widening decision should be ready at this moment");
4494       return WideningDecision == CM_Scalarize;
4495     }
4496     return isa<LoadInst>(I) ?
4497         !(isLegalMaskedLoad(Ty, Ptr)  || isLegalMaskedGather(Ty))
4498       : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4499   }
4500   case Instruction::UDiv:
4501   case Instruction::SDiv:
4502   case Instruction::SRem:
4503   case Instruction::URem:
4504     return mayDivideByZero(*I);
4505   }
4506   return false;
4507 }
4508 
4509 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4510                                                                unsigned VF) {
4511   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4512   assert(getWideningDecision(I, VF) == CM_Unknown &&
4513          "Decision should not be set yet.");
4514   auto *Group = getInterleavedAccessGroup(I);
4515   assert(Group && "Must have a group.");
4516 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
4519   auto &DL = I->getModule()->getDataLayout();
4520   auto *ScalarTy = getMemInstValueType(I);
4521   if (hasIrregularType(ScalarTy, DL, VF))
4522     return false;
4523 
4524   // Check if masking is required.
4525   // A Group may need masking for one of two reasons: it resides in a block that
4526   // needs predication, or it was decided to use masking to deal with gaps.
4527   bool PredicatedAccessRequiresMasking =
4528       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4529   bool AccessWithGapsRequiresMasking =
4530       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4531   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4532     return true;
4533 
4534   // If masked interleaving is required, we expect that the user/target had
4535   // enabled it, because otherwise it either wouldn't have been created or
4536   // it should have been invalidated by the CostModel.
4537   assert(useMaskedInterleavedAccesses(TTI) &&
4538          "Masked interleave-groups for predicated accesses are not enabled.");
4539 
4540   auto *Ty = getMemInstValueType(I);
4541   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4542                           : TTI.isLegalMaskedStore(Ty);
4543 }
4544 
4545 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4546                                                                unsigned VF) {
4547   // Get and ensure we have a valid memory instruction.
4548   LoadInst *LI = dyn_cast<LoadInst>(I);
4549   StoreInst *SI = dyn_cast<StoreInst>(I);
4550   assert((LI || SI) && "Invalid memory instruction");
4551 
4552   auto *Ptr = getLoadStorePointerOperand(I);
4553 
4554   // In order to be widened, the pointer should be consecutive, first of all.
4555   if (!Legal->isConsecutivePtr(Ptr))
4556     return false;
4557 
4558   // If the instruction is a store located in a predicated block, it will be
4559   // scalarized.
4560   if (isScalarWithPredication(I))
4561     return false;
4562 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
4565   auto &DL = I->getModule()->getDataLayout();
4566   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4567   if (hasIrregularType(ScalarTy, DL, VF))
4568     return false;
4569 
4570   return true;
4571 }
4572 
4573 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4574   // We should not collect Uniforms more than once per VF. Right now,
4575   // this function is called from collectUniformsAndScalars(), which
4576   // already does this check. Collecting Uniforms for VF=1 does not make any
4577   // sense.
4578 
4579   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4580          "This function should not be visited twice for the same VF");
4581 
  // Visit the list of Uniforms. If we do not find any uniform value, we will
  // not analyze it again; Uniforms.count(VF) will still return 1.
4584   Uniforms[VF].clear();
4585 
4586   // We now know that the loop is vectorizable!
4587   // Collect instructions inside the loop that will remain uniform after
4588   // vectorization.
4589 
4590   // Global values, params and instructions outside of current loop are out of
4591   // scope.
4592   auto isOutOfScope = [&](Value *V) -> bool {
4593     Instruction *I = dyn_cast<Instruction>(V);
4594     return (!I || !TheLoop->contains(I));
4595   };
4596 
4597   SetVector<Instruction *> Worklist;
4598   BasicBlock *Latch = TheLoop->getLoopLatch();
4599 
4600   // Start with the conditional branch. If the branch condition is an
4601   // instruction contained in the loop that is only used by the branch, it is
4602   // uniform.
4603   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4604   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4605     Worklist.insert(Cmp);
4606     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4607   }
4608 
4609   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4610   // are pointers that are treated like consecutive pointers during
4611   // vectorization. The pointer operands of interleaved accesses are an
4612   // example.
4613   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4614 
4615   // Holds pointer operands of instructions that are possibly non-uniform.
4616   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4617 
4618   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4619     InstWidening WideningDecision = getWideningDecision(I, VF);
4620     assert(WideningDecision != CM_Unknown &&
4621            "Widening decision should be ready at this moment");
4622 
4623     return (WideningDecision == CM_Widen ||
4624             WideningDecision == CM_Widen_Reverse ||
4625             WideningDecision == CM_Interleave);
4626   };
4627   // Iterate over the instructions in the loop, and collect all
4628   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4629   // that a consecutive-like pointer operand will be scalarized, we collect it
4630   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4631   // getelementptr instruction can be used by both vectorized and scalarized
4632   // memory instructions. For example, if a loop loads and stores from the same
4633   // location, but the store is conditional, the store will be scalarized, and
4634   // the getelementptr won't remain uniform.
4635   for (auto *BB : TheLoop->blocks())
4636     for (auto &I : *BB) {
4637       // If there's no pointer operand, there's nothing to do.
4638       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4639       if (!Ptr)
4640         continue;
4641 
4642       // True if all users of Ptr are memory accesses that have Ptr as their
4643       // pointer operand.
4644       auto UsersAreMemAccesses =
4645           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4646             return getLoadStorePointerOperand(U) == Ptr;
4647           });
4648 
4649       // Ensure the memory instruction will not be scalarized or used by
4650       // gather/scatter, making its pointer operand non-uniform. If the pointer
4651       // operand is used by any instruction other than a memory access, we
4652       // conservatively assume the pointer operand may be non-uniform.
4653       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4654         PossibleNonUniformPtrs.insert(Ptr);
4655 
4656       // If the memory instruction will be vectorized and its pointer operand
4657       // is consecutive-like, or interleaving - the pointer operand should
4658       // remain uniform.
4659       else
4660         ConsecutiveLikePtrs.insert(Ptr);
4661     }
4662 
4663   // Add to the Worklist all consecutive and consecutive-like pointers that
4664   // aren't also identified as possibly non-uniform.
4665   for (auto *V : ConsecutiveLikePtrs)
4666     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4667       LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4668       Worklist.insert(V);
4669     }
4670 
  // Expand Worklist in topological order: whenever a new instruction is
  // added, its users should already be inside Worklist. This ensures that a
  // uniform instruction is only used by uniform instructions.
4674   unsigned idx = 0;
4675   while (idx != Worklist.size()) {
4676     Instruction *I = Worklist[idx++];
4677 
4678     for (auto OV : I->operand_values()) {
4679       // isOutOfScope operands cannot be uniform instructions.
4680       if (isOutOfScope(OV))
4681         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4684       auto *OP = dyn_cast<PHINode>(OV);
4685       if (OP && Legal->isFirstOrderRecurrence(OP))
4686         continue;
4687       // If all the users of the operand are uniform, then add the
4688       // operand into the uniform worklist.
4689       auto *OI = cast<Instruction>(OV);
4690       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4691             auto *J = cast<Instruction>(U);
4692             return Worklist.count(J) ||
4693                    (OI == getLoadStorePointerOperand(J) &&
4694                     isUniformDecision(J, VF));
4695           })) {
4696         Worklist.insert(OI);
4697         LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4698       }
4699     }
4700   }
4701 
4702   // Returns true if Ptr is the pointer operand of a memory access instruction
4703   // I, and I is known to not require scalarization.
4704   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4705     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4706   };
4707 
4708   // For an instruction to be added into Worklist above, all its users inside
4709   // the loop should also be in Worklist. However, this condition cannot be
4710   // true for phi nodes that form a cyclic dependence. We must process phi
4711   // nodes separately. An induction variable will remain uniform if all users
4712   // of the induction variable and induction variable update remain uniform.
4713   // The code below handles both pointer and non-pointer induction variables.
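  // For example, a pointer induction variable used only as the address of
  // consecutive loads or stores that will be widened remains uniform, and so
  // does its update instruction.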
4714   for (auto &Induction : *Legal->getInductionVars()) {
4715     auto *Ind = Induction.first;
4716     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4717 
4718     // Determine if all users of the induction variable are uniform after
4719     // vectorization.
4720     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4721       auto *I = cast<Instruction>(U);
4722       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4723              isVectorizedMemAccessUse(I, Ind);
4724     });
4725     if (!UniformInd)
4726       continue;
4727 
4728     // Determine if all users of the induction variable update instruction are
4729     // uniform after vectorization.
4730     auto UniformIndUpdate =
4731         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4732           auto *I = cast<Instruction>(U);
4733           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4734                  isVectorizedMemAccessUse(I, IndUpdate);
4735         });
4736     if (!UniformIndUpdate)
4737       continue;
4738 
4739     // The induction variable and its update instruction will remain uniform.
4740     Worklist.insert(Ind);
4741     Worklist.insert(IndUpdate);
4742     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4743     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4744                       << "\n");
4745   }
4746 
4747   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4748 }
4749 
4750 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4751   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4752 
4753   if (Legal->getRuntimePointerChecking()->Need) {
4754     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4755         "runtime pointer checks needed. Enable vectorization of this "
4756         "loop with '#pragma clang loop vectorize(enable)' when "
4757         "compiling with -Os/-Oz",
4758         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4759     return true;
4760   }
4761 
4762   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4763     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4764         "runtime SCEV checks needed. Enable vectorization of this "
4765         "loop with '#pragma clang loop vectorize(enable)' when "
4766         "compiling with -Os/-Oz",
4767         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4768     return true;
4769   }
4770 
4771   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4772   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4773     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4774         "runtime stride == 1 checks needed. Enable vectorization of "
4775         "this loop with '#pragma clang loop vectorize(enable)' when "
4776         "compiling with -Os/-Oz",
4777         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4778     return true;
4779   }
4780 
4781   return false;
4782 }
4783 
4784 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4785   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
4788     reportVectorizationFailure(
4789         "Not inserting runtime ptr check for divergent target",
4790         "runtime pointer checks needed. Not enabled for divergent target",
4791         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4792     return None;
4793   }
4794 
4795   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4796   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4797   if (TC == 1) {
4798     reportVectorizationFailure("Single iteration (non) loop",
4799         "loop trip count is one, irrelevant for vectorization",
4800         "SingleIterationLoop", ORE, TheLoop);
4801     return None;
4802   }
4803 
4804   switch (ScalarEpilogueStatus) {
4805   case CM_ScalarEpilogueAllowed:
4806     return computeFeasibleMaxVF(TC);
4807   case CM_ScalarEpilogueNotNeededPredicatePragma:
4808     LLVM_DEBUG(
4809         dbgs() << "LV: vector predicate hint found.\n"
4810                << "LV: Not allowing scalar epilogue, creating predicated "
4811                << "vector loop.\n");
4812     break;
4813   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4814     // fallthrough as a special case of OptForSize
4815   case CM_ScalarEpilogueNotAllowedOptSize:
4816     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4817       LLVM_DEBUG(
4818           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4819     else
4820       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4821                         << "count.\n");
4822 
    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
4825     if (runtimeChecksRequired())
4826       return None;
4827     break;
4828   }
4829 
  // Now try tail folding.
4831 
4832   // Invalidate interleave groups that require an epilogue if we can't mask
4833   // the interleave-group.
4834   if (!useMaskedInterleavedAccesses(TTI))
4835     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4836 
4837   unsigned MaxVF = computeFeasibleMaxVF(TC);
4838   if (TC > 0 && TC % MaxVF == 0) {
4839     // Accept MaxVF if we do not have a tail.
4840     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4841     return MaxVF;
4842   }
4843 
4844   // If we don't know the precise trip count, or if the trip count that we
4845   // found modulo the vectorization factor is not zero, try to fold the tail
4846   // by masking.
4847   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
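  // For example, with TC == 100 and MaxVF == 8 there is a 4-iteration tail;
  // if the target can mask all loads and stores, the tail is folded into the
  // vector loop by masking instead of emitting a scalar epilogue.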
4848   if (Legal->canFoldTailByMasking()) {
4849     FoldTailByMasking = true;
4850     return MaxVF;
4851   }
4852 
4853   if (TC == 0) {
4854     reportVectorizationFailure(
4855         "Unable to calculate the loop count due to complex control flow",
4856         "unable to calculate the loop count due to complex control flow",
4857         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4858     return None;
4859   }
4860 
4861   reportVectorizationFailure(
4862       "Cannot optimize for size and vectorize at the same time.",
4863       "cannot optimize for size and vectorize at the same time. "
4864       "Enable vectorization of this loop with '#pragma clang loop "
4865       "vectorize(enable)' when compiling with -Os/-Oz",
4866       "NoTailLoopWithOptForSize", ORE, TheLoop);
4867   return None;
4868 }
4869 
4870 unsigned
4871 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4872   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4873   unsigned SmallestType, WidestType;
4874   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4875   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4876 
4877   // Get the maximum safe dependence distance in bits computed by LAA.
4878   // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
4879   // the memory access that is most restrictive (i.e., the one involved in
4880   // the smallest dependence distance).
4881   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4882 
4883   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4884 
4885   unsigned MaxVectorSize = WidestRegister / WidestType;
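  // Illustrative example (not from the original source): with a 256-bit widest
  // register and a 64-bit widest element type, MaxVectorSize = 256 / 64 = 4.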
4886 
4887   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4888                     << " / " << WidestType << " bits.\n");
4889   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4890                     << WidestRegister << " bits.\n");
4891 
4892   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4893                                  " into one vector!");
4894   if (MaxVectorSize == 0) {
4895     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4896     MaxVectorSize = 1;
4897     return MaxVectorSize;
4898   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4899              isPowerOf2_32(ConstTripCount)) {
4900     // We need to clamp the VF to the constant trip count. There is no point
4901     // in choosing a higher VF, as is done in the loop below.
4902     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4903                       << ConstTripCount << "\n");
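    // For example (illustrative numbers): a constant trip count of 4 with a
    // computed MaxVectorSize of 8 clamps the maximum VF down to 4.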
4904     MaxVectorSize = ConstTripCount;
4905     return MaxVectorSize;
4906   }
4907 
4908   unsigned MaxVF = MaxVectorSize;
4909   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4910       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4911     // Collect all viable vectorization factors larger than the default MaxVF
4912     // (i.e. MaxVectorSize).
4913     SmallVector<unsigned, 8> VFs;
4914     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4915     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4916       VFs.push_back(VS);
4917 
4918     // For each VF calculate its register usage.
4919     auto RUs = calculateRegisterUsage(VFs);
4920 
4921     // Select the largest VF that does not require more registers than the
4922     // target has available.
4923     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4924     for (int i = RUs.size() - 1; i >= 0; --i) {
4925       if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4926         MaxVF = VFs[i];
4927         break;
4928       }
4929     }
4930     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4931       if (MaxVF < MinVF) {
4932         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4933                           << ") with target's minimum: " << MinVF << '\n');
4934         MaxVF = MinVF;
4935       }
4936     }
4937   }
4938   return MaxVF;
4939 }
4940 
4941 VectorizationFactor
4942 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4943   float Cost = expectedCost(1).first;
4944   const float ScalarCost = Cost;
4945   unsigned Width = 1;
4946   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4947 
4948   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4949   if (ForceVectorization && MaxVF > 1) {
4950     // Ignore scalar width, because the user explicitly wants vectorization.
4951     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4952     // evaluation.
4953     Cost = std::numeric_limits<float>::max();
4954   }
4955 
4956   for (unsigned i = 2; i <= MaxVF; i *= 2) {
4957     // Notice that the vector loop needs to execute fewer times, so we
4958     // divide the cost of the vector loop by the vectorization factor (the
4959     // number of scalar iterations it covers per vector iteration).
4960     VectorizationCostTy C = expectedCost(i);
4961     float VectorCost = C.first / (float)i;
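    // Illustrative example (not from the original source): if the scalar loop
    // costs 8 per iteration and the VF=4 body costs 20, the normalized vector
    // cost is 20 / 4 = 5 per scalar iteration, so VF=4 is preferred so far.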
4962     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
4963                       << " costs: " << (int)VectorCost << ".\n");
4964     if (!C.second && !ForceVectorization) {
4965       LLVM_DEBUG(
4966           dbgs() << "LV: Not considering vector loop of width " << i
4967                  << " because it will not generate any vector instructions.\n");
4968       continue;
4969     }
4970     if (VectorCost < Cost) {
4971       Cost = VectorCost;
4972       Width = i;
4973     }
4974   }
4975 
4976   if (!EnableCondStoresVectorization && NumPredStores) {
4977     reportVectorizationFailure("There are conditional stores.",
4978         "store that is conditionally executed prevents vectorization",
4979         "ConditionalStore", ORE, TheLoop);
4980     Width = 1;
4981     Cost = ScalarCost;
4982   }
4983 
4984   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
4985              << "LV: Vectorization seems to be not beneficial, "
4986              << "but was forced by a user.\n");
4987   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
4988   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
4989   return Factor;
4990 }
4991 
4992 std::pair<unsigned, unsigned>
4993 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4994   unsigned MinWidth = -1U;
4995   unsigned MaxWidth = 8;
4996   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
4997 
4998   // For each block.
4999   for (BasicBlock *BB : TheLoop->blocks()) {
5000     // For each instruction in the loop.
5001     for (Instruction &I : BB->instructionsWithoutDebug()) {
5002       Type *T = I.getType();
5003 
5004       // Skip ignored values.
5005       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5006         continue;
5007 
5008       // Only examine Loads, Stores and PHINodes.
5009       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5010         continue;
5011 
5012       // Examine PHI nodes that are reduction variables. Update the type to
5013       // account for the recurrence type.
5014       if (auto *PN = dyn_cast<PHINode>(&I)) {
5015         if (!Legal->isReductionVariable(PN))
5016           continue;
5017         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5018         T = RdxDesc.getRecurrenceType();
5019       }
5020 
5021       // Examine the stored values.
5022       if (auto *ST = dyn_cast<StoreInst>(&I))
5023         T = ST->getValueOperand()->getType();
5024 
5025       // Ignore loaded pointer types and stored pointer types that are not
5026       // vectorizable.
5027       //
5028       // FIXME: The check here attempts to predict whether a load or store will
5029       //        be vectorized. We only know this for certain after a VF has
5030       //        been selected. Here, we assume that if an access can be
5031       //        vectorized, it will be. We should also look at extending this
5032       //        optimization to non-pointer types.
5033       //
5034       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5035           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5036         continue;
5037 
5038       MinWidth = std::min(MinWidth,
5039                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5040       MaxWidth = std::max(MaxWidth,
5041                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5042     }
5043   }
5044 
5045   return {MinWidth, MaxWidth};
5046 }
5047 
5048 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5049                                                            unsigned LoopCost) {
5050   // -- The interleave heuristics --
5051   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5052   // There are many micro-architectural considerations that we can't predict
5053   // at this level. For example, frontend pressure (on decode or fetch) due to
5054   // code size, or the number and capabilities of the execution ports.
5055   //
5056   // We use the following heuristics to select the interleave count:
5057   // 1. If the code has reductions, then we interleave to break the cross
5058   // iteration dependency.
5059   // 2. If the loop is really small, then we interleave to reduce the loop
5060   // overhead.
5061   // 3. We don't interleave if we think that we will spill registers to memory
5062   // due to the increased register pressure.
5063 
5064   if (!isScalarEpilogueAllowed())
5065     return 1;
5066 
5067   // The max safe dependence distance already limits the VF; do not interleave.
5068   if (Legal->getMaxSafeDepDistBytes() != -1U)
5069     return 1;
5070 
5071   // Do not interleave loops with a relatively small trip count.
5072   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5073   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
5074     return 1;
5075 
5076   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
5077   LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5078                     << " registers\n");
5079 
5080   if (VF == 1) {
5081     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5082       TargetNumRegisters = ForceTargetNumScalarRegs;
5083   } else {
5084     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5085       TargetNumRegisters = ForceTargetNumVectorRegs;
5086   }
5087 
5088   RegisterUsage R = calculateRegisterUsage({VF})[0];
5089   // We divide by this value below, so make sure it is at least one: assume
5090   // that at least one instruction uses at least one register.
5091   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5092 
5093   // We calculate the interleave count using the following formula.
5094   // Subtract the number of loop invariants from the number of available
5095   // registers. These registers are used by all of the interleaved instances.
5096   // Next, divide the remaining registers by the number of registers that is
5097   // required by the loop, in order to estimate how many parallel instances
5098   // fit without causing spills. All of this is rounded down if necessary to be
5099   // a power of two. We want a power-of-two interleave count to simplify any
5100   // addressing operations and alignment considerations. We also want a
5101   // power-of-two interleave count to ensure that the induction variable of
5102   // the vector loop wraps to zero when the tail is folded by masking; this
5103   // currently happens under OptForSize, in which case IC is set to 1 above.
5104   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5105                               R.MaxLocalUsers);
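  // Illustrative example (not from the original source): with 16 target
  // registers, 2 loop-invariant registers and at most 3 local register users,
  // IC = PowerOf2Floor((16 - 2) / 3) = PowerOf2Floor(4) = 4.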
5106 
5107   // Don't count the induction variable as interleaved.
5108   if (EnableIndVarRegisterHeur)
5109     IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5110                        std::max(1U, (R.MaxLocalUsers - 1)));
5111 
5112   // Clamp the interleave ranges to reasonable counts.
5113   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5114 
5115   // Check if the user has overridden the max.
5116   if (VF == 1) {
5117     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5118       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5119   } else {
5120     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5121       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5122   }
5123 
5124   // If the trip count is constant, limit the interleave count to at most the
5125   // trip count divided by VF.
5126   if (TC > 0) {
5127     assert(TC >= VF && "VF exceeds trip count?");
5128     if ((TC / VF) < MaxInterleaveCount)
5129       MaxInterleaveCount = (TC / VF);
5130   }
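  // For example (illustrative numbers): a constant trip count of 16 with
  // VF = 4 limits MaxInterleaveCount to at most 16 / 4 = 4.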
5131 
5132   // If we did not calculate the cost for VF (because the user selected the VF)
5133   // then we calculate the cost of VF here.
5134   if (LoopCost == 0)
5135     LoopCost = expectedCost(VF).first;
5136 
5137   assert(LoopCost && "Non-zero loop cost expected");
5138 
5139   // Clamp the calculated IC to be between 1 and the maximum interleave count
5140   // that the target and trip count allow.
5141   if (IC > MaxInterleaveCount)
5142     IC = MaxInterleaveCount;
5143   else if (IC < 1)
5144     IC = 1;
5145 
5146   // Interleave if we vectorized this loop and there is a reduction that could
5147   // benefit from interleaving.
5148   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5149     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5150     return IC;
5151   }
5152 
5153   // Note that if we've already vectorized the loop we will have done the
5154   // runtime check and so interleaving won't require further checks.
5155   bool InterleavingRequiresRuntimePointerCheck =
5156       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5157 
5158   // We want to interleave small loops in order to reduce the loop overhead and
5159   // potentially expose ILP opportunities.
5160   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5161   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5162     // We assume that the per-iteration loop overhead costs 1, and we use the
5163     // cost model's estimate of the loop body cost to interleave until the loop
5164     // overhead is about 5% of the total cost of the loop.
5165     unsigned SmallIC =
5166         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
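    // Illustrative example (not from the original source): if SmallLoopCost is
    // 20 and the estimated loop cost is 4, then SmallIC becomes
    // min(IC, PowerOf2Floor(20 / 4)) = min(IC, 4).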
5167 
5168     // Interleave until store/load ports (estimated by max interleave count) are
5169     // saturated.
5170     unsigned NumStores = Legal->getNumStores();
5171     unsigned NumLoads = Legal->getNumLoads();
5172     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5173     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5174 
5175     // If we have a scalar reduction (vector reductions are already dealt with
5176     // by this point), we can increase the critical path length if the loop
5177     // we're interleaving is inside another loop. Limit, by default, to 2 so
5178     // that the critical path only grows by one reduction operation.
5179     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5180       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5181       SmallIC = std::min(SmallIC, F);
5182       StoresIC = std::min(StoresIC, F);
5183       LoadsIC = std::min(LoadsIC, F);
5184     }
5185 
5186     if (EnableLoadStoreRuntimeInterleave &&
5187         std::max(StoresIC, LoadsIC) > SmallIC) {
5188       LLVM_DEBUG(
5189           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5190       return std::max(StoresIC, LoadsIC);
5191     }
5192 
5193     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5194     return SmallIC;
5195   }
5196 
5197   // Interleave if this is a large loop (small loops are already dealt with by
5198   // this point) that could benefit from interleaving.
5199   bool HasReductions = !Legal->getReductionVars()->empty();
5200   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5201     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5202     return IC;
5203   }
5204 
5205   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5206   return 1;
5207 }
5208 
5209 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5210 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5211   // This function calculates the register usage by measuring the highest number
5212   // of values that are alive at a single location. Obviously, this is a very
5213   // rough estimation. We scan the loop in topological order and assign a
5214   // number to each instruction. We use RPO to ensure that defs are
5215   // met before their users. We assume that each instruction that has in-loop
5216   // users starts an interval. We record every time that an in-loop value is
5217   // used, so we have a list of the first and last occurrences of each
5218   // instruction. Next, we transpose this data structure into a multi map that
5219   // holds the list of intervals that *end* at a specific location. This multi
5220   // map allows us to perform a linear search. We scan the instructions linearly
5221   // and record each time that a new interval starts, by placing it in a set.
5222   // If we find this value in the multi-map then we remove it from the set.
5223   // The max register usage is the maximum size of the set.
5224   // We also search for instructions that are defined outside the loop, but are
5225   // used inside the loop. We need this number separately from the max-interval
5226   // usage number because, when we unroll, loop-invariant values do not
5227   // consume additional registers.
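  //
  // Illustrative example (not part of the original description): in a
  // straight-line sequence
  //   %a = ...
  //   %b = f(%a)
  //   %c = g(%a, %b)
  // the intervals of %a and %b both end at their last use in %c, so both
  // values are open (live) at the same time when %c is reached.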
5228   LoopBlocksDFS DFS(TheLoop);
5229   DFS.perform(LI);
5230 
5231   RegisterUsage RU;
5232 
5233   // Each 'key' in the map opens a new interval. The values
5234   // of the map are the index of the 'last seen' usage of the
5235   // instruction that is the key.
5236   using IntervalMap = DenseMap<Instruction *, unsigned>;
5237 
5238   // Maps instruction to its index.
5239   SmallVector<Instruction *, 64> IdxToInstr;
5240   // Marks the end of each interval.
5241   IntervalMap EndPoint;
5242   // Saves the list of instruction indices that are used in the loop.
5243   SmallPtrSet<Instruction *, 8> Ends;
5244   // Saves the list of values that are used in the loop but are
5245   // defined outside the loop, such as arguments and constants.
5246   SmallPtrSet<Value *, 8> LoopInvariants;
5247 
5248   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5249     for (Instruction &I : BB->instructionsWithoutDebug()) {
5250       IdxToInstr.push_back(&I);
5251 
5252       // Save the end location of each USE.
5253       for (Value *U : I.operands()) {
5254         auto *Instr = dyn_cast<Instruction>(U);
5255 
5256         // Ignore non-instruction values such as arguments, constants, etc.
5257         if (!Instr)
5258           continue;
5259 
5260         // If this instruction is outside the loop then record it and continue.
5261         if (!TheLoop->contains(Instr)) {
5262           LoopInvariants.insert(Instr);
5263           continue;
5264         }
5265 
5266         // Overwrite previous end points.
5267         EndPoint[Instr] = IdxToInstr.size();
5268         Ends.insert(Instr);
5269       }
5270     }
5271   }
5272 
5273   // Saves the list of intervals that end with the index in 'key'.
5274   using InstrList = SmallVector<Instruction *, 2>;
5275   DenseMap<unsigned, InstrList> TransposeEnds;
5276 
5277   // Transpose the EndPoints to a list of values that end at each index.
5278   for (auto &Interval : EndPoint)
5279     TransposeEnds[Interval.second].push_back(Interval.first);
5280 
5281   SmallPtrSet<Instruction *, 8> OpenIntervals;
5282 
5283   // Get the size of the widest register.
5284   unsigned MaxSafeDepDist = -1U;
5285   if (Legal->getMaxSafeDepDistBytes() != -1U)
5286     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5287   unsigned WidestRegister =
5288       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5289   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5290 
5291   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5292   SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5293 
5294   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5295 
5296   // A lambda that gets the register usage for the given type and VF.
5297   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5298     if (Ty->isTokenTy())
5299       return 0U;
5300     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5301     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5302   };
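  // For example (illustrative numbers): with a 128-bit widest register, an i32
  // value at VF = 8 occupies max(1, 8 * 32 / 128) = 2 registers.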
5303 
5304   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5305     Instruction *I = IdxToInstr[i];
5306 
5307     // Remove all of the instructions that end at this location.
5308     InstrList &List = TransposeEnds[i];
5309     for (Instruction *ToRemove : List)
5310       OpenIntervals.erase(ToRemove);
5311 
5312     // Ignore instructions that are never used within the loop.
5313     if (Ends.find(I) == Ends.end())
5314       continue;
5315 
5316     // Skip ignored values.
5317     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5318       continue;
5319 
5320     // For each VF find the maximum usage of registers.
5321     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5322       if (VFs[j] == 1) {
5323         MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5324         continue;
5325       }
5326       collectUniformsAndScalars(VFs[j]);
5327       // Count the number of live intervals.
5328       unsigned RegUsage = 0;
5329       for (auto Inst : OpenIntervals) {
5330         // Skip ignored values for VF > 1.
5331         if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5332             isScalarAfterVectorization(Inst, VFs[j]))
5333           continue;
5334         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5335       }
5336       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5337     }
5338 
5339     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5340                       << OpenIntervals.size() << '\n');
5341 
5342     // Add the current instruction to the list of open intervals.
5343     OpenIntervals.insert(I);
5344   }
5345 
5346   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5347     unsigned Invariant = 0;
5348     if (VFs[i] == 1)
5349       Invariant = LoopInvariants.size();
5350     else {
5351       for (auto Inst : LoopInvariants)
5352         Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5353     }
5354 
5355     LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5356     LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5357     LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5358                       << '\n');
5359 
5360     RU.LoopInvariantRegs = Invariant;
5361     RU.MaxLocalUsers = MaxUsages[i];
5362     RUs[i] = RU;
5363   }
5364 
5365   return RUs;
5366 }
5367 
5368 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5369   // TODO: The cost model for emulated masked loads/stores is completely
5370   // broken. This hack guides the cost model to use an artificially high
5371   // value, practically disabling vectorization with such operations, except
5372   // in cases where the previously deployed legality hack allowed them at
5373   // very low cost. This avoids regressions that would otherwise come simply
5374   // from moving the "masked load/store" check from legality to the cost
5375   // model. Masked load/gather emulation was previously never allowed;
5376   // emulation of a limited number of masked stores/scatters was.
5377   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5378   return isa<LoadInst>(I) ||
5379          (isa<StoreInst>(I) &&
5380           NumPredStores > NumberOfStoresToPredicate);
5381 }
5382 
5383 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5384   // If we aren't vectorizing the loop, or if we've already collected the
5385   // instructions to scalarize, there's nothing to do. Collection may already
5386   // have occurred if we have a user-selected VF and are now computing the
5387   // expected cost for interleaving.
5388   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5389     return;
5390 
5391   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5392   // not profitable to scalarize any instructions, the presence of VF in the
5393   // map will indicate that we've analyzed it already.
5394   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5395 
5396   // Find all the instructions that are scalar with predication in the loop and
5397   // determine whether it would be better not to if-convert the blocks they are
5398   // in. If so, we also record the instructions to scalarize.
5399   for (BasicBlock *BB : TheLoop->blocks()) {
5400     if (!blockNeedsPredication(BB))
5401       continue;
5402     for (Instruction &I : *BB)
5403       if (isScalarWithPredication(&I)) {
5404         ScalarCostsTy ScalarCosts;
5405         // Do not apply discount logic if hacked cost is needed
5406         // for emulated masked memrefs.
5407         if (!useEmulatedMaskMemRefHack(&I) &&
5408             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5409           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5410         // Remember that BB will remain after vectorization.
5411         PredicatedBBsAfterVectorization.insert(BB);
5412       }
5413   }
5414 }
5415 
5416 int LoopVectorizationCostModel::computePredInstDiscount(
5417     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5418     unsigned VF) {
5419   assert(!isUniformAfterVectorization(PredInst, VF) &&
5420          "Instruction marked uniform-after-vectorization will be predicated");
5421 
5422   // Initialize the discount to zero, meaning that the scalar version and the
5423   // vector version cost the same.
5424   int Discount = 0;
5425 
5426   // Holds instructions to analyze. The instructions we visit are mapped in
5427   // ScalarCosts. Those instructions are the ones that would be scalarized if
5428   // we find that the scalar version costs less.
5429   SmallVector<Instruction *, 8> Worklist;
5430 
5431   // Returns true if the given instruction can be scalarized.
5432   auto canBeScalarized = [&](Instruction *I) -> bool {
5433     // We only attempt to scalarize instructions forming a single-use chain
5434     // from the original predicated block that would otherwise be vectorized.
5435     // Although not strictly necessary, we give up on instructions we know will
5436     // already be scalar to avoid traversing chains that are unlikely to be
5437     // beneficial.
5438     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5439         isScalarAfterVectorization(I, VF))
5440       return false;
5441 
5442     // If the instruction is scalar with predication, it will be analyzed
5443     // separately. We ignore it within the context of PredInst.
5444     if (isScalarWithPredication(I))
5445       return false;
5446 
5447     // If any of the instruction's operands are uniform after vectorization,
5448     // the instruction cannot be scalarized. This prevents, for example, a
5449     // masked load from being scalarized.
5450     //
5451     // We assume we will only emit a value for lane zero of an instruction
5452     // marked uniform after vectorization, rather than VF identical values.
5453     // Thus, if we scalarize an instruction that uses a uniform, we would
5454     // create uses of values corresponding to the lanes we aren't emitting code
5455     // for. This behavior can be changed by allowing getScalarValue to clone
5456     // the lane zero values for uniforms rather than asserting.
5457     for (Use &U : I->operands())
5458       if (auto *J = dyn_cast<Instruction>(U.get()))
5459         if (isUniformAfterVectorization(J, VF))
5460           return false;
5461 
5462     // Otherwise, we can scalarize the instruction.
5463     return true;
5464   };
5465 
5466   // Compute the expected cost discount from scalarizing the entire expression
5467   // feeding the predicated instruction. We currently only consider expressions
5468   // that are single-use instruction chains.
5469   Worklist.push_back(PredInst);
5470   while (!Worklist.empty()) {
5471     Instruction *I = Worklist.pop_back_val();
5472 
5473     // If we've already analyzed the instruction, there's nothing to do.
5474     if (ScalarCosts.find(I) != ScalarCosts.end())
5475       continue;
5476 
5477     // Compute the cost of the vector instruction. Note that this cost already
5478     // includes the scalarization overhead of the predicated instruction.
5479     unsigned VectorCost = getInstructionCost(I, VF).first;
5480 
5481     // Compute the cost of the scalarized instruction. This cost is the cost of
5482     // the instruction as if it wasn't if-converted and instead remained in the
5483     // predicated block. We will scale this cost by block probability after
5484     // computing the scalarization overhead.
5485     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5486 
5487     // Compute the scalarization overhead of needed insertelement instructions
5488     // and phi nodes.
5489     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5490       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5491                                                  true, false);
5492       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5493     }
5494 
5495     // Compute the scalarization overhead of needed extractelement
5496     // instructions. For each of the instruction's operands, if the operand can
5497     // be scalarized, add it to the worklist; otherwise, account for the
5498     // overhead.
5499     for (Use &U : I->operands())
5500       if (auto *J = dyn_cast<Instruction>(U.get())) {
5501         assert(VectorType::isValidElementType(J->getType()) &&
5502                "Instruction has non-scalar type");
5503         if (canBeScalarized(J))
5504           Worklist.push_back(J);
5505         else if (needsExtract(J, VF))
5506           ScalarCost += TTI.getScalarizationOverhead(
5507                               ToVectorTy(J->getType(),VF), false, true);
5508       }
5509 
5510     // Scale the total scalar cost by block probability.
5511     ScalarCost /= getReciprocalPredBlockProb();
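    // For example (illustrative numbers): with an assumed reciprocal block
    // probability of 2 (the predicated block executes about half the time), a
    // summed scalar cost of 10 is scaled down to 5.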
5512 
5513     // Compute the discount. A positive discount means the vector version of
5514     // the instruction costs more, so scalarizing it would be beneficial.
5515     Discount += VectorCost - ScalarCost;
5516     ScalarCosts[I] = ScalarCost;
5517   }
5518 
5519   return Discount;
5520 }
5521 
5522 LoopVectorizationCostModel::VectorizationCostTy
5523 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5524   VectorizationCostTy Cost;
5525 
5526   // For each block.
5527   for (BasicBlock *BB : TheLoop->blocks()) {
5528     VectorizationCostTy BlockCost;
5529 
5530     // For each instruction in the old loop.
5531     for (Instruction &I : BB->instructionsWithoutDebug()) {
5532       // Skip ignored values.
5533       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5534           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5535         continue;
5536 
5537       VectorizationCostTy C = getInstructionCost(&I, VF);
5538 
5539       // Check if we should override the cost.
5540       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5541         C.first = ForceTargetInstructionCost;
5542 
5543       BlockCost.first += C.first;
5544       BlockCost.second |= C.second;
5545       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5546                         << " for VF " << VF << " For instruction: " << I
5547                         << '\n');
5548     }
5549 
5550     // If we are vectorizing a predicated block, it will have been
5551     // if-converted. This means that the block's instructions (aside from
5552     // stores and instructions that may divide by zero) will now be
5553     // unconditionally executed. For the scalar case, we may not always execute
5554     // the predicated block. Thus, scale the block's cost by the probability of
5555     // executing it.
5556     if (VF == 1 && blockNeedsPredication(BB))
5557       BlockCost.first /= getReciprocalPredBlockProb();
5558 
5559     Cost.first += BlockCost.first;
5560     Cost.second |= BlockCost.second;
5561   }
5562 
5563   return Cost;
5564 }
5565 
5566 /// Gets the address access SCEV after verifying that the access pattern is
5567 /// loop-invariant except for the induction variable dependence.
5568 ///
5569 /// This SCEV can be sent to the Target in order to estimate the address
5570 /// calculation cost.
5571 static const SCEV *getAddressAccessSCEV(
5572               Value *Ptr,
5573               LoopVectorizationLegality *Legal,
5574               PredicatedScalarEvolution &PSE,
5575               const Loop *TheLoop) {
5576 
5577   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5578   if (!Gep)
5579     return nullptr;
5580 
5581   // We are looking for a gep with all loop invariant indices except for one
5582   // which should be an induction variable.
5583   auto SE = PSE.getSE();
5584   unsigned NumOperands = Gep->getNumOperands();
5585   for (unsigned i = 1; i < NumOperands; ++i) {
5586     Value *Opd = Gep->getOperand(i);
5587     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5588         !Legal->isInductionVariable(Opd))
5589       return nullptr;
5590   }
5591 
5592   // We have a GEP of the form (ptr, %inv, %ind, %inv); return the Ptr SCEV.
5593   return PSE.getSCEV(Ptr);
5594 }
5595 
5596 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5597   return Legal->hasStride(I->getOperand(0)) ||
5598          Legal->hasStride(I->getOperand(1));
5599 }
5600 
5601 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5602                                                                  unsigned VF) {
5603   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5604   Type *ValTy = getMemInstValueType(I);
5605   auto SE = PSE.getSE();
5606 
5607   unsigned Alignment = getLoadStoreAlignment(I);
5608   unsigned AS = getLoadStoreAddressSpace(I);
5609   Value *Ptr = getLoadStorePointerOperand(I);
5610   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5611 
5612   // Figure out whether the access is strided and get the stride value
5613   // if it's known at compile time.
5614   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5615 
5616   // Get the cost of the scalar memory instruction and address computation.
5617   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5618 
5619   // Don't pass *I here, since it is scalar but will actually end up in a
5620   // vectorized loop where its user is a vectorized instruction.
5621   Cost += VF *
5622           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5623                               AS);
5624 
5625   // Get the overhead of the extractelement and insertelement instructions
5626   // we might create due to scalarization.
5627   Cost += getScalarizationOverhead(I, VF);
5628 
5629   // If we have a predicated store, it may not be executed for each vector
5630   // lane. Scale the cost by the probability of executing the predicated
5631   // block.
5632   if (isPredicatedInst(I)) {
5633     Cost /= getReciprocalPredBlockProb();
5634 
5635     if (useEmulatedMaskMemRefHack(I))
5636       // Artificially setting to a high enough value to practically disable
5637       // vectorization with such operations.
5638       Cost = 3000000;
5639   }
5640 
5641   return Cost;
5642 }
5643 
5644 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5645                                                              unsigned VF) {
5646   Type *ValTy = getMemInstValueType(I);
5647   Type *VectorTy = ToVectorTy(ValTy, VF);
5648   unsigned Alignment = getLoadStoreAlignment(I);
5649   Value *Ptr = getLoadStorePointerOperand(I);
5650   unsigned AS = getLoadStoreAddressSpace(I);
5651   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5652 
5653   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5654          "Stride should be 1 or -1 for consecutive memory access");
5655   unsigned Cost = 0;
5656   if (Legal->isMaskRequired(I))
5657     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5658   else
5659     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5660 
5661   bool Reverse = ConsecutiveStride < 0;
5662   if (Reverse)
5663     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5664   return Cost;
5665 }
5666 
5667 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5668                                                          unsigned VF) {
5669   Type *ValTy = getMemInstValueType(I);
5670   Type *VectorTy = ToVectorTy(ValTy, VF);
5671   unsigned Alignment = getLoadStoreAlignment(I);
5672   unsigned AS = getLoadStoreAddressSpace(I);
5673   if (isa<LoadInst>(I)) {
5674     return TTI.getAddressComputationCost(ValTy) +
5675            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5676            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5677   }
5678   StoreInst *SI = cast<StoreInst>(I);
5679 
5680   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5681   return TTI.getAddressComputationCost(ValTy) +
5682          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5683          (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5684                                                Instruction::ExtractElement,
5685                                                VectorTy, VF - 1));
5686 }
5687 
5688 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5689                                                           unsigned VF) {
5690   Type *ValTy = getMemInstValueType(I);
5691   Type *VectorTy = ToVectorTy(ValTy, VF);
5692   unsigned Alignment = getLoadStoreAlignment(I);
5693   Value *Ptr = getLoadStorePointerOperand(I);
5694 
5695   return TTI.getAddressComputationCost(VectorTy) +
5696          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5697                                     Legal->isMaskRequired(I), Alignment);
5698 }
5699 
5700 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5701                                                             unsigned VF) {
5702   Type *ValTy = getMemInstValueType(I);
5703   Type *VectorTy = ToVectorTy(ValTy, VF);
5704   unsigned AS = getLoadStoreAddressSpace(I);
5705 
5706   auto Group = getInterleavedAccessGroup(I);
5707   assert(Group && "Fail to get an interleaved access group.");
5708 
5709   unsigned InterleaveFactor = Group->getFactor();
5710   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5711 
5712   // Holds the indices of existing members in an interleaved load group.
5713   // An interleaved store group doesn't need this as it doesn't allow gaps.
5714   SmallVector<unsigned, 4> Indices;
5715   if (isa<LoadInst>(I)) {
5716     for (unsigned i = 0; i < InterleaveFactor; i++)
5717       if (Group->getMember(i))
5718         Indices.push_back(i);
5719   }
5720 
5721   // Calculate the cost of the whole interleaved group.
5722   bool UseMaskForGaps =
5723       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5724   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5725       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5726       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5727 
5728   if (Group->isReverse()) {
5729     // TODO: Add support for reversed masked interleaved access.
5730     assert(!Legal->isMaskRequired(I) &&
5731            "Reverse masked interleaved access not supported.");
5732     Cost += Group->getNumMembers() *
5733             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5734   }
5735   return Cost;
5736 }
5737 
5738 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5739                                                               unsigned VF) {
5740   // Calculate the scalar cost only; the vectorization cost has already been
5741   // computed by this point.
5742   if (VF == 1) {
5743     Type *ValTy = getMemInstValueType(I);
5744     unsigned Alignment = getLoadStoreAlignment(I);
5745     unsigned AS = getLoadStoreAddressSpace(I);
5746 
5747     return TTI.getAddressComputationCost(ValTy) +
5748            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5749   }
5750   return getWideningCost(I, VF);
5751 }
5752 
5753 LoopVectorizationCostModel::VectorizationCostTy
5754 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5755   // If we know that this instruction will remain uniform, check the cost of
5756   // the scalar version.
5757   if (isUniformAfterVectorization(I, VF))
5758     VF = 1;
5759 
5760   if (VF > 1 && isProfitableToScalarize(I, VF))
5761     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5762 
5763   // Forced scalars do not have any scalarization overhead.
5764   auto ForcedScalar = ForcedScalars.find(VF);
5765   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5766     auto InstSet = ForcedScalar->second;
5767     if (InstSet.find(I) != InstSet.end())
5768       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5769   }
5770 
5771   Type *VectorTy;
5772   unsigned C = getInstructionCost(I, VF, VectorTy);
5773 
5774   bool TypeNotScalarized =
5775       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
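  // For example (illustrative): a <4 x i32> that fits in a single 128-bit
  // register is legalized into one part, which is fewer than VF = 4, so the
  // type counts as genuinely vectorized rather than scalarized.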
5776   return VectorizationCostTy(C, TypeNotScalarized);
5777 }
5778 
5779 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5780                                                               unsigned VF) {
5781 
5782   if (VF == 1)
5783     return 0;
5784 
5785   unsigned Cost = 0;
5786   Type *RetTy = ToVectorTy(I->getType(), VF);
5787   if (!RetTy->isVoidTy() &&
5788       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5789     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5790 
5791   // Some targets keep addresses scalar.
5792   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5793     return Cost;
5794 
5795   // Some targets support efficient element stores.
5796   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5797     return Cost;
5798 
5799   // Collect operands to consider.
5800   CallInst *CI = dyn_cast<CallInst>(I);
5801   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5802 
5803   // Skip operands that do not require extraction/scalarization and do not incur
5804   // any overhead.
5805   return Cost + TTI.getOperandsScalarizationOverhead(
5806                     filterExtractingOperands(Ops, VF), VF);
5807 }
5808 
5809 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5810   if (VF == 1)
5811     return;
5812   NumPredStores = 0;
5813   for (BasicBlock *BB : TheLoop->blocks()) {
5814     // For each instruction in the old loop.
5815     for (Instruction &I : *BB) {
5816       Value *Ptr =  getLoadStorePointerOperand(&I);
5817       if (!Ptr)
5818         continue;
5819 
5820       // TODO: We should generate better code and update the cost model for
5821       // predicated uniform stores. Today they are treated as any other
5822       // predicated store (see added test cases in
5823       // invariant-store-vectorization.ll).
5824       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5825         NumPredStores++;
5826 
5827       if (Legal->isUniform(Ptr) &&
5828           // Conditional loads and stores should be scalarized and predicated.
5829           // isScalarWithPredication cannot be used here since masked
5830           // gather/scatters are not considered scalar with predication.
5831           !Legal->blockNeedsPredication(I.getParent())) {
5832         // TODO: Avoid replicating loads and stores instead of
5833         // relying on instcombine to remove them.
5834         // Load: Scalar load + broadcast
5835         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5836         unsigned Cost = getUniformMemOpCost(&I, VF);
5837         setWideningDecision(&I, VF, CM_Scalarize, Cost);
5838         continue;
5839       }
5840 
5841       // We assume that widening is the best solution when possible.
5842       if (memoryInstructionCanBeWidened(&I, VF)) {
5843         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5844         int ConsecutiveStride =
5845                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5846         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5847                "Expected consecutive stride.");
5848         InstWidening Decision =
5849             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5850         setWideningDecision(&I, VF, Decision, Cost);
5851         continue;
5852       }
5853 
5854       // Choose between Interleaving, Gather/Scatter or Scalarization.
5855       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5856       unsigned NumAccesses = 1;
5857       if (isAccessInterleaved(&I)) {
5858         auto Group = getInterleavedAccessGroup(&I);
5859         assert(Group && "Fail to get an interleaved access group.");
5860 
5861         // Make one decision for the whole group.
5862         if (getWideningDecision(&I, VF) != CM_Unknown)
5863           continue;
5864 
5865         NumAccesses = Group->getNumMembers();
5866         if (interleavedAccessCanBeWidened(&I, VF))
5867           InterleaveCost = getInterleaveGroupCost(&I, VF);
5868       }
5869 
5870       unsigned GatherScatterCost =
5871           isLegalGatherOrScatter(&I)
5872               ? getGatherScatterCost(&I, VF) * NumAccesses
5873               : std::numeric_limits<unsigned>::max();
5874 
5875       unsigned ScalarizationCost =
5876           getMemInstScalarizationCost(&I, VF) * NumAccesses;
5877 
5878       // Choose better solution for the current VF,
5879       // write down this decision and use it during vectorization.
5880       unsigned Cost;
5881       InstWidening Decision;
5882       if (InterleaveCost <= GatherScatterCost &&
5883           InterleaveCost < ScalarizationCost) {
5884         Decision = CM_Interleave;
5885         Cost = InterleaveCost;
5886       } else if (GatherScatterCost < ScalarizationCost) {
5887         Decision = CM_GatherScatter;
5888         Cost = GatherScatterCost;
5889       } else {
5890         Decision = CM_Scalarize;
5891         Cost = ScalarizationCost;
5892       }
5893       // If the instruction belongs to an interleave group, the whole group
5894       // receives the same decision. The cost is recorded for the group, but
5895       // it is actually charged to a single member of the group.
5896       if (auto Group = getInterleavedAccessGroup(&I))
5897         setWideningDecision(Group, VF, Decision, Cost);
5898       else
5899         setWideningDecision(&I, VF, Decision, Cost);
5900     }
5901   }
5902 
5903   // Make sure that any load of address and any other address computation
5904   // remains scalar unless there is gather/scatter support. This avoids
5905   // inevitable extracts into address registers, and also has the benefit of
5906   // activating LSR more, since that pass can't optimize vectorized
5907   // addresses.
5908   if (TTI.prefersVectorizedAddressing())
5909     return;
5910 
5911   // Start with all scalar pointer uses.
5912   SmallPtrSet<Instruction *, 8> AddrDefs;
5913   for (BasicBlock *BB : TheLoop->blocks())
5914     for (Instruction &I : *BB) {
5915       Instruction *PtrDef =
5916         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5917       if (PtrDef && TheLoop->contains(PtrDef) &&
5918           getWideningDecision(&I, VF) != CM_GatherScatter)
5919         AddrDefs.insert(PtrDef);
5920     }
5921 
5922   // Add all instructions used to generate the addresses.
5923   SmallVector<Instruction *, 4> Worklist;
5924   for (auto *I : AddrDefs)
5925     Worklist.push_back(I);
5926   while (!Worklist.empty()) {
5927     Instruction *I = Worklist.pop_back_val();
5928     for (auto &Op : I->operands())
5929       if (auto *InstOp = dyn_cast<Instruction>(Op))
5930         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5931             AddrDefs.insert(InstOp).second)
5932           Worklist.push_back(InstOp);
5933   }
5934 
5935   for (auto *I : AddrDefs) {
5936     if (isa<LoadInst>(I)) {
5937       // Setting the desired widening decision should ideally be handled by
5938       // the cost functions, but since that would require knowing whether the
5939       // loaded value is involved in an address computation, the decision is
5940       // instead changed here, where that is known to be the case.
5941       InstWidening Decision = getWideningDecision(I, VF);
5942       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5943         // Scalarize a widened load of address.
5944         setWideningDecision(I, VF, CM_Scalarize,
5945                             (VF * getMemoryInstructionCost(I, 1)));
5946       else if (auto Group = getInterleavedAccessGroup(I)) {
5947         // Scalarize an interleave group of address loads.
5948         for (unsigned I = 0; I < Group->getFactor(); ++I) {
5949           if (Instruction *Member = Group->getMember(I))
5950             setWideningDecision(Member, VF, CM_Scalarize,
5951                                 (VF * getMemoryInstructionCost(Member, 1)));
5952         }
5953       }
5954     } else
5955       // Make sure I gets scalarized and a cost estimate without
5956       // scalarization overhead.
5957       ForcedScalars[VF].insert(I);
5958   }
5959 }
5960 
5961 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
5962                                                         unsigned VF,
5963                                                         Type *&VectorTy) {
5964   Type *RetTy = I->getType();
5965   if (canTruncateToMinimalBitwidth(I, VF))
5966     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
5967   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
5968   auto SE = PSE.getSE();
5969 
5970   // TODO: We need to estimate the cost of intrinsic calls.
5971   switch (I->getOpcode()) {
5972   case Instruction::GetElementPtr:
5973     // We mark this instruction as zero-cost because the cost of GEPs in
5974     // vectorized code depends on whether the corresponding memory instruction
5975     // is scalarized or not. Therefore, we handle GEPs with the memory
5976     // instruction cost.
5977     return 0;
5978   case Instruction::Br: {
5979     // In cases of scalarized and predicated instructions, there will be VF
5980     // predicated blocks in the vectorized loop. Each branch around these
5981     // blocks also requires an extract of its vector compare i1 element.
5982     bool ScalarPredicatedBB = false;
5983     BranchInst *BI = cast<BranchInst>(I);
5984     if (VF > 1 && BI->isConditional() &&
5985         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
5986              PredicatedBBsAfterVectorization.end() ||
5987          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
5988              PredicatedBBsAfterVectorization.end()))
5989       ScalarPredicatedBB = true;
5990 
5991     if (ScalarPredicatedBB) {
5992       // Return cost for branches around scalarized and predicated blocks.
5993       Type *Vec_i1Ty =
5994           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
5995       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
5996               (TTI.getCFInstrCost(Instruction::Br) * VF));
5997     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
5998       // The back-edge branch will remain, as will all scalar branches.
5999       return TTI.getCFInstrCost(Instruction::Br);
6000     else
6001       // This branch will be eliminated by if-conversion.
6002       return 0;
6003     // Note: We currently assume zero cost for an unconditional branch inside
6004     // a predicated block since it will become a fall-through, although we
6005     // may decide in the future to call TTI for all branches.
6006   }
6007   case Instruction::PHI: {
6008     auto *Phi = cast<PHINode>(I);
6009 
6010     // First-order recurrences are replaced by vector shuffles inside the loop.
6011     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6012     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6013       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6014                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6015 
6016     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6017     // converted into select instructions. We require N - 1 selects per phi
6018     // node, where N is the number of incoming values.
6019     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6020       return (Phi->getNumIncomingValues() - 1) *
6021              TTI.getCmpSelInstrCost(
6022                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6023                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6024 
6025     return TTI.getCFInstrCost(Instruction::PHI);
6026   }
6027   case Instruction::UDiv:
6028   case Instruction::SDiv:
6029   case Instruction::URem:
6030   case Instruction::SRem:
6031     // If we have a predicated instruction, it may not be executed for each
6032     // vector lane. Get the scalarization cost and scale this amount by the
6033     // probability of executing the predicated block. If the instruction is not
6034     // predicated, we fall through to the next case.
6035     if (VF > 1 && isScalarWithPredication(I)) {
6036       unsigned Cost = 0;
6037 
6038       // These instructions have a non-void type, so account for the phi nodes
6039       // that we will create. This cost is likely to be zero. The phi node
6040       // cost, if any, should be scaled by the block probability because it
6041       // models a copy at the end of each predicated block.
6042       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6043 
6044       // The cost of the non-predicated instruction.
6045       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6046 
6047       // The cost of insertelement and extractelement instructions needed for
6048       // scalarization.
6049       Cost += getScalarizationOverhead(I, VF);
6050 
6051       // Scale the cost by the probability of executing the predicated blocks.
6052       // This assumes the predicated block for each vector lane is equally
6053       // likely.
6054       return Cost / getReciprocalPredBlockProb();
6055     }
6056     LLVM_FALLTHROUGH;
6057   case Instruction::Add:
6058   case Instruction::FAdd:
6059   case Instruction::Sub:
6060   case Instruction::FSub:
6061   case Instruction::Mul:
6062   case Instruction::FMul:
6063   case Instruction::FDiv:
6064   case Instruction::FRem:
6065   case Instruction::Shl:
6066   case Instruction::LShr:
6067   case Instruction::AShr:
6068   case Instruction::And:
6069   case Instruction::Or:
6070   case Instruction::Xor: {
6071     // Since we will replace the stride by 1, the multiplication should go away.
6072     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6073       return 0;
6074     // Certain instructions can be cheaper to vectorize if they have a constant
6075     // second vector operand. One example of this is shifts on x86.
6076     Value *Op2 = I->getOperand(1);
6077     TargetTransformInfo::OperandValueProperties Op2VP;
6078     TargetTransformInfo::OperandValueKind Op2VK =
6079         TTI.getOperandInfo(Op2, Op2VP);
6080     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6081       Op2VK = TargetTransformInfo::OK_UniformValue;
6082 
6083     SmallVector<const Value *, 4> Operands(I->operand_values());
6084     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6085     return N * TTI.getArithmeticInstrCost(
6086                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6087                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
6088   }
6089   case Instruction::FNeg: {
6090     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6091     return N * TTI.getArithmeticInstrCost(
6092                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6093                    TargetTransformInfo::OK_AnyValue,
6094                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6095                    I->getOperand(0));
6096   }
6097   case Instruction::Select: {
6098     SelectInst *SI = cast<SelectInst>(I);
6099     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6100     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6101     Type *CondTy = SI->getCondition()->getType();
6102     if (!ScalarCond)
6103       CondTy = VectorType::get(CondTy, VF);
6104 
6105     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6106   }
6107   case Instruction::ICmp:
6108   case Instruction::FCmp: {
6109     Type *ValTy = I->getOperand(0)->getType();
6110     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6111     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6112       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6113     VectorTy = ToVectorTy(ValTy, VF);
6114     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6115   }
6116   case Instruction::Store:
6117   case Instruction::Load: {
6118     unsigned Width = VF;
6119     if (Width > 1) {
6120       InstWidening Decision = getWideningDecision(I, Width);
6121       assert(Decision != CM_Unknown &&
6122              "CM decision should be taken at this point");
6123       if (Decision == CM_Scalarize)
6124         Width = 1;
6125     }
6126     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6127     return getMemoryInstructionCost(I, VF);
6128   }
6129   case Instruction::ZExt:
6130   case Instruction::SExt:
6131   case Instruction::FPToUI:
6132   case Instruction::FPToSI:
6133   case Instruction::FPExt:
6134   case Instruction::PtrToInt:
6135   case Instruction::IntToPtr:
6136   case Instruction::SIToFP:
6137   case Instruction::UIToFP:
6138   case Instruction::Trunc:
6139   case Instruction::FPTrunc:
6140   case Instruction::BitCast: {
6141     // We optimize the truncation of induction variables having constant
6142     // integer steps. The cost of these truncations is the same as the scalar
6143     // operation.
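    // For example (illustrative), "trunc i64 %iv to i32" where %iv is an
    // induction with a constant step is costed as a single scalar trunc,
    // since the truncated induction can be generated directly in the
    // narrower type.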
6144     if (isOptimizableIVTruncate(I, VF)) {
6145       auto *Trunc = cast<TruncInst>(I);
6146       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6147                                   Trunc->getSrcTy(), Trunc);
6148     }
6149 
6150     Type *SrcScalarTy = I->getOperand(0)->getType();
6151     Type *SrcVecTy =
6152         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6153     if (canTruncateToMinimalBitwidth(I, VF)) {
6154       // This cast is going to be shrunk. This may remove the cast or it might
6155       // turn it into slightly different cast. For example, if MinBW == 16,
6156       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6157       //
6158       // Calculate the modified src and dest types.
6159       Type *MinVecTy = VectorTy;
6160       if (I->getOpcode() == Instruction::Trunc) {
6161         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6162         VectorTy =
6163             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6164       } else if (I->getOpcode() == Instruction::ZExt ||
6165                  I->getOpcode() == Instruction::SExt) {
6166         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6167         VectorTy =
6168             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6169       }
6170     }
6171 
6172     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6173     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6174   }
6175   case Instruction::Call: {
6176     bool NeedToScalarize;
6177     CallInst *CI = cast<CallInst>(I);
6178     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6179     if (getVectorIntrinsicIDForCall(CI, TLI))
6180       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6181     return CallCost;
6182   }
6183   default:
6184     // The cost of executing VF copies of the scalar instruction. This opcode
6185     // is unknown. Assume that it is the same as 'mul'.
6186     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6187            getScalarizationOverhead(I, VF);
6188   } // end of switch.
6189 }
6190 
6191 char LoopVectorize::ID = 0;
6192 
6193 static const char lv_name[] = "Loop Vectorization";
6194 
6195 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6196 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6197 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6198 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6199 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6200 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6201 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6202 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6203 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6204 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6205 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6206 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6207 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6208 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6209 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6210 
6211 namespace llvm {
6212 
6213 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6214 
6215 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6216                               bool VectorizeOnlyWhenForced) {
6217   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6218 }
6219 
6220 } // end namespace llvm
6221 
6222 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6223   // Check if the pointer operand of a load or store instruction is
6224   // consecutive.
6225   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6226     return Legal->isConsecutivePtr(Ptr);
6227   return false;
6228 }
6229 
6230 void LoopVectorizationCostModel::collectValuesToIgnore() {
6231   // Ignore ephemeral values.
6232   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6233 
6234   // Ignore type-promoting instructions we identified during reduction
6235   // detection.
6236   for (auto &Reduction : *Legal->getReductionVars()) {
6237     RecurrenceDescriptor &RedDes = Reduction.second;
6238     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6239     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6240   }
6241   // Ignore type-casting instructions we identified during induction
6242   // detection.
6243   for (auto &Induction : *Legal->getInductionVars()) {
6244     InductionDescriptor &IndDes = Induction.second;
6245     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6246     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6247   }
6248 }
6249 
// TODO: We could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do this yet because VPlan does not
// have a cost model that can choose which plan to execute when more
// than one is generated.
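// For the VPlan-native path, pick the VF as the number of widest-type
// elements that fit in the widest vector register; e.g. (illustrative),
// 256-bit registers and a widest scalar type of 32 bits yield VF = 8.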
6255 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6256                                  LoopVectorizationCostModel &CM) {
6257   unsigned WidestType;
6258   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6259   return WidestVectorRegBits / WidestType;
6260 }
6261 
6262 VectorizationFactor
6263 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6264   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
6269   if (!OrigLoop->empty()) {
6270     // If the user doesn't provide a vectorization factor, determine a
6271     // reasonable one.
6272     if (!UserVF) {
6273       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6274       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6275 
6276       // Make sure we have a VF > 1 for stress testing.
6277       if (VPlanBuildStressTest && VF < 2) {
6278         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6279                           << "overriding computed VF.\n");
6280         VF = 4;
6281       }
6282     }
6283     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6284     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6285     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6286                       << " to build VPlans.\n");
6287     buildVPlans(VF, VF);
6288 
6289     // For VPlan build stress testing, we bail out after VPlan construction.
6290     if (VPlanBuildStressTest)
6291       return VectorizationFactor::Disabled();
6292 
6293     return {VF, 0};
6294   }
6295 
6296   LLVM_DEBUG(
6297       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6298                 "VPlan-native path.\n");
6299   return VectorizationFactor::Disabled();
6300 }
6301 
6302 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6303   assert(OrigLoop->empty() && "Inner loop expected.");
6304   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6306     return None;
6307 
6308   // Invalidate interleave groups if all blocks of loop will be predicated.
6309   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6310       !useMaskedInterleavedAccesses(*TTI)) {
6311     LLVM_DEBUG(
6312         dbgs()
6313         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6314            "which requires masked-interleaved support.\n");
6315     CM.InterleaveInfo.reset();
6316   }
6317 
6318   if (UserVF) {
6319     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6320     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6321     // Collect the instructions (and their associated costs) that will be more
6322     // profitable to scalarize.
6323     CM.selectUserVectorizationFactor(UserVF);
6324     buildVPlansWithVPRecipes(UserVF, UserVF);
6325     LLVM_DEBUG(printPlans(dbgs()));
6326     return {{UserVF, 0}};
6327   }
6328 
6329   unsigned MaxVF = MaybeMaxVF.getValue();
6330   assert(MaxVF != 0 && "MaxVF is zero.");
6331 
6332   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6333     // Collect Uniform and Scalar instructions after vectorization with VF.
6334     CM.collectUniformsAndScalars(VF);
6335 
6336     // Collect the instructions (and their associated costs) that will be more
6337     // profitable to scalarize.
6338     if (VF > 1)
6339       CM.collectInstsToScalarize(VF);
6340   }
6341 
6342   buildVPlansWithVPRecipes(1, MaxVF);
6343   LLVM_DEBUG(printPlans(dbgs()));
6344   if (MaxVF == 1)
6345     return VectorizationFactor::Disabled();
6346 
6347   // Select the optimal vectorization factor.
6348   return CM.selectVectorizationFactor(MaxVF);
6349 }
6350 
6351 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6352   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6353                     << '\n');
6354   BestVF = VF;
6355   BestUF = UF;
6356 
6357   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6358     return !Plan->hasVF(VF);
6359   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6361 }
6362 
6363 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6364                                            DominatorTree *DT) {
6365   // Perform the actual loop transformation.
6366 
6367   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6368   VPCallbackILV CallbackILV(ILV);
6369 
6370   VPTransformState State{BestVF, BestUF,      LI,
6371                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6372                          &ILV,   CallbackILV};
6373   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6374   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6375 
6376   //===------------------------------------------------===//
6377   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost model.
6381   //
6382   //===------------------------------------------------===//
6383 
6384   // 2. Copy and widen instructions from the old loop into the new loop.
6385   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6386   VPlans.front()->execute(&State);
6387 
6388   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6389   //    predication, updating analyses.
6390   ILV.fixVectorizedLoop();
6391 }
6392 
6393 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6394     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6395   BasicBlock *Latch = OrigLoop->getLoopLatch();
6396 
6397   // We create new control-flow for the vectorized loop, so the original
6398   // condition will be dead after vectorization if it's only used by the
6399   // branch.
6400   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6401   if (Cmp && Cmp->hasOneUse())
6402     DeadInstructions.insert(Cmp);
6403 
6404   // We create new "steps" for induction variable updates to which the original
6405   // induction variables map. An original update instruction will be dead if
6406   // all its users except the induction variable are dead.
6407   for (auto &Induction : *Legal->getInductionVars()) {
6408     PHINode *Ind = Induction.first;
6409     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6410     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6411           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6412                                  DeadInstructions.end();
6413         }))
6414       DeadInstructions.insert(IndUpdate);
6415 
    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
6418     // vectorized loop because we have proven that, under a proper runtime
6419     // test guarding the vectorized loop, the value of the phi, and the casted
6420     // value of the phi, are the same. The last instruction in this casting chain
6421     // will get its scalar/vector/widened def from the scalar/vector/widened def
6422     // of the respective phi node. Any other casts in the induction def-use chain
6423     // have no other uses outside the phi update chain, and will be ignored.
6424     InductionDescriptor &IndDes = Induction.second;
6425     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6426     DeadInstructions.insert(Casts.begin(), Casts.end());
6427   }
6428 }
6429 
6430 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6431 
6432 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6433 
6434 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6435                                         Instruction::BinaryOps BinOp) {
6436   // When unrolling and the VF is 1, we only need to add a simple scalar.
6437   Type *Ty = Val->getType();
6438   assert(!Ty->isVectorTy() && "Val must be a scalar");
6439 
6440   if (Ty->isFloatingPointTy()) {
6441     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6442 
6443     // Floating point operations had to be 'fast' to enable the unrolling.
6444     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6445     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6446   }
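  // Integer induction: e.g. (illustrative), with StartIdx == 2 and step %step
  // this emits roughly:
  //   %0 = mul i64 2, %step
  //   %induction = add i64 %val, %0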
6447   Constant *C = ConstantInt::get(Ty, StartIdx);
6448   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6449 }
6450 
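// Mark the loop so that the runtime unroller leaves it alone, unless
// unroll-disable metadata is already present. The resulting loop ID looks
// roughly like (illustrative):
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}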
6451 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6452   SmallVector<Metadata *, 4> MDs;
6453   // Reserve first location for self reference to the LoopID metadata node.
6454   MDs.push_back(nullptr);
6455   bool IsUnrollMetadata = false;
6456   MDNode *LoopID = L->getLoopID();
6457   if (LoopID) {
6458     // First find existing loop unrolling disable metadata.
6459     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6460       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6461       if (MD) {
6462         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6463         IsUnrollMetadata =
6464             S && S->getString().startswith("llvm.loop.unroll.disable");
6465       }
6466       MDs.push_back(LoopID->getOperand(i));
6467     }
6468   }
6469 
6470   if (!IsUnrollMetadata) {
6471     // Add runtime unroll disable metadata.
6472     LLVMContext &Context = L->getHeader()->getContext();
6473     SmallVector<Metadata *, 1> DisableOperands;
6474     DisableOperands.push_back(
6475         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6476     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6477     MDs.push_back(DisableNode);
6478     MDNode *NewLoopID = MDNode::get(Context, MDs);
6479     // Set operand 0 to refer to the loop id itself.
6480     NewLoopID->replaceOperandWith(0, NewLoopID);
6481     L->setLoopID(NewLoopID);
6482   }
6483 }
6484 
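// Test the predicate on the VF's in Range and clamp Range.End so that the
// predicate has the same value throughout the resulting range; the value at
// Range.Start is returned. For example (illustrative), for Range = [2, 16)
// with the predicate true at VF 2 and 4 but false at VF 8, the range is
// clamped to [2, 8) and true is returned.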
6485 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6486     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6487   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6488   bool PredicateAtRangeStart = Predicate(Range.Start);
6489 
6490   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6491     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6492       Range.End = TmpVF;
6493       break;
6494     }
6495 
6496   return PredicateAtRangeStart;
6497 }
6498 
6499 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6500 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6501 /// of VF's starting at a given VF and extending it as much as possible. Each
6502 /// vectorization decision can potentially shorten this sub-range during
6503 /// buildVPlan().
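/// For example (illustrative), with MinVF = 1 and MaxVF = 8 this may build one
/// VPlan covering the sub-range [1, 4) and another covering [4, 9), if some
/// vectorization decision changes between VF = 2 and VF = 4.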
6504 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6505   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6506     VFRange SubRange = {VF, MaxVF + 1};
6507     VPlans.push_back(buildVPlan(SubRange));
6508     VF = SubRange.End;
6509   }
6510 }
6511 
6512 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6513                                          VPlanPtr &Plan) {
6514   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6515 
6516   // Look for cached value.
6517   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6518   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6519   if (ECEntryIt != EdgeMaskCache.end())
6520     return ECEntryIt->second;
6521 
6522   VPValue *SrcMask = createBlockInMask(Src, Plan);
6523 
6524   // The terminator has to be a branch inst!
6525   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6526   assert(BI && "Unexpected terminator found");
6527 
6528   if (!BI->isConditional())
6529     return EdgeMaskCache[Edge] = SrcMask;
6530 
6531   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6532   assert(EdgeMask && "No Edge Mask found for condition");
6533 
6534   if (BI->getSuccessor(0) != Dst)
6535     EdgeMask = Builder.createNot(EdgeMask);
6536 
6537   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6538     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6539 
6540   return EdgeMaskCache[Edge] = EdgeMask;
6541 }
6542 
6543 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6544   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6545 
6546   // Look for cached value.
6547   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6548   if (BCEntryIt != BlockMaskCache.end())
6549     return BCEntryIt->second;
6550 
6551   // All-one mask is modelled as no-mask following the convention for masked
6552   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6553   VPValue *BlockMask = nullptr;
6554 
6555   if (OrigLoop->getHeader() == BB) {
6556     if (!CM.blockNeedsPredication(BB))
6557       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6558 
6559     // Introduce the early-exit compare IV <= BTC to form header block mask.
6560     // This is used instead of IV < TC because TC may wrap, unlike BTC.
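    // For example (illustrative), with a trip count of 10 (BTC = 9) and
    // VF = 4, the lanes <8, 9, 10, 11> of the third vector iteration get the
    // mask <1, 1, 0, 0>.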
6561     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6562     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6563     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6564     return BlockMaskCache[BB] = BlockMask;
6565   }
6566 
6567   // This is the block mask. We OR all incoming edges.
6568   for (auto *Predecessor : predecessors(BB)) {
6569     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6570     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6571       return BlockMaskCache[BB] = EdgeMask;
6572 
6573     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6574       BlockMask = EdgeMask;
6575       continue;
6576     }
6577 
6578     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6579   }
6580 
6581   return BlockMaskCache[BB] = BlockMask;
6582 }
6583 
6584 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6585                                                            VFRange &Range,
6586                                                            VPlanPtr &Plan) {
6587   const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6588   if (!IG)
6589     return nullptr;
6590 
6591   // Now check if IG is relevant for VF's in the given range.
6592   auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6593     return [=](unsigned VF) -> bool {
6594       return (VF >= 2 && // Query is illegal for VF == 1
6595               CM.getWideningDecision(I, VF) ==
6596                   LoopVectorizationCostModel::CM_Interleave);
6597     };
6598   };
6599   if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6600     return nullptr;
6601 
6602   // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
6603   // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
6604   // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
6605   assert(I == IG->getInsertPos() &&
6606          "Generating a recipe for an adjunct member of an interleave group");
6607 
6608   VPValue *Mask = nullptr;
6609   if (Legal->isMaskRequired(I))
6610     Mask = createBlockInMask(I->getParent(), Plan);
6611 
6612   return new VPInterleaveRecipe(IG, Mask);
6613 }
6614 
6615 VPWidenMemoryInstructionRecipe *
6616 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6617                                   VPlanPtr &Plan) {
6618   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6619     return nullptr;
6620 
6621   auto willWiden = [&](unsigned VF) -> bool {
6622     if (VF == 1)
6623       return false;
6624     if (CM.isScalarAfterVectorization(I, VF) ||
6625         CM.isProfitableToScalarize(I, VF))
6626       return false;
6627     LoopVectorizationCostModel::InstWidening Decision =
6628         CM.getWideningDecision(I, VF);
6629     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6630            "CM decision should be taken at this point.");
6631     assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6632            "Interleave memory opportunity should be caught earlier.");
6633     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6634   };
6635 
6636   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6637     return nullptr;
6638 
6639   VPValue *Mask = nullptr;
6640   if (Legal->isMaskRequired(I))
6641     Mask = createBlockInMask(I->getParent(), Plan);
6642 
6643   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6644 }
6645 
6646 VPWidenIntOrFpInductionRecipe *
6647 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6648   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6649     // Check if this is an integer or fp induction. If so, build the recipe that
6650     // produces its scalar and vector values.
6651     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6652     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6653         II.getKind() == InductionDescriptor::IK_FpInduction)
6654       return new VPWidenIntOrFpInductionRecipe(Phi);
6655 
6656     return nullptr;
6657   }
6658 
6659   // Optimize the special case where the source is a constant integer
6660   // induction variable. Notice that we can only optimize the 'trunc' case
6661   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6662   // (c) other casts depend on pointer size.
6663 
6664   // Determine whether \p K is a truncation based on an induction variable that
6665   // can be optimized.
6666   auto isOptimizableIVTruncate =
6667       [&](Instruction *K) -> std::function<bool(unsigned)> {
6668     return
6669         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6670   };
6671 
6672   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6673                                isOptimizableIVTruncate(I), Range))
6674     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6675                                              cast<TruncInst>(I));
6676   return nullptr;
6677 }
6678 
6679 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6680   PHINode *Phi = dyn_cast<PHINode>(I);
6681   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6682     return nullptr;
6683 
6684   // We know that all PHIs in non-header blocks are converted into selects, so
6685   // we don't have to worry about the insertion order and we can just use the
6686   // builder. At this point we generate the predication tree. There may be
6687   // duplications since this is a simple recursive scan, but future
6688   // optimizations will clean it up.
6689 
6690   SmallVector<VPValue *, 2> Masks;
6691   unsigned NumIncoming = Phi->getNumIncomingValues();
6692   for (unsigned In = 0; In < NumIncoming; In++) {
6693     VPValue *EdgeMask =
6694       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6695     assert((EdgeMask || NumIncoming == 1) &&
6696            "Multiple predecessors with one having a full mask");
6697     if (EdgeMask)
6698       Masks.push_back(EdgeMask);
6699   }
6700   return new VPBlendRecipe(Phi, Masks);
6701 }
6702 
6703 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6704                                  VFRange &Range) {
6705 
6706   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6707       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6708 
6709   if (IsPredicated)
6710     return false;
6711 
6712   auto IsVectorizableOpcode = [](unsigned Opcode) {
6713     switch (Opcode) {
6714     case Instruction::Add:
6715     case Instruction::And:
6716     case Instruction::AShr:
6717     case Instruction::BitCast:
6718     case Instruction::Br:
6719     case Instruction::Call:
6720     case Instruction::FAdd:
6721     case Instruction::FCmp:
6722     case Instruction::FDiv:
6723     case Instruction::FMul:
6724     case Instruction::FNeg:
6725     case Instruction::FPExt:
6726     case Instruction::FPToSI:
6727     case Instruction::FPToUI:
6728     case Instruction::FPTrunc:
6729     case Instruction::FRem:
6730     case Instruction::FSub:
6731     case Instruction::GetElementPtr:
6732     case Instruction::ICmp:
6733     case Instruction::IntToPtr:
6734     case Instruction::Load:
6735     case Instruction::LShr:
6736     case Instruction::Mul:
6737     case Instruction::Or:
6738     case Instruction::PHI:
6739     case Instruction::PtrToInt:
6740     case Instruction::SDiv:
6741     case Instruction::Select:
6742     case Instruction::SExt:
6743     case Instruction::Shl:
6744     case Instruction::SIToFP:
6745     case Instruction::SRem:
6746     case Instruction::Store:
6747     case Instruction::Sub:
6748     case Instruction::Trunc:
6749     case Instruction::UDiv:
6750     case Instruction::UIToFP:
6751     case Instruction::URem:
6752     case Instruction::Xor:
6753     case Instruction::ZExt:
6754       return true;
6755     }
6756     return false;
6757   };
6758 
6759   if (!IsVectorizableOpcode(I->getOpcode()))
6760     return false;
6761 
6762   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6763     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6764     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6765                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6766       return false;
6767   }
6768 
6769   auto willWiden = [&](unsigned VF) -> bool {
6770     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6771                              CM.isProfitableToScalarize(I, VF)))
6772       return false;
6773     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6774       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6775       // The following case may be scalarized depending on the VF.
6776       // The flag shows whether we use Intrinsic or a usual Call for vectorized
6777       // version of the instruction.
6778       // Is it beneficial to perform intrinsic call compared to lib call?
6779       bool NeedToScalarize;
6780       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6781       bool UseVectorIntrinsic =
6782           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6783       return UseVectorIntrinsic || !NeedToScalarize;
6784     }
6785     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6786       assert(CM.getWideningDecision(I, VF) ==
6787                  LoopVectorizationCostModel::CM_Scalarize &&
6788              "Memory widening decisions should have been taken care by now");
6789       return false;
6790     }
6791     return true;
6792   };
6793 
6794   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6795     return false;
6796 
6797   // Success: widen this instruction. We optimize the common case where
6798   // consecutive instructions can be represented by a single recipe.
6799   if (!VPBB->empty()) {
6800     VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6801     if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6802       return true;
6803   }
6804 
6805   VPBB->appendRecipe(new VPWidenRecipe(I));
6806   return true;
6807 }
6808 
6809 VPBasicBlock *VPRecipeBuilder::handleReplication(
6810     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6811     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6812     VPlanPtr &Plan) {
6813   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6814       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6815       Range);
6816 
6817   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6818       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6819 
6820   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6821 
6822   // Find if I uses a predicated instruction. If so, it will use its scalar
6823   // value. Avoid hoisting the insert-element which packs the scalar value into
6824   // a vector value, as that happens iff all users use the vector value.
6825   for (auto &Op : I->operands())
6826     if (auto *PredInst = dyn_cast<Instruction>(Op))
6827       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6828         PredInst2Recipe[PredInst]->setAlsoPack(false);
6829 
6830   // Finalize the recipe for Instr, first if it is not predicated.
6831   if (!IsPredicated) {
6832     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6833     VPBB->appendRecipe(Recipe);
6834     return VPBB;
6835   }
6836   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6837   assert(VPBB->getSuccessors().empty() &&
6838          "VPBB has successors when handling predicated replication.");
6839   // Record predicated instructions for above packing optimizations.
6840   PredInst2Recipe[I] = Recipe;
6841   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6842   VPBlockUtils::insertBlockAfter(Region, VPBB);
6843   auto *RegSucc = new VPBasicBlock();
6844   VPBlockUtils::insertBlockAfter(RegSucc, Region);
6845   return RegSucc;
6846 }
6847 
6848 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6849                                                       VPRecipeBase *PredRecipe,
6850                                                       VPlanPtr &Plan) {
6851   // Instructions marked for predication are replicated and placed under an
6852   // if-then construct to prevent side-effects.
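  //
  // Illustratively, the region built below has the triangular shape
  //
  //   pred.<opcode>.entry
  //     |            \
  //     |      pred.<opcode>.if
  //     |            /
  //   pred.<opcode>.continue
  //
  // where the entry block branches on the block mask, and the continue block
  // holds a phi for the predicated value when the instruction is non-void.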
6853 
6854   // Generate recipes to compute the block mask for this region.
6855   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6856 
6857   // Build the triangular if-then region.
6858   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6859   assert(Instr->getParent() && "Predicated instruction not in any basic block");
6860   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6861   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6862   auto *PHIRecipe =
6863       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6864   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6865   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6866   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6867 
6868   // Note: first set Entry as region entry and then connect successors starting
6869   // from it in order, to propagate the "parent" of each VPBasicBlock.
6870   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6871   VPBlockUtils::connectBlocks(Pred, Exit);
6872 
6873   return Region;
6874 }
6875 
6876 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6877                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
6878   VPRecipeBase *Recipe = nullptr;
6879   // Check if Instr should belong to an interleave memory recipe, or already
6880   // does. In the latter case Instr is irrelevant.
6881   if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6882     VPBB->appendRecipe(Recipe);
6883     return true;
6884   }
6885 
6886   // Check if Instr is a memory operation that should be widened.
6887   if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6888     VPBB->appendRecipe(Recipe);
6889     return true;
6890   }
6891 
6892   // Check if Instr should form some PHI recipe.
6893   if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6894     VPBB->appendRecipe(Recipe);
6895     return true;
6896   }
6897   if ((Recipe = tryToBlend(Instr, Plan))) {
6898     VPBB->appendRecipe(Recipe);
6899     return true;
6900   }
6901   if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6902     VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6903     return true;
6904   }
6905 
6906   // Check if Instr is to be widened by a general VPWidenRecipe, after
6907   // having first checked for specific widening recipes that deal with
6908   // Interleave Groups, Inductions and Phi nodes.
6909   if (tryToWiden(Instr, VPBB, Range))
6910     return true;
6911 
6912   return false;
6913 }
6914 
6915 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6916                                                         unsigned MaxVF) {
6917   assert(OrigLoop->empty() && "Inner loop expected.");
6918 
6919   // Collect conditions feeding internal conditional branches; they need to be
6920   // represented in VPlan for it to model masking.
6921   SmallPtrSet<Value *, 1> NeedDef;
6922 
6923   auto *Latch = OrigLoop->getLoopLatch();
6924   for (BasicBlock *BB : OrigLoop->blocks()) {
6925     if (BB == Latch)
6926       continue;
6927     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6928     if (Branch && Branch->isConditional())
6929       NeedDef.insert(Branch->getCondition());
6930   }
6931 
6932   // If the tail is to be folded by masking, the primary induction variable
6933   // needs to be represented in VPlan for it to model early-exit masking.
6934   if (CM.foldTailByMasking())
6935     NeedDef.insert(Legal->getPrimaryInduction());
6936 
6937   // Collect instructions from the original loop that will become trivially dead
6938   // in the vectorized loop. We don't need to vectorize these instructions. For
6939   // example, original induction update instructions can become dead because we
6940   // separately emit induction "steps" when generating code for the new loop.
6941   // Similarly, we create a new latch condition when setting up the structure
6942   // of the new loop, so the old one can become dead.
6943   SmallPtrSet<Instruction *, 4> DeadInstructions;
6944   collectTriviallyDeadInstructions(DeadInstructions);
6945 
6946   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6947     VFRange SubRange = {VF, MaxVF + 1};
6948     VPlans.push_back(
6949         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6950     VF = SubRange.End;
6951   }
6952 }
6953 
6954 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
6955     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
6956     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6957   // Hold a mapping from predicated instructions to their recipes, in order to
6958   // fix their AlsoPack behavior if a user is determined to replicate and use a
6959   // scalar instead of vector value.
6960   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
6961 
6962   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
6963   DenseMap<Instruction *, Instruction *> SinkAfterInverse;
6964 
6965   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
6966   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
6967   auto Plan = llvm::make_unique<VPlan>(VPBB);
6968 
6969   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
6970   // Represent values that will have defs inside VPlan.
6971   for (Value *V : NeedDef)
6972     Plan->addVPValue(V);
6973 
6974   // Scan the body of the loop in a topological order to visit each basic block
6975   // after having visited its predecessor basic blocks.
6976   LoopBlocksDFS DFS(OrigLoop);
6977   DFS.perform(LI);
6978 
6979   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6980     // Relevant instructions from basic block BB will be grouped into VPRecipe
6981     // ingredients and fill a new VPBasicBlock.
6982     unsigned VPBBsForBB = 0;
6983     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
6984     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
6985     VPBB = FirstVPBBForBB;
6986     Builder.setInsertPoint(VPBB);
6987 
6988     std::vector<Instruction *> Ingredients;
6989 
6990     // Organize the ingredients to vectorize from current basic block in the
6991     // right order.
6992     for (Instruction &I : BB->instructionsWithoutDebug()) {
6993       Instruction *Instr = &I;
6994 
6995       // First filter out irrelevant instructions, to ensure no recipes are
6996       // built for them.
6997       if (isa<BranchInst>(Instr) ||
6998           DeadInstructions.find(Instr) != DeadInstructions.end())
6999         continue;
7000 
7001       // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
7002       // member of the IG, do not construct any Recipe for it.
7003       const InterleaveGroup<Instruction> *IG =
7004           CM.getInterleavedAccessGroup(Instr);
7005       if (IG && Instr != IG->getInsertPos() &&
7006           Range.Start >= 2 && // Query is illegal for VF == 1
7007           CM.getWideningDecision(Instr, Range.Start) ==
7008               LoopVectorizationCostModel::CM_Interleave) {
7009         auto SinkCandidate = SinkAfterInverse.find(Instr);
7010         if (SinkCandidate != SinkAfterInverse.end())
7011           Ingredients.push_back(SinkCandidate->second);
7012         continue;
7013       }
7014 
7015       // Move instructions to handle first-order recurrences, step 1: avoid
7016       // handling this instruction until after we've handled the instruction it
7017       // should follow.
7018       auto SAIt = SinkAfter.find(Instr);
7019       if (SAIt != SinkAfter.end()) {
7020         LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
7021                           << *SAIt->second
7022                           << " to vectorize a 1st order recurrence.\n");
7023         SinkAfterInverse[SAIt->second] = Instr;
7024         continue;
7025       }
7026 
7027       Ingredients.push_back(Instr);
7028 
7029       // Move instructions to handle first-order recurrences, step 2: push the
7030       // instruction to be sunk at its insertion point.
7031       auto SAInvIt = SinkAfterInverse.find(Instr);
7032       if (SAInvIt != SinkAfterInverse.end())
7033         Ingredients.push_back(SAInvIt->second);
7034     }
7035 
7036     // Introduce each ingredient into VPlan.
7037     for (Instruction *Instr : Ingredients) {
7038       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7039         continue;
7040 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7043       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7044           Instr, Range, VPBB, PredInst2Recipe, Plan);
7045       if (NextVPBB != VPBB) {
7046         VPBB = NextVPBB;
7047         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7048                                     : "");
7049       }
7050     }
7051   }
7052 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7056   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7057   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7058   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7059   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7060   delete PreEntry;
7061 
7062   std::string PlanName;
7063   raw_string_ostream RSO(PlanName);
7064   unsigned VF = Range.Start;
7065   Plan->addVF(VF);
7066   RSO << "Initial VPlan for VF={" << VF;
7067   for (VF *= 2; VF < Range.End; VF *= 2) {
7068     Plan->addVF(VF);
7069     RSO << "," << VF;
7070   }
7071   RSO << "},UF>=1";
7072   RSO.flush();
7073   Plan->setName(PlanName);
7074 
7075   return Plan;
7076 }
7077 
7078 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
7083   assert(!OrigLoop->empty());
7084   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7085 
7086   // Create new empty VPlan
7087   auto Plan = llvm::make_unique<VPlan>();
7088 
7089   // Build hierarchical CFG
7090   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7091   HCFGBuilder.buildHierarchicalCFG();
7092 
7093   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7094     Plan->addVF(VF);
7095 
7096   if (EnableVPlanPredication) {
7097     VPlanPredicator VPP(*Plan);
7098     VPP.predicate();
7099 
7100     // Avoid running transformation to recipes until masked code generation in
7101     // VPlan-native path is in place.
7102     return Plan;
7103   }
7104 
7105   SmallPtrSet<Instruction *, 1> DeadInstructions;
7106   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7107       Plan, Legal->getInductionVars(), DeadInstructions);
7108 
7109   return Plan;
7110 }
7111 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7116 
7117 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7118   O << " +\n"
7119     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7120   IG->getInsertPos()->printAsOperand(O, false);
7121   if (User) {
7122     O << ", ";
7123     User->getOperand(0)->printAsOperand(O);
7124   }
7125   O << "\\l\"";
7126   for (unsigned i = 0; i < IG->getFactor(); ++i)
7127     if (Instruction *I = IG->getMember(i))
7128       O << " +\n"
7129         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7130 }
7131 
7132 void VPWidenRecipe::execute(VPTransformState &State) {
7133   for (auto &Instr : make_range(Begin, End))
7134     State.ILV->widenInstruction(Instr);
7135 }
7136 
7137 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7138   assert(!State.Instance && "Int or FP induction being replicated.");
7139   State.ILV->widenIntOrFpInduction(IV, Trunc);
7140 }
7141 
7142 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7143   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7144 }
7145 
7146 void VPBlendRecipe::execute(VPTransformState &State) {
7147   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7148   // We know that all PHIs in non-header blocks are converted into
7149   // selects, so we don't have to worry about the insertion order and we
7150   // can just use the builder.
7151   // At this point we generate the predication tree. There may be
7152   // duplications since this is a simple recursive scan, but future
7153   // optimizations will clean it up.
7154 
7155   unsigned NumIncoming = Phi->getNumIncomingValues();
7156 
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7159   // Generate a sequence of selects of the form:
7160   // SELECT(Mask3, In3,
7161   //      SELECT(Mask2, In2,
7162   //                   ( ...)))
7163   InnerLoopVectorizer::VectorParts Entry(State.UF);
7164   for (unsigned In = 0; In < NumIncoming; ++In) {
7165     for (unsigned Part = 0; Part < State.UF; ++Part) {
7166       // We might have single edge PHIs (blocks) - use an identity
7167       // 'select' for the first PHI operand.
7168       Value *In0 =
7169           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7170       if (In == 0)
7171         Entry[Part] = In0; // Initialize with the first incoming value.
7172       else {
7173         // Select between the current value and the previous incoming edge
7174         // based on the incoming mask.
7175         Value *Cond = State.get(User->getOperand(In), Part);
7176         Entry[Part] =
7177             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7178       }
7179     }
7180   }
7181   for (unsigned Part = 0; Part < State.UF; ++Part)
7182     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7183 }
7184 
7185 void VPInterleaveRecipe::execute(VPTransformState &State) {
7186   assert(!State.Instance && "Interleave group being replicated.");
7187   if (!User)
7188     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7189 
7190   // Last (and currently only) operand is a mask.
7191   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7192   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7193   for (unsigned Part = 0; Part < State.UF; ++Part)
7194     MaskValues[Part] = State.get(Mask, Part);
7195   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7196 }
7197 
7198 void VPReplicateRecipe::execute(VPTransformState &State) {
7199   if (State.Instance) { // Generate a single instance.
7200     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7201     // Insert scalar instance packing it into a vector.
7202     if (AlsoPack && State.VF > 1) {
7203       // If we're constructing lane 0, initialize to start from undef.
7204       if (State.Instance->Lane == 0) {
7205         Value *Undef =
7206             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7207         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7208       }
7209       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7210     }
7211     return;
7212   }
7213 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
7217   unsigned EndLane = IsUniform ? 1 : State.VF;
7218   for (unsigned Part = 0; Part < State.UF; ++Part)
7219     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7220       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7221 }
7222 
7223 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7224   assert(State.Instance && "Branch on Mask works only on single instance.");
7225 
7226   unsigned Part = State.Instance->Part;
7227   unsigned Lane = State.Instance->Lane;
7228 
7229   Value *ConditionBit = nullptr;
7230   if (!User) // Block in mask is all-one.
7231     ConditionBit = State.Builder.getTrue();
7232   else {
7233     VPValue *BlockInMask = User->getOperand(0);
7234     ConditionBit = State.get(BlockInMask, Part);
7235     if (ConditionBit->getType()->isVectorTy())
7236       ConditionBit = State.Builder.CreateExtractElement(
7237           ConditionBit, State.Builder.getInt32(Lane));
7238   }
7239 
7240   // Replace the temporary unreachable terminator with a new conditional branch,
7241   // whose two destinations will be set later when they are created.
7242   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7243   assert(isa<UnreachableInst>(CurrentTerminator) &&
7244          "Expected to replace unreachable terminator with conditional branch.");
7245   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7246   CondBr->setSuccessor(0, nullptr);
7247   ReplaceInstWithInst(CurrentTerminator, CondBr);
7248 }
7249 
7250 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7251   assert(State.Instance && "Predicated instruction PHI works per instance.");
7252   Instruction *ScalarPredInst = cast<Instruction>(
7253       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7254   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7255   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7256   assert(PredicatingBB && "Predicated block has no single predecessor.");
7257 
7258   // By current pack/unpack logic we need to generate only a single phi node: if
7259   // a vector value for the predicated instruction exists at this point it means
7260   // the instruction has vector users only, and a phi for the vector value is
7261   // needed. In this case the recipe of the predicated instruction is marked to
7262   // also do that packing, thereby "hoisting" the insert-element sequence.
7263   // Otherwise, a phi node for the scalar value is needed.
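  // For example (illustrative), if the predicated block produced
  //   %v.new = insertelement <4 x i32> %v.old, i32 %s, i32 1
  // the continue block gets
  //   %v.phi = phi <4 x i32> [ %v.old, %predicating ], [ %v.new, %predicated ]
  // and the cached vector value is updated to %v.phi.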
7264   unsigned Part = State.Instance->Part;
7265   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7266     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7267     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7268     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7269     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7270     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7271     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7272   } else {
7273     Type *PredInstType = PredInst->getType();
7274     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7275     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7276     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7277     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7278   }
7279 }
7280 
7281 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7282   if (!User)
7283     return State.ILV->vectorizeMemoryInstruction(&Instr);
7284 
7285   // Last (and currently only) operand is a mask.
7286   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7287   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7288   for (unsigned Part = 0; Part < State.UF; ++Part)
7289     MaskValues[Part] = State.get(Mask, Part);
7290   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7291 }
7292 
7293 static ScalarEpilogueLowering
7294 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7295                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
7296   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7297   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7298       (F->hasOptSize() ||
7299        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
7300     SEL = CM_ScalarEpilogueNotAllowedOptSize;
7301   else if (Hints.getPredicate())
7302     SEL = CM_ScalarEpilogueNotNeededPredicatePragma;
7303 
7304   return SEL;
7305 }
7306 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
// transformations to be applied from the very beginning without modifying the
// input LLVM IR.
7311 static bool processLoopInVPlanNativePath(
7312     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7313     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7314     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7315     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7316     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7317 
7318   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7319   Function *F = L->getHeader()->getParent();
7320   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7321   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7322 
7323   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7324                                 &Hints, IAI);
7325   // Use the planner for outer loop vectorization.
7326   // TODO: CM is not used at this point inside the planner. Turn CM into an
7327   // optional argument if we don't need it in the future.
7328   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7329 
7330   // Get user vectorization factor.
7331   const unsigned UserVF = Hints.getWidth();
7332 
7333   // Plan how to best vectorize, return the best VF and its cost.
7334   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7335 
7336   // If we are stress testing VPlan builds, do not attempt to generate vector
7337   // code. Masked vector code generation support will follow soon.
7338   // Also, do not attempt to vectorize if no vector code will be produced.
7339   if (VPlanBuildStressTest || EnableVPlanPredication ||
7340       VectorizationFactor::Disabled() == VF)
7341     return false;
7342 
7343   LVP.setBestPlan(VF.Width, 1);
7344 
7345   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7346                          &CM);
7347   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7348                     << L->getHeader()->getParent()->getName() << "\"\n");
7349   LVP.executePlan(LB, DT);
7350 
7351   // Mark the loop as already vectorized to avoid vectorizing again.
7352   Hints.setAlreadyVectorized();
7353 
7354   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7355   return true;
7356 }
7357 
7358 bool LoopVectorizePass::processLoop(Loop *L) {
7359   assert((EnableVPlanNativePath || L->empty()) &&
7360          "VPlan-native path is not enabled. Only process inner loops.");
7361 
7362 #ifndef NDEBUG
7363   const std::string DebugLocStr = getDebugLocString(L);
7364 #endif /* NDEBUG */
7365 
7366   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7367                     << L->getHeader()->getParent()->getName() << "\" from "
7368                     << DebugLocStr << "\n");
7369 
7370   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7371 
7372   LLVM_DEBUG(
7373       dbgs() << "LV: Loop hints:"
7374              << " force="
7375              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7376                      ? "disabled"
7377                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7378                             ? "enabled"
7379                             : "?"))
7380              << " width=" << Hints.getWidth()
7381              << " unroll=" << Hints.getInterleave() << "\n");
7382 
7383   // Function containing loop
7384   Function *F = L->getHeader()->getParent();
7385 
7386   // Looking at the diagnostic output is the only way to determine if a loop
7387   // was vectorized (other than looking at the IR or machine code), so it
7388   // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose: they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
7393 
7394   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7395     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7396     return false;
7397   }
7398 
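  // Wrap ScalarEvolution so that SCEV predicates (which become runtime
  // checks) can be collected while analyzing this loop.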
7399   PredicatedScalarEvolution PSE(*SE, *L);
7400 
7401   // Check if it is legal to vectorize the loop.
7402   LoopVectorizationRequirements Requirements(*ORE);
7403   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7404                                 &Requirements, &Hints, DB, AC);
7405   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7406     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7407     Hints.emitRemarkWithHints();
7408     return false;
7409   }
7410 
7411   // Check the function attributes and profiles to find out if this function
7412   // should be optimized for size.
7413   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7414 
7415   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7416   // here. They may require CFG and instruction level transformations before
7417   // even evaluating whether vectorization is profitable. Since we cannot modify
7418   // the incoming IR, we need to build VPlan upfront in the vectorization
7419   // pipeline.
7420   if (!L->empty())
7421     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7422                                         ORE, BFI, PSI, Hints);
7423 
7424   assert(L->empty() && "Inner loop expected.");
  // Check the loop against the trip count threshold: loops with a tiny trip
  // count are vectorized as if optimizing for size, to minimize overheads.
  // Prefer a constant trip count, then profile data, then SCEV's upper-bound
  // estimate.
7428   unsigned ExpectedTC = 0;
7429   bool HasExpectedTC = false;
7430   if (const SCEVConstant *ConstExits =
7431       dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7432     const APInt &ExitsCount = ConstExits->getAPInt();
7433     // We are interested in small values for ExpectedTC. Skip over those that
    // can't fit in an unsigned.
7435     if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7436       ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7437       HasExpectedTC = true;
7438     }
7439   }
  // If the trip count is not a suitably small constant, fall back to
  // profiling information to estimate it.
7442   if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
7443     auto EstimatedTC = getLoopEstimatedTripCount(L);
7444     if (EstimatedTC) {
7445       ExpectedTC = *EstimatedTC;
7446       HasExpectedTC = true;
7447     }
7448   }
7449   if (!HasExpectedTC) {
7450     ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7451     HasExpectedTC = (ExpectedTC > 0);
7452   }
7453 
7454   if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7455     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7456                       << "This loop is worth vectorizing only if no scalar "
7457                       << "iteration overheads are incurred.");
7458     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7459       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7460     else {
7461       LLVM_DEBUG(dbgs() << "\n");
7462       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7463     }
7464   }
7465 
7466   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem like it can be correct -- what if the loop
7468   // an integer loop and the vector instructions selected are purely integer
7469   // vector instructions?
7470   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7471     reportVectorizationFailure(
7472         "Can't vectorize when the NoImplicitFloat attribute is used",
7473         "loop not vectorized due to NoImplicitFloat attribute",
7474         "NoImplicitFloat", ORE, L);
7475     Hints.emitRemarkWithHints();
7476     return false;
7477   }
7478 
7479   // Check if the target supports potentially unsafe FP vectorization.
7480   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7481   // for the target we're vectorizing for, to make sure none of the
7482   // additional fp-math flags can help.
7483   if (Hints.isPotentiallyUnsafe() &&
7484       TTI->isFPVectorizationPotentiallyUnsafe()) {
7485     reportVectorizationFailure(
7486         "Potentially unsafe FP op prevents vectorization",
7487         "loop not vectorized due to unsafe FP support.",
7488         "UnsafeFP", ORE, L);
7489     Hints.emitRemarkWithHints();
7490     return false;
7491   }
7492 
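  // Determine whether to analyze interleaved memory accesses, starting from
  // the target's preference, and collect interleave-group information.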
7493   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7494   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7495 
7496   // If an override option has been passed in for interleaved accesses, use it.
7497   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7498     UseInterleaved = EnableInterleavedMemAccesses;
7499 
7500   // Analyze interleaved memory accesses.
7501   if (UseInterleaved) {
7502     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7503   }
7504 
7505   // Use the cost model.
7506   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7507                                 F, &Hints, IAI);
7508   CM.collectValuesToIgnore();
7509 
7510   // Use the planner for vectorization.
7511   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7512 
7513   // Get user vectorization factor.
7514   unsigned UserVF = Hints.getWidth();
7515 
7516   // Plan how to best vectorize, return the best VF and its cost.
7517   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7518 
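  // Start from "no vectorization" and an interleave count of 1; the planner's
  // result and the user's interleave hint may override these below.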
7519   VectorizationFactor VF = VectorizationFactor::Disabled();
7520   unsigned IC = 1;
7521   unsigned UserIC = Hints.getInterleave();
7522 
7523   if (MaybeVF) {
7524     VF = *MaybeVF;
7525     // Select the interleave count.
7526     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7527   }
7528 
7529   // Identify the diagnostic messages that should be produced.
7530   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7531   bool VectorizeLoop = true, InterleaveLoop = true;
7532   if (Requirements.doesNotMeet(F, L, Hints)) {
7533     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7534                          "requirements.\n");
7535     Hints.emitRemarkWithHints();
7536     return false;
7537   }
7538 
7539   if (VF.Width == 1) {
7540     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7541     VecDiagMsg = std::make_pair(
7542         "VectorizationNotBeneficial",
7543         "the cost-model indicates that vectorization is not beneficial");
7544     VectorizeLoop = false;
7545   }
7546 
7547   if (!MaybeVF && UserIC > 1) {
7548     // Tell the user interleaving was avoided up-front, despite being explicitly
7549     // requested.
7550     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7551                          "interleaving should be avoided up front\n");
7552     IntDiagMsg = std::make_pair(
7553         "InterleavingAvoided",
7554         "Ignoring UserIC, because interleaving was avoided up front");
7555     InterleaveLoop = false;
7556   } else if (IC == 1 && UserIC <= 1) {
7557     // Tell the user interleaving is not beneficial.
7558     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7559     IntDiagMsg = std::make_pair(
7560         "InterleavingNotBeneficial",
7561         "the cost-model indicates that interleaving is not beneficial");
7562     InterleaveLoop = false;
7563     if (UserIC == 1) {
7564       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7565       IntDiagMsg.second +=
7566           " and is explicitly disabled or interleave count is set to 1";
7567     }
7568   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
                         "disabled.\n");
7572     IntDiagMsg = std::make_pair(
7573         "InterleavingBeneficialButDisabled",
7574         "the cost-model indicates that interleaving is beneficial "
7575         "but is explicitly disabled or interleave count is set to 1");
7576     InterleaveLoop = false;
7577   }
7578 
7579   // Override IC if user provided an interleave count.
7580   IC = UserIC > 0 ? UserIC : IC;
7581 
7582   // Emit diagnostic messages, if any.
7583   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7584   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7586     ORE->emit([&]() {
7587       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7588                                       L->getStartLoc(), L->getHeader())
7589              << VecDiagMsg.second;
7590     });
7591     ORE->emit([&]() {
7592       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7593                                       L->getStartLoc(), L->getHeader())
7594              << IntDiagMsg.second;
7595     });
7596     return false;
7597   } else if (!VectorizeLoop && InterleaveLoop) {
7598     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7599     ORE->emit([&]() {
7600       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7601                                         L->getStartLoc(), L->getHeader())
7602              << VecDiagMsg.second;
7603     });
7604   } else if (VectorizeLoop && !InterleaveLoop) {
7605     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7606                       << ") in " << DebugLocStr << '\n');
7607     ORE->emit([&]() {
7608       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7609                                         L->getStartLoc(), L->getHeader())
7610              << IntDiagMsg.second;
7611     });
7612   } else if (VectorizeLoop && InterleaveLoop) {
7613     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7614                       << ") in " << DebugLocStr << '\n');
7615     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7616   }
7617 
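  // Commit the planner to the chosen vectorization factor and interleave
  // count before generating code.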
7618   LVP.setBestPlan(VF.Width, IC);
7619 
7620   using namespace ore;
7621   bool DisableRuntimeUnroll = false;
7622   MDNode *OrigLoopID = L->getLoopID();
7623 
7624   if (!VectorizeLoop) {
7625     assert(IC > 1 && "interleave count should not be 1 or 0");
    // We decided not to vectorize this loop (the chosen VF is 1), but
    // interleaving is beneficial or was explicitly requested, so interleave
    // it.
7628     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7629                                &CM);
7630     LVP.executePlan(Unroller, DT);
7631 
7632     ORE->emit([&]() {
7633       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7634                                 L->getHeader())
7635              << "interleaved loop (interleaved count: "
7636              << NV("InterleaveCount", IC) << ")";
7637     });
7638   } else {
    // We decided to vectorize the loop, so do it.
7640     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7641                            &LVL, &CM);
7642     LVP.executePlan(LB, DT);
7643     ++LoopsVectorized;
7644 
7645     // Add metadata to disable runtime unrolling a scalar loop when there are
7646     // no runtime checks about strides and memory. A scalar loop that is
7647     // rarely used is not worth unrolling.
7648     if (!LB.areSafetyChecksAdded())
7649       DisableRuntimeUnroll = true;
7650 
7651     // Report the vectorization decision.
7652     ORE->emit([&]() {
7653       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7654                                 L->getHeader())
7655              << "vectorized loop (vectorization width: "
7656              << NV("VectorizationFactor", VF.Width)
7657              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7658     });
7659   }
7660 
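  // If follow-up metadata was provided for the remainder loop, attach it to
  // the scalar loop; otherwise disable runtime unrolling of the scalar loop
  // where requested and mark the loop as already vectorized.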
7661   Optional<MDNode *> RemainderLoopID =
7662       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7663                                       LLVMLoopVectorizeFollowupEpilogue});
7664   if (RemainderLoopID.hasValue()) {
7665     L->setLoopID(RemainderLoopID.getValue());
7666   } else {
7667     if (DisableRuntimeUnroll)
7668       AddRuntimeUnrollDisableMetaData(L);
7669 
7670     // Mark the loop as already vectorized to avoid vectorizing again.
7671     Hints.setAlreadyVectorized();
7672   }
7673 
7674   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7675   return true;
7676 }
7677 
7678 bool LoopVectorizePass::runImpl(
7679     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7680     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7681     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7682     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7683     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
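  // Cache the analysis results so that the per-loop processing below can use
  // them.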
7684   SE = &SE_;
7685   LI = &LI_;
7686   TTI = &TTI_;
7687   DT = &DT_;
7688   BFI = &BFI_;
7689   TLI = TLI_;
7690   AA = &AA_;
7691   AC = &AC_;
7692   GetLAA = &GetLAA_;
7693   DB = &DB_;
7694   ORE = &ORE_;
7695   PSI = PSI_;
7696 
7697   // Don't attempt if
7698   // 1. the target claims to have no vector registers, and
7699   // 2. interleaving won't help ILP.
7700   //
7701   // The second condition is necessary because, even if the target has no
7702   // vector registers, loop vectorization may still enable scalar
7703   // interleaving.
7704   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7705     return false;
7706 
7707   bool Changed = false;
7708 
7709   // The vectorizer requires loops to be in simplified form.
7710   // Since simplification may add new inner loops, it has to run before the
7711   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7713   // vectorized.
7714   for (auto &L : *LI)
7715     Changed |=
7716         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7717 
7718   // Build up a worklist of inner-loops to vectorize. This is necessary as
7719   // the act of vectorizing or partially unrolling a loop creates new loops
7720   // and can invalidate iterators across the loops.
7721   SmallVector<Loop *, 8> Worklist;
7722 
7723   for (Loop *L : *LI)
7724     collectSupportedLoops(*L, LI, ORE, Worklist);
7725 
7726   LoopsAnalyzed += Worklist.size();
7727 
7728   // Now walk the identified inner loops.
7729   while (!Worklist.empty()) {
7730     Loop *L = Worklist.pop_back_val();
7731 
7732     // For the inner loops we actually process, form LCSSA to simplify the
7733     // transform.
7734     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7735 
7736     Changed |= processLoop(L);
7737   }
7738 
7739   // Process each loop nest in the function.
7740   return Changed;
7741 }
7742 
7743 PreservedAnalyses LoopVectorizePass::run(Function &F,
7744                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;
7758 
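  // LoopAccessInfo is a loop analysis; provide a callback that fetches it
  // lazily through the inner loop analysis manager.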
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve the LoopInfo and DominatorTree analyses with
  // outer loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
7786 }
7787