1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanHCFGTransforms.h"
62 #include "VPlanPredicator.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cstdint>
144 #include <cstdlib>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <memory>
149 #include <string>
150 #include <tuple>
151 #include <utility>
152 #include <vector>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
180 // Indicates that an epilogue is undesired, predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185     cl::desc("Indicate that an epilogue is undesired, predication should be "
186              "used instead."));
187 
188 static cl::opt<bool> MaximizeBandwidth(
189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190     cl::desc("Maximize bandwidth when selecting vectorization factor which "
191              "will be determined by the smallest type in loop."));
192 
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
202 
203 /// We don't interleave loops with a known constant trip count below this
204 /// number.
205 static const unsigned TinyTripCountInterleaveThreshold = 128;
206 
207 static cl::opt<unsigned> ForceTargetNumScalarRegs(
208     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
209     cl::desc("A flag that overrides the target's number of scalar registers."));
210 
211 static cl::opt<unsigned> ForceTargetNumVectorRegs(
212     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
213     cl::desc("A flag that overrides the target's number of vector registers."));
214 
215 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
216     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
217     cl::desc("A flag that overrides the target's max interleave factor for "
218              "scalar loops."));
219 
220 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
221     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
222     cl::desc("A flag that overrides the target's max interleave factor for "
223              "vectorized loops."));
224 
225 static cl::opt<unsigned> ForceTargetInstructionCost(
226     "force-target-instruction-cost", cl::init(0), cl::Hidden,
227     cl::desc("A flag that overrides the target's expected cost for "
228              "an instruction to a single constant value. Mostly "
229              "useful for getting consistent testing."));
230 
231 static cl::opt<unsigned> SmallLoopCost(
232     "small-loop-cost", cl::init(20), cl::Hidden,
233     cl::desc(
234         "The cost of a loop that is considered 'small' by the interleaver."));
235 
236 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
237     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
238     cl::desc("Enable the use of the block frequency analysis to access PGO "
239              "heuristics minimizing code growth in cold regions and being more "
240              "aggressive in hot regions."));
241 
242 // Runtime interleave loops for load/store throughput.
243 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
244     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
245     cl::desc(
246         "Enable runtime interleaving until load/store ports are saturated"));
247 
248 /// The number of stores in a loop that are allowed to need predication.
249 static cl::opt<unsigned> NumberOfStoresToPredicate(
250     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
251     cl::desc("Max number of stores to be predicated behind an if."));
252 
253 static cl::opt<bool> EnableIndVarRegisterHeur(
254     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
255     cl::desc("Count the induction variable only once when interleaving"));
256 
257 static cl::opt<bool> EnableCondStoresVectorization(
258     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
259     cl::desc("Enable if predication of stores during vectorization."));
260 
261 static cl::opt<unsigned> MaxNestedScalarReductionIC(
262     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
263     cl::desc("The maximum interleave count to use when interleaving a scalar "
264              "reduction in a nested loop."));
265 
266 cl::opt<bool> EnableVPlanNativePath(
267     "enable-vplan-native-path", cl::init(false), cl::Hidden,
268     cl::desc("Enable VPlan-native vectorization path with "
269              "support for outer loop vectorization."));
270 
271 // FIXME: Remove this switch once we have divergence analysis. Currently we
272 // assume divergent non-backedge branches when this switch is true.
273 cl::opt<bool> EnableVPlanPredication(
274     "enable-vplan-predication", cl::init(false), cl::Hidden,
275     cl::desc("Enable VPlan-native vectorization path predicator with "
276              "support for outer loop vectorization."));
277 
278 // This flag enables the stress testing of the VPlan H-CFG construction in the
279 // VPlan-native vectorization path. It must be used in conjuction with
280 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
281 // verification of the H-CFGs built.
282 static cl::opt<bool> VPlanBuildStressTest(
283     "vplan-build-stress-test", cl::init(false), cl::Hidden,
284     cl::desc(
285         "Build VPlan for every supported loop nest in the function and bail "
286         "out right after the build (stress test the VPlan H-CFG construction "
287         "in the VPlan-native vectorization path)."));
288 
289 cl::opt<bool> llvm::EnableLoopInterleaving(
290     "interleave-loops", cl::init(true), cl::Hidden,
291     cl::desc("Enable loop interleaving in Loop vectorization passes"));
292 cl::opt<bool> llvm::EnableLoopVectorization(
293     "vectorize-loops", cl::init(true), cl::Hidden,
294     cl::desc("Run the Loop vectorization passes"));
295 
296 /// A helper function for converting Scalar types to vector types.
297 /// If the incoming type is void, we return void. If the VF is 1, we return
298 /// the scalar type.
299 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
300   if (Scalar->isVoidTy() || VF == 1)
301     return Scalar;
302   return VectorType::get(Scalar, VF);
303 }
304 
305 /// A helper function that returns the type of loaded or stored value.
306 static Type *getMemInstValueType(Value *I) {
307   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
308          "Expected Load or Store instruction");
309   if (auto *LI = dyn_cast<LoadInst>(I))
310     return LI->getType();
311   return cast<StoreInst>(I)->getValueOperand()->getType();
312 }
313 
314 /// A helper function that returns true if the given type is irregular. The
315 /// type is irregular if its allocated size doesn't equal the store size of an
316 /// element of the corresponding vector type at the given vectorization factor.
317 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
318   // Determine if an array of VF elements of type Ty is "bitcast compatible"
319   // with a <VF x Ty> vector.
320   if (VF > 1) {
321     auto *VectorTy = VectorType::get(Ty, VF);
322     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
323   }
324 
325   // If the vectorization factor is one, we just check if an array of type Ty
326   // requires padding between elements.
327   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
328 }
329 
330 /// A helper function that returns the reciprocal of the block probability of
331 /// predicated blocks. If we return X, we are assuming the predicated block
332 /// will execute once for every X iterations of the loop header.
333 ///
334 /// TODO: We should use actual block probability here, if available. Currently,
335 ///       we always assume predicated blocks have a 50% chance of executing.
336 static unsigned getReciprocalPredBlockProb() { return 2; }
337 
338 /// A helper function that adds a 'fast' flag to floating-point operations.
339 static Value *addFastMathFlag(Value *V) {
340   if (isa<FPMathOperator>(V))
341     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
342   return V;
343 }
344 
345 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
346   if (isa<FPMathOperator>(V))
347     cast<Instruction>(V)->setFastMathFlags(FMF);
348   return V;
349 }
350 
351 /// A helper function that returns an integer or floating-point constant with
352 /// value C.
353 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
354   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
355                            : ConstantFP::get(Ty, C);
356 }
357 
358 namespace llvm {
359 
360 /// InnerLoopVectorizer vectorizes loops which contain only one basic
361 /// block to a specified vectorization factor (VF).
362 /// This class performs the widening of scalars into vectors, or multiple
363 /// scalars. This class also implements the following features:
364 /// * It inserts an epilogue loop for handling loops that don't have iteration
365 ///   counts that are known to be a multiple of the vectorization factor.
366 /// * It handles the code generation for reduction variables.
367 /// * Scalarization (implementation using scalars) of un-vectorizable
368 ///   instructions.
369 /// InnerLoopVectorizer does not perform any vectorization-legality
370 /// checks, and relies on the caller to check for the different legality
371 /// aspects. The InnerLoopVectorizer relies on the
372 /// LoopVectorizationLegality class to provide information about the induction
373 /// and reduction variables that were found to a given vectorization factor.
374 class InnerLoopVectorizer {
375 public:
376   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
377                       LoopInfo *LI, DominatorTree *DT,
378                       const TargetLibraryInfo *TLI,
379                       const TargetTransformInfo *TTI, AssumptionCache *AC,
380                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
381                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
382                       LoopVectorizationCostModel *CM)
383       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
384         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
385         Builder(PSE.getSE()->getContext()),
386         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
387   virtual ~InnerLoopVectorizer() = default;
388 
389   /// Create a new empty loop. Unlink the old loop and connect the new one.
390   /// Return the pre-header block of the new loop.
391   BasicBlock *createVectorizedLoopSkeleton();
392 
393   /// Widen a single instruction within the innermost loop.
394   void widenInstruction(Instruction &I);
395 
396   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
397   void fixVectorizedLoop();
398 
399   // Return true if any runtime check is added.
400   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
401 
402   /// A type for vectorized values in the new loop. Each value from the
403   /// original loop, when vectorized, is represented by UF vector values in the
404   /// new unrolled loop, where UF is the unroll factor.
405   using VectorParts = SmallVector<Value *, 2>;
406 
407   /// Vectorize a single PHINode in a block. This method handles the induction
408   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
409   /// arbitrary length vectors.
410   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
411 
412   /// A helper function to scalarize a single Instruction in the innermost loop.
413   /// Generates a sequence of scalar instances for each lane between \p MinLane
414   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
415   /// inclusive..
416   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
417                             bool IfPredicateInstr);
418 
419   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
420   /// is provided, the integer induction variable will first be truncated to
421   /// the corresponding type.
422   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
423 
424   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
425   /// vector or scalar value on-demand if one is not yet available. When
426   /// vectorizing a loop, we visit the definition of an instruction before its
427   /// uses. When visiting the definition, we either vectorize or scalarize the
428   /// instruction, creating an entry for it in the corresponding map. (In some
429   /// cases, such as induction variables, we will create both vector and scalar
430   /// entries.) Then, as we encounter uses of the definition, we derive values
431   /// for each scalar or vector use unless such a value is already available.
432   /// For example, if we scalarize a definition and one of its uses is vector,
433   /// we build the required vector on-demand with an insertelement sequence
434   /// when visiting the use. Otherwise, if the use is scalar, we can use the
435   /// existing scalar definition.
436   ///
437   /// Return a value in the new loop corresponding to \p V from the original
438   /// loop at unroll index \p Part. If the value has already been vectorized,
439   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
440   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
441   /// a new vector value on-demand by inserting the scalar values into a vector
442   /// with an insertelement sequence. If the value has been neither vectorized
443   /// nor scalarized, it must be loop invariant, so we simply broadcast the
444   /// value into a vector.
445   Value *getOrCreateVectorValue(Value *V, unsigned Part);
446 
447   /// Return a value in the new loop corresponding to \p V from the original
448   /// loop at unroll and vector indices \p Instance. If the value has been
449   /// vectorized but not scalarized, the necessary extractelement instruction
450   /// will be generated.
451   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
452 
453   /// Construct the vector value of a scalarized value \p V one lane at a time.
454   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
455 
456   /// Try to vectorize the interleaved access group that \p Instr belongs to,
457   /// optionally masking the vector operations if \p BlockInMask is non-null.
458   void vectorizeInterleaveGroup(Instruction *Instr,
459                                 VectorParts *BlockInMask = nullptr);
460 
461   /// Vectorize Load and Store instructions, optionally masking the vector
462   /// operations if \p BlockInMask is non-null.
463   void vectorizeMemoryInstruction(Instruction *Instr,
464                                   VectorParts *BlockInMask = nullptr);
465 
466   /// Set the debug location in the builder using the debug location in
467   /// the instruction.
468   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
469 
470   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
471   void fixNonInductionPHIs(void);
472 
473 protected:
474   friend class LoopVectorizationPlanner;
475 
476   /// A small list of PHINodes.
477   using PhiVector = SmallVector<PHINode *, 4>;
478 
479   /// A type for scalarized values in the new loop. Each value from the
480   /// original loop, when scalarized, is represented by UF x VF scalar values
481   /// in the new unrolled loop, where UF is the unroll factor and VF is the
482   /// vectorization factor.
483   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
484 
485   /// Set up the values of the IVs correctly when exiting the vector loop.
486   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
487                     Value *CountRoundDown, Value *EndValue,
488                     BasicBlock *MiddleBlock);
489 
490   /// Create a new induction variable inside L.
491   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
492                                    Value *Step, Instruction *DL);
493 
494   /// Handle all cross-iteration phis in the header.
495   void fixCrossIterationPHIs();
496 
497   /// Fix a first-order recurrence. This is the second phase of vectorizing
498   /// this phi node.
499   void fixFirstOrderRecurrence(PHINode *Phi);
500 
501   /// Fix a reduction cross-iteration phi. This is the second phase of
502   /// vectorizing this phi node.
503   void fixReduction(PHINode *Phi);
504 
505   /// The Loop exit block may have single value PHI nodes with some
506   /// incoming value. While vectorizing we only handled real values
507   /// that were defined inside the loop and we should have one value for
508   /// each predecessor of its parent basic block. See PR14725.
509   void fixLCSSAPHIs();
510 
511   /// Iteratively sink the scalarized operands of a predicated instruction into
512   /// the block that was created for it.
513   void sinkScalarOperands(Instruction *PredInst);
514 
515   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
516   /// represented as.
517   void truncateToMinimalBitwidths();
518 
519   /// Insert the new loop to the loop hierarchy and pass manager
520   /// and update the analysis passes.
521   void updateAnalysis();
522 
523   /// Create a broadcast instruction. This method generates a broadcast
524   /// instruction (shuffle) for loop invariant values and for the induction
525   /// value. If this is the induction variable then we extend it to N, N+1, ...
526   /// this is needed because each iteration in the loop corresponds to a SIMD
527   /// element.
528   virtual Value *getBroadcastInstrs(Value *V);
529 
530   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
531   /// to each vector element of Val. The sequence starts at StartIndex.
532   /// \p Opcode is relevant for FP induction variable.
533   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
534                                Instruction::BinaryOps Opcode =
535                                Instruction::BinaryOpsEnd);
536 
537   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
538   /// variable on which to base the steps, \p Step is the size of the step, and
539   /// \p EntryVal is the value from the original loop that maps to the steps.
540   /// Note that \p EntryVal doesn't have to be an induction variable - it
541   /// can also be a truncate instruction.
542   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
543                         const InductionDescriptor &ID);
544 
545   /// Create a vector induction phi node based on an existing scalar one. \p
546   /// EntryVal is the value from the original loop that maps to the vector phi
547   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
548   /// truncate instruction, instead of widening the original IV, we widen a
549   /// version of the IV truncated to \p EntryVal's type.
550   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
551                                        Value *Step, Instruction *EntryVal);
552 
553   /// Returns true if an instruction \p I should be scalarized instead of
554   /// vectorized for the chosen vectorization factor.
555   bool shouldScalarizeInstruction(Instruction *I) const;
556 
557   /// Returns true if we should generate a scalar version of \p IV.
558   bool needsScalarInduction(Instruction *IV) const;
559 
560   /// If there is a cast involved in the induction variable \p ID, which should
561   /// be ignored in the vectorized loop body, this function records the
562   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
563   /// cast. We had already proved that the casted Phi is equal to the uncasted
564   /// Phi in the vectorized loop (under a runtime guard), and therefore
565   /// there is no need to vectorize the cast - the same value can be used in the
566   /// vector loop for both the Phi and the cast.
567   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
568   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
569   ///
570   /// \p EntryVal is the value from the original loop that maps to the vector
571   /// phi node and is used to distinguish what is the IV currently being
572   /// processed - original one (if \p EntryVal is a phi corresponding to the
573   /// original IV) or the "newly-created" one based on the proof mentioned above
574   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
575   /// latter case \p EntryVal is a TruncInst and we must not record anything for
576   /// that IV, but it's error-prone to expect callers of this routine to care
577   /// about that, hence this explicit parameter.
578   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
579                                              const Instruction *EntryVal,
580                                              Value *VectorLoopValue,
581                                              unsigned Part,
582                                              unsigned Lane = UINT_MAX);
583 
584   /// Generate a shuffle sequence that will reverse the vector Vec.
585   virtual Value *reverseVector(Value *Vec);
586 
587   /// Returns (and creates if needed) the original loop trip count.
588   Value *getOrCreateTripCount(Loop *NewLoop);
589 
590   /// Returns (and creates if needed) the trip count of the widened loop.
591   Value *getOrCreateVectorTripCount(Loop *NewLoop);
592 
593   /// Returns a bitcasted value to the requested vector type.
594   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
595   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
596                                 const DataLayout &DL);
597 
598   /// Emit a bypass check to see if the vector trip count is zero, including if
599   /// it overflows.
600   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
601 
602   /// Emit a bypass check to see if all of the SCEV assumptions we've
603   /// had to make are correct.
604   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
605 
606   /// Emit bypass checks to check any memory assumptions we may have made.
607   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
608 
609   /// Compute the transformed value of Index at offset StartValue using step
610   /// StepValue.
611   /// For integer induction, returns StartValue + Index * StepValue.
612   /// For pointer induction, returns StartValue[Index * StepValue].
613   /// FIXME: The newly created binary instructions should contain nsw/nuw
614   /// flags, which can be found from the original scalar operations.
615   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
616                               const DataLayout &DL,
617                               const InductionDescriptor &ID) const;
618 
619   /// Add additional metadata to \p To that was not present on \p Orig.
620   ///
621   /// Currently this is used to add the noalias annotations based on the
622   /// inserted memchecks.  Use this for instructions that are *cloned* into the
623   /// vector loop.
624   void addNewMetadata(Instruction *To, const Instruction *Orig);
625 
626   /// Add metadata from one instruction to another.
627   ///
628   /// This includes both the original MDs from \p From and additional ones (\see
629   /// addNewMetadata).  Use this for *newly created* instructions in the vector
630   /// loop.
631   void addMetadata(Instruction *To, Instruction *From);
632 
633   /// Similar to the previous function but it adds the metadata to a
634   /// vector of instructions.
635   void addMetadata(ArrayRef<Value *> To, Instruction *From);
636 
637   /// The original loop.
638   Loop *OrigLoop;
639 
640   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
641   /// dynamic knowledge to simplify SCEV expressions and converts them to a
642   /// more usable form.
643   PredicatedScalarEvolution &PSE;
644 
645   /// Loop Info.
646   LoopInfo *LI;
647 
648   /// Dominator Tree.
649   DominatorTree *DT;
650 
651   /// Alias Analysis.
652   AliasAnalysis *AA;
653 
654   /// Target Library Info.
655   const TargetLibraryInfo *TLI;
656 
657   /// Target Transform Info.
658   const TargetTransformInfo *TTI;
659 
660   /// Assumption Cache.
661   AssumptionCache *AC;
662 
663   /// Interface to emit optimization remarks.
664   OptimizationRemarkEmitter *ORE;
665 
666   /// LoopVersioning.  It's only set up (non-null) if memchecks were
667   /// used.
668   ///
669   /// This is currently only used to add no-alias metadata based on the
670   /// memchecks.  The actually versioning is performed manually.
671   std::unique_ptr<LoopVersioning> LVer;
672 
673   /// The vectorization SIMD factor to use. Each vector will have this many
674   /// vector elements.
675   unsigned VF;
676 
677   /// The vectorization unroll factor to use. Each scalar is vectorized to this
678   /// many different vector instructions.
679   unsigned UF;
680 
681   /// The builder that we use
682   IRBuilder<> Builder;
683 
684   // --- Vectorization state ---
685 
686   /// The vector-loop preheader.
687   BasicBlock *LoopVectorPreHeader;
688 
689   /// The scalar-loop preheader.
690   BasicBlock *LoopScalarPreHeader;
691 
692   /// Middle Block between the vector and the scalar.
693   BasicBlock *LoopMiddleBlock;
694 
695   /// The ExitBlock of the scalar loop.
696   BasicBlock *LoopExitBlock;
697 
698   /// The vector loop body.
699   BasicBlock *LoopVectorBody;
700 
701   /// The scalar loop body.
702   BasicBlock *LoopScalarBody;
703 
704   /// A list of all bypass blocks. The first block is the entry of the loop.
705   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
706 
707   /// The new Induction variable which was added to the new block.
708   PHINode *Induction = nullptr;
709 
710   /// The induction variable of the old basic block.
711   PHINode *OldInduction = nullptr;
712 
713   /// Maps values from the original loop to their corresponding values in the
714   /// vectorized loop. A key value can map to either vector values, scalar
715   /// values or both kinds of values, depending on whether the key was
716   /// vectorized and scalarized.
717   VectorizerValueMap VectorLoopValueMap;
718 
719   /// Store instructions that were predicated.
720   SmallVector<Instruction *, 4> PredicatedInstructions;
721 
722   /// Trip count of the original loop.
723   Value *TripCount = nullptr;
724 
725   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
726   Value *VectorTripCount = nullptr;
727 
728   /// The legality analysis.
729   LoopVectorizationLegality *Legal;
730 
731   /// The profitablity analysis.
732   LoopVectorizationCostModel *Cost;
733 
734   // Record whether runtime checks are added.
735   bool AddedSafetyChecks = false;
736 
737   // Holds the end values for each induction variable. We save the end values
738   // so we can later fix-up the external users of the induction variables.
739   DenseMap<PHINode *, Value *> IVEndValues;
740 
741   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
742   // fixed up at the end of vector code generation.
743   SmallVector<PHINode *, 8> OrigPHIsToFix;
744 };
745 
746 class InnerLoopUnroller : public InnerLoopVectorizer {
747 public:
748   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
749                     LoopInfo *LI, DominatorTree *DT,
750                     const TargetLibraryInfo *TLI,
751                     const TargetTransformInfo *TTI, AssumptionCache *AC,
752                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
753                     LoopVectorizationLegality *LVL,
754                     LoopVectorizationCostModel *CM)
755       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
756                             UnrollFactor, LVL, CM) {}
757 
758 private:
759   Value *getBroadcastInstrs(Value *V) override;
760   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
761                        Instruction::BinaryOps Opcode =
762                        Instruction::BinaryOpsEnd) override;
763   Value *reverseVector(Value *Vec) override;
764 };
765 
766 } // end namespace llvm
767 
768 /// Look for a meaningful debug location on the instruction or it's
769 /// operands.
770 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
771   if (!I)
772     return I;
773 
774   DebugLoc Empty;
775   if (I->getDebugLoc() != Empty)
776     return I;
777 
778   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
779     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
780       if (OpInst->getDebugLoc() != Empty)
781         return OpInst;
782   }
783 
784   return I;
785 }
786 
787 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
788   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
789     const DILocation *DIL = Inst->getDebugLoc();
790     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
791         !isa<DbgInfoIntrinsic>(Inst)) {
792       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
793       if (NewDIL)
794         B.SetCurrentDebugLocation(NewDIL.getValue());
795       else
796         LLVM_DEBUG(dbgs()
797                    << "Failed to create new discriminator: "
798                    << DIL->getFilename() << " Line: " << DIL->getLine());
799     }
800     else
801       B.SetCurrentDebugLocation(DIL);
802   } else
803     B.SetCurrentDebugLocation(DebugLoc());
804 }
805 
806 /// Write a record \p DebugMsg about vectorization failure to the debug
807 /// output stream. If \p I is passed, it is an instruction that prevents
808 /// vectorization.
809 #ifndef NDEBUG
810 static void debugVectorizationFailure(const StringRef DebugMsg,
811     Instruction *I) {
812   dbgs() << "LV: Not vectorizing: " << DebugMsg;
813   if (I != nullptr)
814     dbgs() << " " << *I;
815   else
816     dbgs() << '.';
817   dbgs() << '\n';
818 }
819 #endif
820 
821 /// Create an analysis remark that explains why vectorization failed
822 ///
823 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
824 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
825 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
826 /// the location of the remark.  \return the remark object that can be
827 /// streamed to.
828 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
829     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
830   Value *CodeRegion = TheLoop->getHeader();
831   DebugLoc DL = TheLoop->getStartLoc();
832 
833   if (I) {
834     CodeRegion = I->getParent();
835     // If there is no debug location attached to the instruction, revert back to
836     // using the loop's.
837     if (I->getDebugLoc())
838       DL = I->getDebugLoc();
839   }
840 
841   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
842   R << "loop not vectorized: ";
843   return R;
844 }
845 
846 namespace llvm {
847 
848 void reportVectorizationFailure(const StringRef DebugMsg,
849     const StringRef OREMsg, const StringRef ORETag,
850     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
851   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
852   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
853   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
854                 ORETag, TheLoop, I) << OREMsg);
855 }
856 
857 } // end namespace llvm
858 
859 #ifndef NDEBUG
860 /// \return string containing a file name and a line # for the given loop.
861 static std::string getDebugLocString(const Loop *L) {
862   std::string Result;
863   if (L) {
864     raw_string_ostream OS(Result);
865     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
866       LoopDbgLoc.print(OS);
867     else
868       // Just print the module name.
869       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
870     OS.flush();
871   }
872   return Result;
873 }
874 #endif
875 
876 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
877                                          const Instruction *Orig) {
878   // If the loop was versioned with memchecks, add the corresponding no-alias
879   // metadata.
880   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
881     LVer->annotateInstWithNoAlias(To, Orig);
882 }
883 
884 void InnerLoopVectorizer::addMetadata(Instruction *To,
885                                       Instruction *From) {
886   propagateMetadata(To, From);
887   addNewMetadata(To, From);
888 }
889 
890 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
891                                       Instruction *From) {
892   for (Value *V : To) {
893     if (Instruction *I = dyn_cast<Instruction>(V))
894       addMetadata(I, From);
895   }
896 }
897 
898 namespace llvm {
899 
900 // Loop vectorization cost-model hints how the scalar epilogue loop should be
901 // lowered.
902 enum ScalarEpilogueLowering {
903 
904   // The default: allowing scalar epilogues.
905   CM_ScalarEpilogueAllowed,
906 
907   // Vectorization with OptForSize: don't allow epilogues.
908   CM_ScalarEpilogueNotAllowedOptSize,
909 
910   // A special case of vectorisation with OptForSize: loops with a very small
911   // trip count are considered for vectorization under OptForSize, thereby
912   // making sure the cost of their loop body is dominant, free of runtime
913   // guards and scalar iteration overheads.
914   CM_ScalarEpilogueNotAllowedLowTripLoop,
915 
916   // Loop hint predicate indicating an epilogue is undesired.
917   CM_ScalarEpilogueNotNeededUsePredicate
918 };
919 
920 /// LoopVectorizationCostModel - estimates the expected speedups due to
921 /// vectorization.
922 /// In many cases vectorization is not profitable. This can happen because of
923 /// a number of reasons. In this class we mainly attempt to predict the
924 /// expected speedup/slowdowns due to the supported instruction set. We use the
925 /// TargetTransformInfo to query the different backends for the cost of
926 /// different operations.
927 class LoopVectorizationCostModel {
928 public:
929   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
930                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
931                              LoopVectorizationLegality *Legal,
932                              const TargetTransformInfo &TTI,
933                              const TargetLibraryInfo *TLI, DemandedBits *DB,
934                              AssumptionCache *AC,
935                              OptimizationRemarkEmitter *ORE, const Function *F,
936                              const LoopVectorizeHints *Hints,
937                              InterleavedAccessInfo &IAI)
938       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
939         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
940         Hints(Hints), InterleaveInfo(IAI) {}
941 
942   /// \return An upper bound for the vectorization factor, or None if
943   /// vectorization and interleaving should be avoided up front.
944   Optional<unsigned> computeMaxVF();
945 
946   /// \return True if runtime checks are required for vectorization, and false
947   /// otherwise.
948   bool runtimeChecksRequired();
949 
950   /// \return The most profitable vectorization factor and the cost of that VF.
951   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
952   /// then this vectorization factor will be selected if vectorization is
953   /// possible.
954   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
955 
956   /// Setup cost-based decisions for user vectorization factor.
957   void selectUserVectorizationFactor(unsigned UserVF) {
958     collectUniformsAndScalars(UserVF);
959     collectInstsToScalarize(UserVF);
960   }
961 
962   /// \return The size (in bits) of the smallest and widest types in the code
963   /// that needs to be vectorized. We ignore values that remain scalar such as
964   /// 64 bit loop indices.
965   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
966 
967   /// \return The desired interleave count.
968   /// If interleave count has been specified by metadata it will be returned.
969   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
970   /// are the selected vectorization factor and the cost of the selected VF.
971   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
972 
973   /// Memory access instruction may be vectorized in more than one way.
974   /// Form of instruction after vectorization depends on cost.
975   /// This function takes cost-based decisions for Load/Store instructions
976   /// and collects them in a map. This decisions map is used for building
977   /// the lists of loop-uniform and loop-scalar instructions.
978   /// The calculated cost is saved with widening decision in order to
979   /// avoid redundant calculations.
980   void setCostBasedWideningDecision(unsigned VF);
981 
982   /// A struct that represents some properties of the register usage
983   /// of a loop.
984   struct RegisterUsage {
985     /// Holds the number of loop invariant values that are used in the loop.
986     unsigned LoopInvariantRegs;
987 
988     /// Holds the maximum number of concurrent live intervals in the loop.
989     unsigned MaxLocalUsers;
990   };
991 
992   /// \return Returns information about the register usages of the loop for the
993   /// given vectorization factors.
994   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
995 
996   /// Collect values we want to ignore in the cost model.
997   void collectValuesToIgnore();
998 
999   /// \returns The smallest bitwidth each instruction can be represented with.
1000   /// The vector equivalents of these instructions should be truncated to this
1001   /// type.
1002   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1003     return MinBWs;
1004   }
1005 
1006   /// \returns True if it is more profitable to scalarize instruction \p I for
1007   /// vectorization factor \p VF.
1008   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1009     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1010 
1011     // Cost model is not run in the VPlan-native path - return conservative
1012     // result until this changes.
1013     if (EnableVPlanNativePath)
1014       return false;
1015 
1016     auto Scalars = InstsToScalarize.find(VF);
1017     assert(Scalars != InstsToScalarize.end() &&
1018            "VF not yet analyzed for scalarization profitability");
1019     return Scalars->second.find(I) != Scalars->second.end();
1020   }
1021 
1022   /// Returns true if \p I is known to be uniform after vectorization.
1023   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1024     if (VF == 1)
1025       return true;
1026 
1027     // Cost model is not run in the VPlan-native path - return conservative
1028     // result until this changes.
1029     if (EnableVPlanNativePath)
1030       return false;
1031 
1032     auto UniformsPerVF = Uniforms.find(VF);
1033     assert(UniformsPerVF != Uniforms.end() &&
1034            "VF not yet analyzed for uniformity");
1035     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1036   }
1037 
1038   /// Returns true if \p I is known to be scalar after vectorization.
1039   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1040     if (VF == 1)
1041       return true;
1042 
1043     // Cost model is not run in the VPlan-native path - return conservative
1044     // result until this changes.
1045     if (EnableVPlanNativePath)
1046       return false;
1047 
1048     auto ScalarsPerVF = Scalars.find(VF);
1049     assert(ScalarsPerVF != Scalars.end() &&
1050            "Scalar values are not calculated for VF");
1051     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1052   }
1053 
1054   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1055   /// for vectorization factor \p VF.
1056   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1057     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1058            !isProfitableToScalarize(I, VF) &&
1059            !isScalarAfterVectorization(I, VF);
1060   }
1061 
1062   /// Decision that was taken during cost calculation for memory instruction.
1063   enum InstWidening {
1064     CM_Unknown,
1065     CM_Widen,         // For consecutive accesses with stride +1.
1066     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1067     CM_Interleave,
1068     CM_GatherScatter,
1069     CM_Scalarize
1070   };
1071 
1072   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1073   /// instruction \p I and vector width \p VF.
1074   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1075                            unsigned Cost) {
1076     assert(VF >= 2 && "Expected VF >=2");
1077     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1078   }
1079 
1080   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1081   /// interleaving group \p Grp and vector width \p VF.
1082   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1083                            InstWidening W, unsigned Cost) {
1084     assert(VF >= 2 && "Expected VF >=2");
1085     /// Broadcast this decicion to all instructions inside the group.
1086     /// But the cost will be assigned to one instruction only.
1087     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1088       if (auto *I = Grp->getMember(i)) {
1089         if (Grp->getInsertPos() == I)
1090           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1091         else
1092           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1093       }
1094     }
1095   }
1096 
1097   /// Return the cost model decision for the given instruction \p I and vector
1098   /// width \p VF. Return CM_Unknown if this instruction did not pass
1099   /// through the cost modeling.
1100   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1101     assert(VF >= 2 && "Expected VF >=2");
1102 
1103     // Cost model is not run in the VPlan-native path - return conservative
1104     // result until this changes.
1105     if (EnableVPlanNativePath)
1106       return CM_GatherScatter;
1107 
1108     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1109     auto Itr = WideningDecisions.find(InstOnVF);
1110     if (Itr == WideningDecisions.end())
1111       return CM_Unknown;
1112     return Itr->second.first;
1113   }
1114 
1115   /// Return the vectorization cost for the given instruction \p I and vector
1116   /// width \p VF.
1117   unsigned getWideningCost(Instruction *I, unsigned VF) {
1118     assert(VF >= 2 && "Expected VF >=2");
1119     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1120     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1121            "The cost is not calculated");
1122     return WideningDecisions[InstOnVF].second;
1123   }
1124 
1125   /// Return True if instruction \p I is an optimizable truncate whose operand
1126   /// is an induction variable. Such a truncate will be removed by adding a new
1127   /// induction variable with the destination type.
1128   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1129     // If the instruction is not a truncate, return false.
1130     auto *Trunc = dyn_cast<TruncInst>(I);
1131     if (!Trunc)
1132       return false;
1133 
1134     // Get the source and destination types of the truncate.
1135     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1136     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1137 
1138     // If the truncate is free for the given types, return false. Replacing a
1139     // free truncate with an induction variable would add an induction variable
1140     // update instruction to each iteration of the loop. We exclude from this
1141     // check the primary induction variable since it will need an update
1142     // instruction regardless.
1143     Value *Op = Trunc->getOperand(0);
1144     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1145       return false;
1146 
1147     // If the truncated value is not an induction variable, return false.
1148     return Legal->isInductionPhi(Op);
1149   }
1150 
1151   /// Collects the instructions to scalarize for each predicated instruction in
1152   /// the loop.
1153   void collectInstsToScalarize(unsigned VF);
1154 
1155   /// Collect Uniform and Scalar values for the given \p VF.
1156   /// The sets depend on CM decision for Load/Store instructions
1157   /// that may be vectorized as interleave, gather-scatter or scalarized.
1158   void collectUniformsAndScalars(unsigned VF) {
1159     // Do the analysis once.
1160     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1161       return;
1162     setCostBasedWideningDecision(VF);
1163     collectLoopUniforms(VF);
1164     collectLoopScalars(VF);
1165   }
1166 
1167   /// Returns true if the target machine supports masked store operation
1168   /// for the given \p DataType and kind of access to \p Ptr.
1169   bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
1170     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
1171   }
1172 
1173   /// Returns true if the target machine supports masked load operation
1174   /// for the given \p DataType and kind of access to \p Ptr.
1175   bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
1176     return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
1177   }
1178 
1179   /// Returns true if the target machine supports masked scatter operation
1180   /// for the given \p DataType.
1181   bool isLegalMaskedScatter(Type *DataType) {
1182     return TTI.isLegalMaskedScatter(DataType);
1183   }
1184 
1185   /// Returns true if the target machine supports masked gather operation
1186   /// for the given \p DataType.
1187   bool isLegalMaskedGather(Type *DataType) {
1188     return TTI.isLegalMaskedGather(DataType);
1189   }
1190 
1191   /// Returns true if the target machine can represent \p V as a masked gather
1192   /// or scatter operation.
1193   bool isLegalGatherOrScatter(Value *V) {
1194     bool LI = isa<LoadInst>(V);
1195     bool SI = isa<StoreInst>(V);
1196     if (!LI && !SI)
1197       return false;
1198     auto *Ty = getMemInstValueType(V);
1199     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1200   }
1201 
1202   /// Returns true if \p I is an instruction that will be scalarized with
1203   /// predication. Such instructions include conditional stores and
1204   /// instructions that may divide by zero.
1205   /// If a non-zero VF has been calculated, we check if I will be scalarized
1206   /// predication for that VF.
1207   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1208 
1209   // Returns true if \p I is an instruction that will be predicated either
1210   // through scalar predication or masked load/store or masked gather/scatter.
1211   // Superset of instructions that return true for isScalarWithPredication.
1212   bool isPredicatedInst(Instruction *I) {
1213     if (!blockNeedsPredication(I->getParent()))
1214       return false;
1215     // Loads and stores that need some form of masked operation are predicated
1216     // instructions.
1217     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1218       return Legal->isMaskRequired(I);
1219     return isScalarWithPredication(I);
1220   }
1221 
1222   /// Returns true if \p I is a memory instruction with consecutive memory
1223   /// access that can be widened.
1224   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1225 
1226   /// Returns true if \p I is a memory instruction in an interleaved-group
1227   /// of memory accesses that can be vectorized with wide vector loads/stores
1228   /// and shuffles.
1229   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1230 
1231   /// Check if \p Instr belongs to any interleaved access group.
1232   bool isAccessInterleaved(Instruction *Instr) {
1233     return InterleaveInfo.isInterleaved(Instr);
1234   }
1235 
1236   /// Get the interleaved access group that \p Instr belongs to.
1237   const InterleaveGroup<Instruction> *
1238   getInterleavedAccessGroup(Instruction *Instr) {
1239     return InterleaveInfo.getInterleaveGroup(Instr);
1240   }
1241 
1242   /// Returns true if an interleaved group requires a scalar iteration
1243   /// to handle accesses with gaps, and there is nothing preventing us from
1244   /// creating a scalar epilogue.
1245   bool requiresScalarEpilogue() const {
1246     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1247   }
1248 
1249   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1250   /// loop hint annotation.
1251   bool isScalarEpilogueAllowed() const {
1252     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1253   }
1254 
1255   /// Returns true if all loop blocks should be masked to fold tail loop.
1256   bool foldTailByMasking() const { return FoldTailByMasking; }
1257 
1258   bool blockNeedsPredication(BasicBlock *BB) {
1259     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1260   }
1261 
1262   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1263   /// with factor VF.  Return the cost of the instruction, including
1264   /// scalarization overhead if it's needed.
1265   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1266 
1267   /// Estimate cost of a call instruction CI if it were vectorized with factor
1268   /// VF. Return the cost of the instruction, including scalarization overhead
1269   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1270   /// scalarized -
1271   /// i.e. either vector version isn't available, or is too expensive.
1272   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1273 
1274 private:
1275   unsigned NumPredStores = 0;
1276 
1277   /// \return An upper bound for the vectorization factor, larger than zero.
1278   /// One is returned if vectorization should best be avoided due to cost.
1279   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1280 
1281   /// The vectorization cost is a combination of the cost itself and a boolean
1282   /// indicating whether any of the contributing operations will actually
1283   /// operate on
1284   /// vector values after type legalization in the backend. If this latter value
1285   /// is
1286   /// false, then all operations will be scalarized (i.e. no vectorization has
1287   /// actually taken place).
1288   using VectorizationCostTy = std::pair<unsigned, bool>;
1289 
1290   /// Returns the expected execution cost. The unit of the cost does
1291   /// not matter because we use the 'cost' units to compare different
1292   /// vector widths. The cost that is returned is *not* normalized by
1293   /// the factor width.
1294   VectorizationCostTy expectedCost(unsigned VF);
1295 
1296   /// Returns the execution time cost of an instruction for a given vector
1297   /// width. Vector width of one means scalar.
1298   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1299 
1300   /// The cost-computation logic from getInstructionCost which provides
1301   /// the vector type as an output parameter.
1302   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1303 
1304   /// Calculate vectorization cost of memory instruction \p I.
1305   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1306 
1307   /// The cost computation for scalarized memory instruction.
1308   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1309 
1310   /// The cost computation for interleaving group of memory instructions.
1311   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1312 
1313   /// The cost computation for Gather/Scatter instruction.
1314   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1315 
1316   /// The cost computation for widening instruction \p I with consecutive
1317   /// memory access.
1318   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1319 
1320   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1321   /// Load: scalar load + broadcast.
1322   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1323   /// element)
1324   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1325 
1326   /// Estimate the overhead of scalarizing an instruction. This is a
1327   /// convenience wrapper for the type-based getScalarizationOverhead API.
1328   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1329 
1330   /// Returns whether the instruction is a load or store and will be a emitted
1331   /// as a vector operation.
1332   bool isConsecutiveLoadOrStore(Instruction *I);
1333 
1334   /// Returns true if an artificially high cost for emulated masked memrefs
1335   /// should be used.
1336   bool useEmulatedMaskMemRefHack(Instruction *I);
1337 
1338   /// Map of scalar integer values to the smallest bitwidth they can be legally
1339   /// represented as. The vector equivalents of these values should be truncated
1340   /// to this type.
1341   MapVector<Instruction *, uint64_t> MinBWs;
1342 
1343   /// A type representing the costs for instructions if they were to be
1344   /// scalarized rather than vectorized. The entries are Instruction-Cost
1345   /// pairs.
1346   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1347 
1348   /// A set containing all BasicBlocks that are known to present after
1349   /// vectorization as a predicated block.
1350   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1351 
1352   /// Records whether it is allowed to have the original scalar loop execute at
1353   /// least once. This may be needed as a fallback loop in case runtime
1354   /// aliasing/dependence checks fail, or to handle the tail/remainder
1355   /// iterations when the trip count is unknown or doesn't divide by the VF,
1356   /// or as a peel-loop to handle gaps in interleave-groups.
1357   /// Under optsize and when the trip count is very small we don't allow any
1358   /// iterations to execute in the scalar loop.
1359   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1360 
1361   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1362   bool FoldTailByMasking = false;
1363 
1364   /// A map holding scalar costs for different vectorization factors. The
1365   /// presence of a cost for an instruction in the mapping indicates that the
1366   /// instruction will be scalarized when vectorizing with the associated
1367   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1368   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1369 
1370   /// Holds the instructions known to be uniform after vectorization.
1371   /// The data is collected per VF.
1372   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1373 
1374   /// Holds the instructions known to be scalar after vectorization.
1375   /// The data is collected per VF.
1376   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1377 
1378   /// Holds the instructions (address computations) that are forced to be
1379   /// scalarized.
1380   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1381 
1382   /// Returns the expected difference in cost from scalarizing the expression
1383   /// feeding a predicated instruction \p PredInst. The instructions to
1384   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1385   /// non-negative return value implies the expression will be scalarized.
1386   /// Currently, only single-use chains are considered for scalarization.
1387   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1388                               unsigned VF);
1389 
1390   /// Collect the instructions that are uniform after vectorization. An
1391   /// instruction is uniform if we represent it with a single scalar value in
1392   /// the vectorized loop corresponding to each vector iteration. Examples of
1393   /// uniform instructions include pointer operands of consecutive or
1394   /// interleaved memory accesses. Note that although uniformity implies an
1395   /// instruction will be scalar, the reverse is not true. In general, a
1396   /// scalarized instruction will be represented by VF scalar values in the
1397   /// vectorized loop, each corresponding to an iteration of the original
1398   /// scalar loop.
1399   void collectLoopUniforms(unsigned VF);
1400 
1401   /// Collect the instructions that are scalar after vectorization. An
1402   /// instruction is scalar if it is known to be uniform or will be scalarized
1403   /// during vectorization. Non-uniform scalarized instructions will be
1404   /// represented by VF values in the vectorized loop, each corresponding to an
1405   /// iteration of the original scalar loop.
1406   void collectLoopScalars(unsigned VF);
1407 
1408   /// Keeps cost model vectorization decision and cost for instructions.
1409   /// Right now it is used for memory instructions only.
1410   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1411                                 std::pair<InstWidening, unsigned>>;
1412 
1413   DecisionList WideningDecisions;
1414 
1415   /// Returns true if \p V is expected to be vectorized and it needs to be
1416   /// extracted.
1417   bool needsExtract(Value *V, unsigned VF) const {
1418     Instruction *I = dyn_cast<Instruction>(V);
1419     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1420       return false;
1421 
1422     // Assume we can vectorize V (and hence we need extraction) if the
1423     // scalars are not computed yet. This can happen, because it is called
1424     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1425     // the scalars are collected. That should be a safe assumption in most
1426     // cases, because we check if the operands have vectorizable types
1427     // beforehand in LoopVectorizationLegality.
1428     return Scalars.find(VF) == Scalars.end() ||
1429            !isScalarAfterVectorization(I, VF);
1430   };
1431 
1432   /// Returns a range containing only operands needing to be extracted.
1433   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1434                                                    unsigned VF) {
1435     return SmallVector<Value *, 4>(make_filter_range(
1436         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1437   }
1438 
1439 public:
1440   /// The loop that we evaluate.
1441   Loop *TheLoop;
1442 
1443   /// Predicated scalar evolution analysis.
1444   PredicatedScalarEvolution &PSE;
1445 
1446   /// Loop Info analysis.
1447   LoopInfo *LI;
1448 
1449   /// Vectorization legality.
1450   LoopVectorizationLegality *Legal;
1451 
1452   /// Vector target information.
1453   const TargetTransformInfo &TTI;
1454 
1455   /// Target Library Info.
1456   const TargetLibraryInfo *TLI;
1457 
1458   /// Demanded bits analysis.
1459   DemandedBits *DB;
1460 
1461   /// Assumption cache.
1462   AssumptionCache *AC;
1463 
1464   /// Interface to emit optimization remarks.
1465   OptimizationRemarkEmitter *ORE;
1466 
1467   const Function *TheFunction;
1468 
1469   /// Loop Vectorize Hint.
1470   const LoopVectorizeHints *Hints;
1471 
1472   /// The interleave access information contains groups of interleaved accesses
1473   /// with the same stride and close to each other.
1474   InterleavedAccessInfo &InterleaveInfo;
1475 
1476   /// Values to ignore in the cost model.
1477   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1478 
1479   /// Values to ignore in the cost model when VF > 1.
1480   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1481 };
1482 
1483 } // end namespace llvm
1484 
1485 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1486 // vectorization. The loop needs to be annotated with #pragma omp simd
1487 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1488 // vector length information is not provided, vectorization is not considered
1489 // explicit. Interleave hints are not allowed either. These limitations will be
1490 // relaxed in the future.
1491 // Please, note that we are currently forced to abuse the pragma 'clang
1492 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1493 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1494 // provides *explicit vectorization hints* (LV can bypass legal checks and
1495 // assume that vectorization is legal). However, both hints are implemented
1496 // using the same metadata (llvm.loop.vectorize, processed by
1497 // LoopVectorizeHints). This will be fixed in the future when the native IR
1498 // representation for pragma 'omp simd' is introduced.
1499 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1500                                    OptimizationRemarkEmitter *ORE) {
1501   assert(!OuterLp->empty() && "This is not an outer loop");
1502   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1503 
1504   // Only outer loops with an explicit vectorization hint are supported.
1505   // Unannotated outer loops are ignored.
1506   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1507     return false;
1508 
1509   Function *Fn = OuterLp->getHeader()->getParent();
1510   if (!Hints.allowVectorization(Fn, OuterLp,
1511                                 true /*VectorizeOnlyWhenForced*/)) {
1512     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1513     return false;
1514   }
1515 
1516   if (Hints.getInterleave() > 1) {
1517     // TODO: Interleave support is future work.
1518     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1519                          "outer loops.\n");
1520     Hints.emitRemarkWithHints();
1521     return false;
1522   }
1523 
1524   return true;
1525 }
1526 
1527 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1528                                   OptimizationRemarkEmitter *ORE,
1529                                   SmallVectorImpl<Loop *> &V) {
1530   // Collect inner loops and outer loops without irreducible control flow. For
1531   // now, only collect outer loops that have explicit vectorization hints. If we
1532   // are stress testing the VPlan H-CFG construction, we collect the outermost
1533   // loop of every loop nest.
1534   if (L.empty() || VPlanBuildStressTest ||
1535       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1536     LoopBlocksRPO RPOT(&L);
1537     RPOT.perform(LI);
1538     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1539       V.push_back(&L);
1540       // TODO: Collect inner loops inside marked outer loops in case
1541       // vectorization fails for the outer loop. Do not invoke
1542       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1543       // already known to be reducible. We can use an inherited attribute for
1544       // that.
1545       return;
1546     }
1547   }
1548   for (Loop *InnerL : L)
1549     collectSupportedLoops(*InnerL, LI, ORE, V);
1550 }
1551 
1552 namespace {
1553 
1554 /// The LoopVectorize Pass.
1555 struct LoopVectorize : public FunctionPass {
1556   /// Pass identification, replacement for typeid
1557   static char ID;
1558 
1559   LoopVectorizePass Impl;
1560 
1561   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1562                          bool VectorizeOnlyWhenForced = false)
1563       : FunctionPass(ID) {
1564     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1565     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1566     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1567   }
1568 
1569   bool runOnFunction(Function &F) override {
1570     if (skipFunction(F))
1571       return false;
1572 
1573     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1574     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1575     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1576     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1577     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1578     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1579     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1580     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1581     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1582     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1583     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1584     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1585     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1586 
1587     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1588         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1589 
1590     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1591                         GetLAA, *ORE, PSI);
1592   }
1593 
1594   void getAnalysisUsage(AnalysisUsage &AU) const override {
1595     AU.addRequired<AssumptionCacheTracker>();
1596     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1597     AU.addRequired<DominatorTreeWrapperPass>();
1598     AU.addRequired<LoopInfoWrapperPass>();
1599     AU.addRequired<ScalarEvolutionWrapperPass>();
1600     AU.addRequired<TargetTransformInfoWrapperPass>();
1601     AU.addRequired<AAResultsWrapperPass>();
1602     AU.addRequired<LoopAccessLegacyAnalysis>();
1603     AU.addRequired<DemandedBitsWrapperPass>();
1604     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1605 
1606     // We currently do not preserve loopinfo/dominator analyses with outer loop
1607     // vectorization. Until this is addressed, mark these analyses as preserved
1608     // only for non-VPlan-native path.
1609     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1610     if (!EnableVPlanNativePath) {
1611       AU.addPreserved<LoopInfoWrapperPass>();
1612       AU.addPreserved<DominatorTreeWrapperPass>();
1613     }
1614 
1615     AU.addPreserved<BasicAAWrapperPass>();
1616     AU.addPreserved<GlobalsAAWrapperPass>();
1617     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1618   }
1619 };
1620 
1621 } // end anonymous namespace
1622 
1623 //===----------------------------------------------------------------------===//
1624 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1625 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1626 //===----------------------------------------------------------------------===//
1627 
1628 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1629   // We need to place the broadcast of invariant variables outside the loop,
1630   // but only if it's proven safe to do so. Else, broadcast will be inside
1631   // vector loop body.
1632   Instruction *Instr = dyn_cast<Instruction>(V);
1633   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1634                      (!Instr ||
1635                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1636   // Place the code for broadcasting invariant variables in the new preheader.
1637   IRBuilder<>::InsertPointGuard Guard(Builder);
1638   if (SafeToHoist)
1639     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1640 
1641   // Broadcast the scalar into all locations in the vector.
1642   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1643 
1644   return Shuf;
1645 }
1646 
1647 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1648     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1649   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1650          "Expected either an induction phi-node or a truncate of it!");
1651   Value *Start = II.getStartValue();
1652 
1653   // Construct the initial value of the vector IV in the vector loop preheader
1654   auto CurrIP = Builder.saveIP();
1655   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1656   if (isa<TruncInst>(EntryVal)) {
1657     assert(Start->getType()->isIntegerTy() &&
1658            "Truncation requires an integer type");
1659     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1660     Step = Builder.CreateTrunc(Step, TruncType);
1661     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1662   }
1663   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1664   Value *SteppedStart =
1665       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1666 
1667   // We create vector phi nodes for both integer and floating-point induction
1668   // variables. Here, we determine the kind of arithmetic we will perform.
1669   Instruction::BinaryOps AddOp;
1670   Instruction::BinaryOps MulOp;
1671   if (Step->getType()->isIntegerTy()) {
1672     AddOp = Instruction::Add;
1673     MulOp = Instruction::Mul;
1674   } else {
1675     AddOp = II.getInductionOpcode();
1676     MulOp = Instruction::FMul;
1677   }
1678 
1679   // Multiply the vectorization factor by the step using integer or
1680   // floating-point arithmetic as appropriate.
1681   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1682   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1683 
1684   // Create a vector splat to use in the induction update.
1685   //
1686   // FIXME: If the step is non-constant, we create the vector splat with
1687   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1688   //        handle a constant vector splat.
1689   Value *SplatVF = isa<Constant>(Mul)
1690                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1691                        : Builder.CreateVectorSplat(VF, Mul);
1692   Builder.restoreIP(CurrIP);
1693 
1694   // We may need to add the step a number of times, depending on the unroll
1695   // factor. The last of those goes into the PHI.
1696   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1697                                     &*LoopVectorBody->getFirstInsertionPt());
1698   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1699   Instruction *LastInduction = VecInd;
1700   for (unsigned Part = 0; Part < UF; ++Part) {
1701     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1702 
1703     if (isa<TruncInst>(EntryVal))
1704       addMetadata(LastInduction, EntryVal);
1705     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1706 
1707     LastInduction = cast<Instruction>(addFastMathFlag(
1708         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1709     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1710   }
1711 
1712   // Move the last step to the end of the latch block. This ensures consistent
1713   // placement of all induction updates.
1714   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1715   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1716   auto *ICmp = cast<Instruction>(Br->getCondition());
1717   LastInduction->moveBefore(ICmp);
1718   LastInduction->setName("vec.ind.next");
1719 
1720   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1721   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1722 }
1723 
1724 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1725   return Cost->isScalarAfterVectorization(I, VF) ||
1726          Cost->isProfitableToScalarize(I, VF);
1727 }
1728 
1729 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1730   if (shouldScalarizeInstruction(IV))
1731     return true;
1732   auto isScalarInst = [&](User *U) -> bool {
1733     auto *I = cast<Instruction>(U);
1734     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1735   };
1736   return llvm::any_of(IV->users(), isScalarInst);
1737 }
1738 
1739 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1740     const InductionDescriptor &ID, const Instruction *EntryVal,
1741     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1742   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1743          "Expected either an induction phi-node or a truncate of it!");
1744 
1745   // This induction variable is not the phi from the original loop but the
1746   // newly-created IV based on the proof that casted Phi is equal to the
1747   // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1748   // re-uses the same InductionDescriptor that original IV uses but we don't
1749   // have to do any recording in this case - that is done when original IV is
1750   // processed.
1751   if (isa<TruncInst>(EntryVal))
1752     return;
1753 
1754   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1755   if (Casts.empty())
1756     return;
1757   // Only the first Cast instruction in the Casts vector is of interest.
1758   // The rest of the Casts (if exist) have no uses outside the
1759   // induction update chain itself.
1760   Instruction *CastInst = *Casts.begin();
1761   if (Lane < UINT_MAX)
1762     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1763   else
1764     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1765 }
1766 
1767 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1768   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1769          "Primary induction variable must have an integer type");
1770 
1771   auto II = Legal->getInductionVars()->find(IV);
1772   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1773 
1774   auto ID = II->second;
1775   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1776 
1777   // The scalar value to broadcast. This will be derived from the canonical
1778   // induction variable.
1779   Value *ScalarIV = nullptr;
1780 
1781   // The value from the original loop to which we are mapping the new induction
1782   // variable.
1783   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1784 
1785   // True if we have vectorized the induction variable.
1786   auto VectorizedIV = false;
1787 
1788   // Determine if we want a scalar version of the induction variable. This is
1789   // true if the induction variable itself is not widened, or if it has at
1790   // least one user in the loop that is not widened.
1791   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1792 
1793   // Generate code for the induction step. Note that induction steps are
1794   // required to be loop-invariant
1795   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1796          "Induction step should be loop invariant");
1797   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1798   Value *Step = nullptr;
1799   if (PSE.getSE()->isSCEVable(IV->getType())) {
1800     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1801     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1802                              LoopVectorPreHeader->getTerminator());
1803   } else {
1804     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1805   }
1806 
1807   // Try to create a new independent vector induction variable. If we can't
1808   // create the phi node, we will splat the scalar induction variable in each
1809   // loop iteration.
1810   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1811     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1812     VectorizedIV = true;
1813   }
1814 
1815   // If we haven't yet vectorized the induction variable, or if we will create
1816   // a scalar one, we need to define the scalar induction variable and step
1817   // values. If we were given a truncation type, truncate the canonical
1818   // induction variable and step. Otherwise, derive these values from the
1819   // induction descriptor.
1820   if (!VectorizedIV || NeedsScalarIV) {
1821     ScalarIV = Induction;
1822     if (IV != OldInduction) {
1823       ScalarIV = IV->getType()->isIntegerTy()
1824                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1825                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1826                                           IV->getType());
1827       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1828       ScalarIV->setName("offset.idx");
1829     }
1830     if (Trunc) {
1831       auto *TruncType = cast<IntegerType>(Trunc->getType());
1832       assert(Step->getType()->isIntegerTy() &&
1833              "Truncation requires an integer step");
1834       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1835       Step = Builder.CreateTrunc(Step, TruncType);
1836     }
1837   }
1838 
1839   // If we haven't yet vectorized the induction variable, splat the scalar
1840   // induction variable, and build the necessary step vectors.
1841   // TODO: Don't do it unless the vectorized IV is really required.
1842   if (!VectorizedIV) {
1843     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1844     for (unsigned Part = 0; Part < UF; ++Part) {
1845       Value *EntryPart =
1846           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1847       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1848       if (Trunc)
1849         addMetadata(EntryPart, Trunc);
1850       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1851     }
1852   }
1853 
1854   // If an induction variable is only used for counting loop iterations or
1855   // calculating addresses, it doesn't need to be widened. Create scalar steps
1856   // that can be used by instructions we will later scalarize. Note that the
1857   // addition of the scalar steps will not increase the number of instructions
1858   // in the loop in the common case prior to InstCombine. We will be trading
1859   // one vector extract for each scalar step.
1860   if (NeedsScalarIV)
1861     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1862 }
1863 
1864 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1865                                           Instruction::BinaryOps BinOp) {
1866   // Create and check the types.
1867   assert(Val->getType()->isVectorTy() && "Must be a vector");
1868   int VLen = Val->getType()->getVectorNumElements();
1869 
1870   Type *STy = Val->getType()->getScalarType();
1871   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1872          "Induction Step must be an integer or FP");
1873   assert(Step->getType() == STy && "Step has wrong type");
1874 
1875   SmallVector<Constant *, 8> Indices;
1876 
1877   if (STy->isIntegerTy()) {
1878     // Create a vector of consecutive numbers from zero to VF.
1879     for (int i = 0; i < VLen; ++i)
1880       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1881 
1882     // Add the consecutive indices to the vector value.
1883     Constant *Cv = ConstantVector::get(Indices);
1884     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1885     Step = Builder.CreateVectorSplat(VLen, Step);
1886     assert(Step->getType() == Val->getType() && "Invalid step vec");
1887     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1888     // which can be found from the original scalar operations.
1889     Step = Builder.CreateMul(Cv, Step);
1890     return Builder.CreateAdd(Val, Step, "induction");
1891   }
1892 
1893   // Floating point induction.
1894   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1895          "Binary Opcode should be specified for FP induction");
1896   // Create a vector of consecutive numbers from zero to VF.
1897   for (int i = 0; i < VLen; ++i)
1898     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1899 
1900   // Add the consecutive indices to the vector value.
1901   Constant *Cv = ConstantVector::get(Indices);
1902 
1903   Step = Builder.CreateVectorSplat(VLen, Step);
1904 
1905   // Floating point operations had to be 'fast' to enable the induction.
1906   FastMathFlags Flags;
1907   Flags.setFast();
1908 
1909   Value *MulOp = Builder.CreateFMul(Cv, Step);
1910   if (isa<Instruction>(MulOp))
1911     // Have to check, MulOp may be a constant
1912     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1913 
1914   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1915   if (isa<Instruction>(BOp))
1916     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1917   return BOp;
1918 }
1919 
1920 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1921                                            Instruction *EntryVal,
1922                                            const InductionDescriptor &ID) {
1923   // We shouldn't have to build scalar steps if we aren't vectorizing.
1924   assert(VF > 1 && "VF should be greater than one");
1925 
1926   // Get the value type and ensure it and the step have the same integer type.
1927   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1928   assert(ScalarIVTy == Step->getType() &&
1929          "Val and Step should have the same type");
1930 
1931   // We build scalar steps for both integer and floating-point induction
1932   // variables. Here, we determine the kind of arithmetic we will perform.
1933   Instruction::BinaryOps AddOp;
1934   Instruction::BinaryOps MulOp;
1935   if (ScalarIVTy->isIntegerTy()) {
1936     AddOp = Instruction::Add;
1937     MulOp = Instruction::Mul;
1938   } else {
1939     AddOp = ID.getInductionOpcode();
1940     MulOp = Instruction::FMul;
1941   }
1942 
1943   // Determine the number of scalars we need to generate for each unroll
1944   // iteration. If EntryVal is uniform, we only need to generate the first
1945   // lane. Otherwise, we generate all VF values.
1946   unsigned Lanes =
1947       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1948                                                                          : VF;
1949   // Compute the scalar steps and save the results in VectorLoopValueMap.
1950   for (unsigned Part = 0; Part < UF; ++Part) {
1951     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1952       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1953       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1954       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1955       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1956       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1957     }
1958   }
1959 }
1960 
1961 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1962   assert(V != Induction && "The new induction variable should not be used.");
1963   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1964   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1965 
1966   // If we have a stride that is replaced by one, do it here. Defer this for
1967   // the VPlan-native path until we start running Legal checks in that path.
1968   if (!EnableVPlanNativePath && Legal->hasStride(V))
1969     V = ConstantInt::get(V->getType(), 1);
1970 
1971   // If we have a vector mapped to this value, return it.
1972   if (VectorLoopValueMap.hasVectorValue(V, Part))
1973     return VectorLoopValueMap.getVectorValue(V, Part);
1974 
1975   // If the value has not been vectorized, check if it has been scalarized
1976   // instead. If it has been scalarized, and we actually need the value in
1977   // vector form, we will construct the vector values on demand.
1978   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
1979     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
1980 
1981     // If we've scalarized a value, that value should be an instruction.
1982     auto *I = cast<Instruction>(V);
1983 
1984     // If we aren't vectorizing, we can just copy the scalar map values over to
1985     // the vector map.
1986     if (VF == 1) {
1987       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
1988       return ScalarValue;
1989     }
1990 
1991     // Get the last scalar instruction we generated for V and Part. If the value
1992     // is known to be uniform after vectorization, this corresponds to lane zero
1993     // of the Part unroll iteration. Otherwise, the last instruction is the one
1994     // we created for the last vector lane of the Part unroll iteration.
1995     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
1996     auto *LastInst = cast<Instruction>(
1997         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
1998 
1999     // Set the insert point after the last scalarized instruction. This ensures
2000     // the insertelement sequence will directly follow the scalar definitions.
2001     auto OldIP = Builder.saveIP();
2002     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2003     Builder.SetInsertPoint(&*NewIP);
2004 
2005     // However, if we are vectorizing, we need to construct the vector values.
2006     // If the value is known to be uniform after vectorization, we can just
2007     // broadcast the scalar value corresponding to lane zero for each unroll
2008     // iteration. Otherwise, we construct the vector values using insertelement
2009     // instructions. Since the resulting vectors are stored in
2010     // VectorLoopValueMap, we will only generate the insertelements once.
2011     Value *VectorValue = nullptr;
2012     if (Cost->isUniformAfterVectorization(I, VF)) {
2013       VectorValue = getBroadcastInstrs(ScalarValue);
2014       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2015     } else {
2016       // Initialize packing with insertelements to start from undef.
2017       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2018       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2019       for (unsigned Lane = 0; Lane < VF; ++Lane)
2020         packScalarIntoVectorValue(V, {Part, Lane});
2021       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2022     }
2023     Builder.restoreIP(OldIP);
2024     return VectorValue;
2025   }
2026 
2027   // If this scalar is unknown, assume that it is a constant or that it is
2028   // loop invariant. Broadcast V and save the value for future uses.
2029   Value *B = getBroadcastInstrs(V);
2030   VectorLoopValueMap.setVectorValue(V, Part, B);
2031   return B;
2032 }
2033 
2034 Value *
2035 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2036                                             const VPIteration &Instance) {
2037   // If the value is not an instruction contained in the loop, it should
2038   // already be scalar.
2039   if (OrigLoop->isLoopInvariant(V))
2040     return V;
2041 
2042   assert(Instance.Lane > 0
2043              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2044              : true && "Uniform values only have lane zero");
2045 
2046   // If the value from the original loop has not been vectorized, it is
2047   // represented by UF x VF scalar values in the new loop. Return the requested
2048   // scalar value.
2049   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2050     return VectorLoopValueMap.getScalarValue(V, Instance);
2051 
2052   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2053   // for the given unroll part. If this entry is not a vector type (i.e., the
2054   // vectorization factor is one), there is no need to generate an
2055   // extractelement instruction.
2056   auto *U = getOrCreateVectorValue(V, Instance.Part);
2057   if (!U->getType()->isVectorTy()) {
2058     assert(VF == 1 && "Value not scalarized has non-vector type");
2059     return U;
2060   }
2061 
2062   // Otherwise, the value from the original loop has been vectorized and is
2063   // represented by UF vector values. Extract and return the requested scalar
2064   // value from the appropriate vector lane.
2065   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2066 }
2067 
2068 void InnerLoopVectorizer::packScalarIntoVectorValue(
2069     Value *V, const VPIteration &Instance) {
2070   assert(V != Induction && "The new induction variable should not be used.");
2071   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2072   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2073 
2074   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2075   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2076   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2077                                             Builder.getInt32(Instance.Lane));
2078   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2079 }
2080 
2081 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2082   assert(Vec->getType()->isVectorTy() && "Invalid type");
2083   SmallVector<Constant *, 8> ShuffleMask;
2084   for (unsigned i = 0; i < VF; ++i)
2085     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2086 
2087   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2088                                      ConstantVector::get(ShuffleMask),
2089                                      "reverse");
2090 }
2091 
2092 // Return whether we allow using masked interleave-groups (for dealing with
2093 // strided loads/stores that reside in predicated blocks, or for dealing
2094 // with gaps).
2095 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2096   // If an override option has been passed in for interleaved accesses, use it.
2097   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2098     return EnableMaskedInterleavedMemAccesses;
2099 
2100   return TTI.enableMaskedInterleavedAccessVectorization();
2101 }
2102 
2103 // Try to vectorize the interleave group that \p Instr belongs to.
2104 //
2105 // E.g. Translate following interleaved load group (factor = 3):
2106 //   for (i = 0; i < N; i+=3) {
2107 //     R = Pic[i];             // Member of index 0
2108 //     G = Pic[i+1];           // Member of index 1
2109 //     B = Pic[i+2];           // Member of index 2
2110 //     ... // do something to R, G, B
2111 //   }
2112 // To:
2113 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2114 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2115 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2116 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2117 //
2118 // Or translate following interleaved store group (factor = 3):
2119 //   for (i = 0; i < N; i+=3) {
2120 //     ... do something to R, G, B
2121 //     Pic[i]   = R;           // Member of index 0
2122 //     Pic[i+1] = G;           // Member of index 1
2123 //     Pic[i+2] = B;           // Member of index 2
2124 //   }
2125 // To:
2126 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2127 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2128 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2129 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2130 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2131 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2132                                                    VectorParts *BlockInMask) {
2133   const InterleaveGroup<Instruction> *Group =
2134       Cost->getInterleavedAccessGroup(Instr);
2135   assert(Group && "Fail to get an interleaved access group.");
2136 
2137   // Skip if current instruction is not the insert position.
2138   if (Instr != Group->getInsertPos())
2139     return;
2140 
2141   const DataLayout &DL = Instr->getModule()->getDataLayout();
2142   Value *Ptr = getLoadStorePointerOperand(Instr);
2143 
2144   // Prepare for the vector type of the interleaved load/store.
2145   Type *ScalarTy = getMemInstValueType(Instr);
2146   unsigned InterleaveFactor = Group->getFactor();
2147   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2148   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2149 
2150   // Prepare for the new pointers.
2151   setDebugLocFromInst(Builder, Ptr);
2152   SmallVector<Value *, 2> NewPtrs;
2153   unsigned Index = Group->getIndex(Instr);
2154 
2155   VectorParts Mask;
2156   bool IsMaskForCondRequired = BlockInMask;
2157   if (IsMaskForCondRequired) {
2158     Mask = *BlockInMask;
2159     // TODO: extend the masked interleaved-group support to reversed access.
2160     assert(!Group->isReverse() && "Reversed masked interleave-group "
2161                                   "not supported.");
2162   }
2163 
2164   // If the group is reverse, adjust the index to refer to the last vector lane
2165   // instead of the first. We adjust the index from the first vector lane,
2166   // rather than directly getting the pointer for lane VF - 1, because the
2167   // pointer operand of the interleaved access is supposed to be uniform. For
2168   // uniform instructions, we're only required to generate a value for the
2169   // first vector lane in each unroll iteration.
2170   if (Group->isReverse())
2171     Index += (VF - 1) * Group->getFactor();
2172 
2173   bool InBounds = false;
2174   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2175     InBounds = gep->isInBounds();
2176 
2177   for (unsigned Part = 0; Part < UF; Part++) {
2178     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2179 
2180     // Notice current instruction could be any index. Need to adjust the address
2181     // to the member of index 0.
2182     //
2183     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2184     //       b = A[i];       // Member of index 0
2185     // Current pointer is pointed to A[i+1], adjust it to A[i].
2186     //
2187     // E.g.  A[i+1] = a;     // Member of index 1
2188     //       A[i]   = b;     // Member of index 0
2189     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2190     // Current pointer is pointed to A[i+2], adjust it to A[i].
2191     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2192     if (InBounds)
2193       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2194 
2195     // Cast to the vector pointer type.
2196     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2197   }
2198 
2199   setDebugLocFromInst(Builder, Instr);
2200   Value *UndefVec = UndefValue::get(VecTy);
2201 
2202   Value *MaskForGaps = nullptr;
2203   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2204     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2205     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2206   }
2207 
2208   // Vectorize the interleaved load group.
2209   if (isa<LoadInst>(Instr)) {
2210     // For each unroll part, create a wide load for the group.
2211     SmallVector<Value *, 2> NewLoads;
2212     for (unsigned Part = 0; Part < UF; Part++) {
2213       Instruction *NewLoad;
2214       if (IsMaskForCondRequired || MaskForGaps) {
2215         assert(useMaskedInterleavedAccesses(*TTI) &&
2216                "masked interleaved groups are not allowed.");
2217         Value *GroupMask = MaskForGaps;
2218         if (IsMaskForCondRequired) {
2219           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2220           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2221           Value *ShuffledMask = Builder.CreateShuffleVector(
2222               Mask[Part], Undefs, RepMask, "interleaved.mask");
2223           GroupMask = MaskForGaps
2224                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2225                                                 MaskForGaps)
2226                           : ShuffledMask;
2227         }
2228         NewLoad =
2229             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2230                                      GroupMask, UndefVec, "wide.masked.vec");
2231       }
2232       else
2233         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2234                                             Group->getAlignment(), "wide.vec");
2235       Group->addMetadata(NewLoad);
2236       NewLoads.push_back(NewLoad);
2237     }
2238 
2239     // For each member in the group, shuffle out the appropriate data from the
2240     // wide loads.
2241     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2242       Instruction *Member = Group->getMember(I);
2243 
2244       // Skip the gaps in the group.
2245       if (!Member)
2246         continue;
2247 
2248       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2249       for (unsigned Part = 0; Part < UF; Part++) {
2250         Value *StridedVec = Builder.CreateShuffleVector(
2251             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2252 
2253         // If this member has different type, cast the result type.
2254         if (Member->getType() != ScalarTy) {
2255           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2256           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2257         }
2258 
2259         if (Group->isReverse())
2260           StridedVec = reverseVector(StridedVec);
2261 
2262         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2263       }
2264     }
2265     return;
2266   }
2267 
2268   // The sub vector type for current instruction.
2269   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2270 
2271   // Vectorize the interleaved store group.
2272   for (unsigned Part = 0; Part < UF; Part++) {
2273     // Collect the stored vector from each member.
2274     SmallVector<Value *, 4> StoredVecs;
2275     for (unsigned i = 0; i < InterleaveFactor; i++) {
2276       // Interleaved store group doesn't allow a gap, so each index has a member
2277       Instruction *Member = Group->getMember(i);
2278       assert(Member && "Fail to get a member from an interleaved store group");
2279 
2280       Value *StoredVec = getOrCreateVectorValue(
2281           cast<StoreInst>(Member)->getValueOperand(), Part);
2282       if (Group->isReverse())
2283         StoredVec = reverseVector(StoredVec);
2284 
2285       // If this member has different type, cast it to a unified type.
2286 
2287       if (StoredVec->getType() != SubVT)
2288         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2289 
2290       StoredVecs.push_back(StoredVec);
2291     }
2292 
2293     // Concatenate all vectors into a wide vector.
2294     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2295 
2296     // Interleave the elements in the wide vector.
2297     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2298     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2299                                               "interleaved.vec");
2300 
2301     Instruction *NewStoreInstr;
2302     if (IsMaskForCondRequired) {
2303       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2304       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2305       Value *ShuffledMask = Builder.CreateShuffleVector(
2306           Mask[Part], Undefs, RepMask, "interleaved.mask");
2307       NewStoreInstr = Builder.CreateMaskedStore(
2308           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
2309     }
2310     else
2311       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2312         Group->getAlignment());
2313 
2314     Group->addMetadata(NewStoreInstr);
2315   }
2316 }
2317 
2318 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2319                                                      VectorParts *BlockInMask) {
2320   // Attempt to issue a wide load.
2321   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2322   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2323 
2324   assert((LI || SI) && "Invalid Load/Store instruction");
2325 
2326   LoopVectorizationCostModel::InstWidening Decision =
2327       Cost->getWideningDecision(Instr, VF);
2328   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2329          "CM decision should be taken at this point");
2330   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2331     return vectorizeInterleaveGroup(Instr);
2332 
2333   Type *ScalarDataTy = getMemInstValueType(Instr);
2334   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2335   Value *Ptr = getLoadStorePointerOperand(Instr);
2336   unsigned Alignment = getLoadStoreAlignment(Instr);
2337   // An alignment of 0 means target abi alignment. We need to use the scalar's
2338   // target abi alignment in such a case.
2339   const DataLayout &DL = Instr->getModule()->getDataLayout();
2340   if (!Alignment)
2341     Alignment = DL.getABITypeAlignment(ScalarDataTy);
2342   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2343 
2344   // Determine if the pointer operand of the access is either consecutive or
2345   // reverse consecutive.
2346   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2347   bool ConsecutiveStride =
2348       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2349   bool CreateGatherScatter =
2350       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2351 
2352   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2353   // gather/scatter. Otherwise Decision should have been to Scalarize.
2354   assert((ConsecutiveStride || CreateGatherScatter) &&
2355          "The instruction should be scalarized");
2356 
2357   // Handle consecutive loads/stores.
2358   if (ConsecutiveStride)
2359     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2360 
2361   VectorParts Mask;
2362   bool isMaskRequired = BlockInMask;
2363   if (isMaskRequired)
2364     Mask = *BlockInMask;
2365 
2366   bool InBounds = false;
2367   if (auto *gep = dyn_cast<GetElementPtrInst>(
2368           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2369     InBounds = gep->isInBounds();
2370 
2371   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2372     // Calculate the pointer for the specific unroll-part.
2373     GetElementPtrInst *PartPtr = nullptr;
2374 
2375     if (Reverse) {
2376       // If the address is consecutive but reversed, then the
2377       // wide store needs to start at the last vector element.
2378       PartPtr = cast<GetElementPtrInst>(
2379           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2380       PartPtr->setIsInBounds(InBounds);
2381       PartPtr = cast<GetElementPtrInst>(
2382           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2383       PartPtr->setIsInBounds(InBounds);
2384       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2385         Mask[Part] = reverseVector(Mask[Part]);
2386     } else {
2387       PartPtr = cast<GetElementPtrInst>(
2388           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2389       PartPtr->setIsInBounds(InBounds);
2390     }
2391 
2392     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2393   };
2394 
2395   // Handle Stores:
2396   if (SI) {
2397     setDebugLocFromInst(Builder, SI);
2398 
2399     for (unsigned Part = 0; Part < UF; ++Part) {
2400       Instruction *NewSI = nullptr;
2401       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2402       if (CreateGatherScatter) {
2403         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2404         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2405         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2406                                             MaskPart);
2407       } else {
2408         if (Reverse) {
2409           // If we store to reverse consecutive memory locations, then we need
2410           // to reverse the order of elements in the stored value.
2411           StoredVal = reverseVector(StoredVal);
2412           // We don't want to update the value in the map as it might be used in
2413           // another expression. So don't call resetVectorValue(StoredVal).
2414         }
2415         auto *VecPtr = CreateVecPtr(Part, Ptr);
2416         if (isMaskRequired)
2417           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2418                                             Mask[Part]);
2419         else
2420           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2421       }
2422       addMetadata(NewSI, SI);
2423     }
2424     return;
2425   }
2426 
2427   // Handle loads.
2428   assert(LI && "Must have a load instruction");
2429   setDebugLocFromInst(Builder, LI);
2430   for (unsigned Part = 0; Part < UF; ++Part) {
2431     Value *NewLI;
2432     if (CreateGatherScatter) {
2433       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2434       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2435       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2436                                          nullptr, "wide.masked.gather");
2437       addMetadata(NewLI, LI);
2438     } else {
2439       auto *VecPtr = CreateVecPtr(Part, Ptr);
2440       if (isMaskRequired)
2441         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
2442                                          UndefValue::get(DataTy),
2443                                          "wide.masked.load");
2444       else
2445         NewLI =
2446             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2447 
2448       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2449       addMetadata(NewLI, LI);
2450       if (Reverse)
2451         NewLI = reverseVector(NewLI);
2452     }
2453     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2454   }
2455 }
2456 
2457 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2458                                                const VPIteration &Instance,
2459                                                bool IfPredicateInstr) {
2460   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2461 
2462   setDebugLocFromInst(Builder, Instr);
2463 
2464   // Does this instruction return a value ?
2465   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2466 
2467   Instruction *Cloned = Instr->clone();
2468   if (!IsVoidRetTy)
2469     Cloned->setName(Instr->getName() + ".cloned");
2470 
2471   // Replace the operands of the cloned instructions with their scalar
2472   // equivalents in the new loop.
2473   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2474     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2475     Cloned->setOperand(op, NewOp);
2476   }
2477   addNewMetadata(Cloned, Instr);
2478 
2479   // Place the cloned scalar in the new loop.
2480   Builder.Insert(Cloned);
2481 
2482   // Add the cloned scalar to the scalar map entry.
2483   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2484 
2485   // If we just cloned a new assumption, add it the assumption cache.
2486   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2487     if (II->getIntrinsicID() == Intrinsic::assume)
2488       AC->registerAssumption(II);
2489 
2490   // End if-block.
2491   if (IfPredicateInstr)
2492     PredicatedInstructions.push_back(Cloned);
2493 }
2494 
2495 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2496                                                       Value *End, Value *Step,
2497                                                       Instruction *DL) {
2498   BasicBlock *Header = L->getHeader();
2499   BasicBlock *Latch = L->getLoopLatch();
2500   // As we're just creating this loop, it's possible no latch exists
2501   // yet. If so, use the header as this will be a single block loop.
2502   if (!Latch)
2503     Latch = Header;
2504 
2505   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2506   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2507   setDebugLocFromInst(Builder, OldInst);
2508   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2509 
2510   Builder.SetInsertPoint(Latch->getTerminator());
2511   setDebugLocFromInst(Builder, OldInst);
2512 
2513   // Create i+1 and fill the PHINode.
2514   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2515   Induction->addIncoming(Start, L->getLoopPreheader());
2516   Induction->addIncoming(Next, Latch);
2517   // Create the compare.
2518   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2519   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2520 
2521   // Now we have two terminators. Remove the old one from the block.
2522   Latch->getTerminator()->eraseFromParent();
2523 
2524   return Induction;
2525 }
2526 
2527 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2528   if (TripCount)
2529     return TripCount;
2530 
2531   assert(L && "Create Trip Count for null loop.");
2532   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2533   // Find the loop boundaries.
2534   ScalarEvolution *SE = PSE.getSE();
2535   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2536   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2537          "Invalid loop count");
2538 
2539   Type *IdxTy = Legal->getWidestInductionType();
2540   assert(IdxTy && "No type for induction");
2541 
2542   // The exit count might have the type of i64 while the phi is i32. This can
2543   // happen if we have an induction variable that is sign extended before the
2544   // compare. The only way that we get a backedge taken count is that the
2545   // induction variable was signed and as such will not overflow. In such a case
2546   // truncation is legal.
2547   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2548       IdxTy->getPrimitiveSizeInBits())
2549     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2550   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2551 
2552   // Get the total trip count from the count by adding 1.
2553   const SCEV *ExitCount = SE->getAddExpr(
2554       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2555 
2556   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2557 
2558   // Expand the trip count and place the new instructions in the preheader.
2559   // Notice that the pre-header does not change, only the loop body.
2560   SCEVExpander Exp(*SE, DL, "induction");
2561 
2562   // Count holds the overall loop count (N).
2563   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2564                                 L->getLoopPreheader()->getTerminator());
2565 
2566   if (TripCount->getType()->isPointerTy())
2567     TripCount =
2568         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2569                                     L->getLoopPreheader()->getTerminator());
2570 
2571   return TripCount;
2572 }
2573 
2574 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2575   if (VectorTripCount)
2576     return VectorTripCount;
2577 
2578   Value *TC = getOrCreateTripCount(L);
2579   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2580 
2581   Type *Ty = TC->getType();
2582   Constant *Step = ConstantInt::get(Ty, VF * UF);
2583 
2584   // If the tail is to be folded by masking, round the number of iterations N
2585   // up to a multiple of Step instead of rounding down. This is done by first
2586   // adding Step-1 and then rounding down. Note that it's ok if this addition
2587   // overflows: the vector induction variable will eventually wrap to zero given
2588   // that it starts at zero and its Step is a power of two; the loop will then
2589   // exit, with the last early-exit vector comparison also producing all-true.
2590   if (Cost->foldTailByMasking()) {
2591     assert(isPowerOf2_32(VF * UF) &&
2592            "VF*UF must be a power of 2 when folding tail by masking");
2593     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2594   }
2595 
2596   // Now we need to generate the expression for the part of the loop that the
2597   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2598   // iterations are not required for correctness, or N - Step, otherwise. Step
2599   // is equal to the vectorization factor (number of SIMD elements) times the
2600   // unroll factor (number of SIMD instructions).
2601   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2602 
2603   // If there is a non-reversed interleaved group that may speculatively access
2604   // memory out-of-bounds, we need to ensure that there will be at least one
2605   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2606   // the trip count, we set the remainder to be equal to the step. If the step
2607   // does not evenly divide the trip count, no adjustment is necessary since
2608   // there will already be scalar iterations. Note that the minimum iterations
2609   // check ensures that N >= Step.
2610   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2611     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2612     R = Builder.CreateSelect(IsZero, Step, R);
2613   }
2614 
2615   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2616 
2617   return VectorTripCount;
2618 }
2619 
2620 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2621                                                    const DataLayout &DL) {
2622   // Verify that V is a vector type with same number of elements as DstVTy.
2623   unsigned VF = DstVTy->getNumElements();
2624   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2625   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2626   Type *SrcElemTy = SrcVecTy->getElementType();
2627   Type *DstElemTy = DstVTy->getElementType();
2628   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2629          "Vector elements must have same size");
2630 
2631   // Do a direct cast if element types are castable.
2632   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2633     return Builder.CreateBitOrPointerCast(V, DstVTy);
2634   }
2635   // V cannot be directly casted to desired vector type.
2636   // May happen when V is a floating point vector but DstVTy is a vector of
2637   // pointers or vice-versa. Handle this using a two-step bitcast using an
2638   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2639   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2640          "Only one type should be a pointer type");
2641   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2642          "Only one type should be a floating point type");
2643   Type *IntTy =
2644       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2645   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2646   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2647   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2648 }
2649 
2650 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2651                                                          BasicBlock *Bypass) {
2652   Value *Count = getOrCreateTripCount(L);
2653   BasicBlock *BB = L->getLoopPreheader();
2654   IRBuilder<> Builder(BB->getTerminator());
2655 
2656   // Generate code to check if the loop's trip count is less than VF * UF, or
2657   // equal to it in case a scalar epilogue is required; this implies that the
2658   // vector trip count is zero. This check also covers the case where adding one
2659   // to the backedge-taken count overflowed leading to an incorrect trip count
2660   // of zero. In this case we will also jump to the scalar loop.
2661   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2662                                           : ICmpInst::ICMP_ULT;
2663 
2664   // If tail is to be folded, vector loop takes care of all iterations.
2665   Value *CheckMinIters = Builder.getFalse();
2666   if (!Cost->foldTailByMasking())
2667     CheckMinIters = Builder.CreateICmp(
2668         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2669         "min.iters.check");
2670 
2671   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2672   // Update dominator tree immediately if the generated block is a
2673   // LoopBypassBlock because SCEV expansions to generate loop bypass
2674   // checks may query it before the current function is finished.
2675   DT->addNewBlock(NewBB, BB);
2676   if (L->getParentLoop())
2677     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2678   ReplaceInstWithInst(BB->getTerminator(),
2679                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2680   LoopBypassBlocks.push_back(BB);
2681 }
2682 
2683 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2684   BasicBlock *BB = L->getLoopPreheader();
2685 
2686   // Generate the code to check that the SCEV assumptions that we made.
2687   // We want the new basic block to start at the first instruction in a
2688   // sequence of instructions that form a check.
2689   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2690                    "scev.check");
2691   Value *SCEVCheck =
2692       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2693 
2694   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2695     if (C->isZero())
2696       return;
2697 
2698   assert(!BB->getParent()->hasOptSize() &&
2699          "Cannot SCEV check stride or overflow when optimizing for size");
2700 
2701   // Create a new block containing the stride check.
2702   BB->setName("vector.scevcheck");
2703   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2704   // Update dominator tree immediately if the generated block is a
2705   // LoopBypassBlock because SCEV expansions to generate loop bypass
2706   // checks may query it before the current function is finished.
2707   DT->addNewBlock(NewBB, BB);
2708   if (L->getParentLoop())
2709     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2710   ReplaceInstWithInst(BB->getTerminator(),
2711                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2712   LoopBypassBlocks.push_back(BB);
2713   AddedSafetyChecks = true;
2714 }
2715 
2716 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2717   // VPlan-native path does not do any analysis for runtime checks currently.
2718   if (EnableVPlanNativePath)
2719     return;
2720 
2721   BasicBlock *BB = L->getLoopPreheader();
2722 
2723   // Generate the code that checks in runtime if arrays overlap. We put the
2724   // checks into a separate block to make the more common case of few elements
2725   // faster.
2726   Instruction *FirstCheckInst;
2727   Instruction *MemRuntimeCheck;
2728   std::tie(FirstCheckInst, MemRuntimeCheck) =
2729       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2730   if (!MemRuntimeCheck)
2731     return;
2732 
2733   if (BB->getParent()->hasOptSize()) {
2734     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2735            "Cannot emit memory checks when optimizing for size, unless forced "
2736            "to vectorize.");
2737     ORE->emit([&]() {
2738       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2739                                         L->getStartLoc(), L->getHeader())
2740              << "Code-size may be reduced by not forcing "
2741                 "vectorization, or by source-code modifications "
2742                 "eliminating the need for runtime checks "
2743                 "(e.g., adding 'restrict').";
2744     });
2745   }
2746 
2747   // Create a new block containing the memory check.
2748   BB->setName("vector.memcheck");
2749   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2750   // Update dominator tree immediately if the generated block is a
2751   // LoopBypassBlock because SCEV expansions to generate loop bypass
2752   // checks may query it before the current function is finished.
2753   DT->addNewBlock(NewBB, BB);
2754   if (L->getParentLoop())
2755     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2756   ReplaceInstWithInst(BB->getTerminator(),
2757                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2758   LoopBypassBlocks.push_back(BB);
2759   AddedSafetyChecks = true;
2760 
2761   // We currently don't use LoopVersioning for the actual loop cloning but we
2762   // still use it to add the noalias metadata.
2763   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2764                                            PSE.getSE());
2765   LVer->prepareNoAliasMetadata();
2766 }
2767 
2768 Value *InnerLoopVectorizer::emitTransformedIndex(
2769     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2770     const InductionDescriptor &ID) const {
2771 
2772   SCEVExpander Exp(*SE, DL, "induction");
2773   auto Step = ID.getStep();
2774   auto StartValue = ID.getStartValue();
2775   assert(Index->getType() == Step->getType() &&
2776          "Index type does not match StepValue type");
2777 
2778   // Note: the IR at this point is broken. We cannot use SE to create any new
2779   // SCEV and then expand it, hoping that SCEV's simplification will give us
2780   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2781   // lead to various SCEV crashes. So all we can do is to use builder and rely
2782   // on InstCombine for future simplifications. Here we handle some trivial
2783   // cases only.
2784   auto CreateAdd = [&B](Value *X, Value *Y) {
2785     assert(X->getType() == Y->getType() && "Types don't match!");
2786     if (auto *CX = dyn_cast<ConstantInt>(X))
2787       if (CX->isZero())
2788         return Y;
2789     if (auto *CY = dyn_cast<ConstantInt>(Y))
2790       if (CY->isZero())
2791         return X;
2792     return B.CreateAdd(X, Y);
2793   };
2794 
2795   auto CreateMul = [&B](Value *X, Value *Y) {
2796     assert(X->getType() == Y->getType() && "Types don't match!");
2797     if (auto *CX = dyn_cast<ConstantInt>(X))
2798       if (CX->isOne())
2799         return Y;
2800     if (auto *CY = dyn_cast<ConstantInt>(Y))
2801       if (CY->isOne())
2802         return X;
2803     return B.CreateMul(X, Y);
2804   };
2805 
2806   switch (ID.getKind()) {
2807   case InductionDescriptor::IK_IntInduction: {
2808     assert(Index->getType() == StartValue->getType() &&
2809            "Index type does not match StartValue type");
2810     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2811       return B.CreateSub(StartValue, Index);
2812     auto *Offset = CreateMul(
2813         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2814     return CreateAdd(StartValue, Offset);
2815   }
2816   case InductionDescriptor::IK_PtrInduction: {
2817     assert(isa<SCEVConstant>(Step) &&
2818            "Expected constant step for pointer induction");
2819     return B.CreateGEP(
2820         StartValue->getType()->getPointerElementType(), StartValue,
2821         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2822                                            &*B.GetInsertPoint())));
2823   }
2824   case InductionDescriptor::IK_FpInduction: {
2825     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2826     auto InductionBinOp = ID.getInductionBinOp();
2827     assert(InductionBinOp &&
2828            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2829             InductionBinOp->getOpcode() == Instruction::FSub) &&
2830            "Original bin op should be defined for FP induction");
2831 
2832     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2833 
2834     // Floating point operations had to be 'fast' to enable the induction.
2835     FastMathFlags Flags;
2836     Flags.setFast();
2837 
2838     Value *MulExp = B.CreateFMul(StepValue, Index);
2839     if (isa<Instruction>(MulExp))
2840       // We have to check, the MulExp may be a constant.
2841       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2842 
2843     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2844                                "induction");
2845     if (isa<Instruction>(BOp))
2846       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2847 
2848     return BOp;
2849   }
2850   case InductionDescriptor::IK_NoInduction:
2851     return nullptr;
2852   }
2853   llvm_unreachable("invalid enum");
2854 }
2855 
2856 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2857   /*
2858    In this function we generate a new loop. The new loop will contain
2859    the vectorized instructions while the old loop will continue to run the
2860    scalar remainder.
2861 
2862        [ ] <-- loop iteration number check.
2863     /   |
2864    /    v
2865   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2866   |  /  |
2867   | /   v
2868   ||   [ ]     <-- vector pre header.
2869   |/    |
2870   |     v
2871   |    [  ] \
2872   |    [  ]_|   <-- vector loop.
2873   |     |
2874   |     v
2875   |   -[ ]   <--- middle-block.
2876   |  /  |
2877   | /   v
2878   -|- >[ ]     <--- new preheader.
2879    |    |
2880    |    v
2881    |   [ ] \
2882    |   [ ]_|   <-- old scalar loop to handle remainder.
2883     \   |
2884      \  v
2885       >[ ]     <-- exit block.
2886    ...
2887    */
2888 
2889   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2890   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2891   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2892   MDNode *OrigLoopID = OrigLoop->getLoopID();
2893   assert(VectorPH && "Invalid loop structure");
2894   assert(ExitBlock && "Must have an exit block");
2895 
2896   // Some loops have a single integer induction variable, while other loops
2897   // don't. One example is c++ iterators that often have multiple pointer
2898   // induction variables. In the code below we also support a case where we
2899   // don't have a single induction variable.
2900   //
2901   // We try to obtain an induction variable from the original loop as hard
2902   // as possible. However if we don't find one that:
2903   //   - is an integer
2904   //   - counts from zero, stepping by one
2905   //   - is the size of the widest induction variable type
2906   // then we create a new one.
2907   OldInduction = Legal->getPrimaryInduction();
2908   Type *IdxTy = Legal->getWidestInductionType();
2909 
2910   // Split the single block loop into the two loop structure described above.
2911   BasicBlock *VecBody =
2912       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2913   BasicBlock *MiddleBlock =
2914       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2915   BasicBlock *ScalarPH =
2916       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2917 
2918   // Create and register the new vector loop.
2919   Loop *Lp = LI->AllocateLoop();
2920   Loop *ParentLoop = OrigLoop->getParentLoop();
2921 
2922   // Insert the new loop into the loop nest and register the new basic blocks
2923   // before calling any utilities such as SCEV that require valid LoopInfo.
2924   if (ParentLoop) {
2925     ParentLoop->addChildLoop(Lp);
2926     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2927     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2928   } else {
2929     LI->addTopLevelLoop(Lp);
2930   }
2931   Lp->addBasicBlockToLoop(VecBody, *LI);
2932 
2933   // Find the loop boundaries.
2934   Value *Count = getOrCreateTripCount(Lp);
2935 
2936   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2937 
2938   // Now, compare the new count to zero. If it is zero skip the vector loop and
2939   // jump to the scalar loop. This check also covers the case where the
2940   // backedge-taken count is uint##_max: adding one to it will overflow leading
2941   // to an incorrect trip count of zero. In this (rare) case we will also jump
2942   // to the scalar loop.
2943   emitMinimumIterationCountCheck(Lp, ScalarPH);
2944 
2945   // Generate the code to check any assumptions that we've made for SCEV
2946   // expressions.
2947   emitSCEVChecks(Lp, ScalarPH);
2948 
2949   // Generate the code that checks in runtime if arrays overlap. We put the
2950   // checks into a separate block to make the more common case of few elements
2951   // faster.
2952   emitMemRuntimeChecks(Lp, ScalarPH);
2953 
2954   // Generate the induction variable.
2955   // The loop step is equal to the vectorization factor (num of SIMD elements)
2956   // times the unroll factor (num of SIMD instructions).
2957   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2958   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2959   Induction =
2960       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2961                               getDebugLocFromInstOrOperands(OldInduction));
2962 
2963   // We are going to resume the execution of the scalar loop.
2964   // Go over all of the induction variables that we found and fix the
2965   // PHIs that are left in the scalar version of the loop.
2966   // The starting values of PHI nodes depend on the counter of the last
2967   // iteration in the vectorized loop.
2968   // If we come from a bypass edge then we need to start from the original
2969   // start value.
2970 
2971   // This variable saves the new starting index for the scalar loop. It is used
2972   // to test if there are any tail iterations left once the vector loop has
2973   // completed.
2974   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
2975   for (auto &InductionEntry : *List) {
2976     PHINode *OrigPhi = InductionEntry.first;
2977     InductionDescriptor II = InductionEntry.second;
2978 
2979     // Create phi nodes to merge from the  backedge-taken check block.
2980     PHINode *BCResumeVal = PHINode::Create(
2981         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
2982     // Copy original phi DL over to the new one.
2983     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2984     Value *&EndValue = IVEndValues[OrigPhi];
2985     if (OrigPhi == OldInduction) {
2986       // We know what the end value is.
2987       EndValue = CountRoundDown;
2988     } else {
2989       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
2990       Type *StepType = II.getStep()->getType();
2991       Instruction::CastOps CastOp =
2992         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
2993       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
2994       const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2995       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
2996       EndValue->setName("ind.end");
2997     }
2998 
2999     // The new PHI merges the original incoming value, in case of a bypass,
3000     // or the value at the end of the vectorized loop.
3001     BCResumeVal->addIncoming(EndValue, MiddleBlock);
3002 
3003     // Fix the scalar body counter (PHI node).
3004     // The old induction's phi node in the scalar body needs the truncated
3005     // value.
3006     for (BasicBlock *BB : LoopBypassBlocks)
3007       BCResumeVal->addIncoming(II.getStartValue(), BB);
3008     OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
3009   }
3010 
3011   // We need the OrigLoop (scalar loop part) latch terminator to help
3012   // produce correct debug info for the middle block BB instructions.
3013   // The legality check stage guarantees that the loop will have a single
3014   // latch.
3015   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3016          "Scalar loop latch terminator isn't a branch");
3017   BranchInst *ScalarLatchBr =
3018       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3019 
3020   // Add a check in the middle block to see if we have completed
3021   // all of the iterations in the first vector loop.
3022   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3023   // If tail is to be folded, we know we don't need to run the remainder.
3024   Value *CmpN = Builder.getTrue();
3025   if (!Cost->foldTailByMasking()) {
3026     CmpN =
3027         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3028                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3029 
3030     // Here we use the same DebugLoc as the scalar loop latch branch instead
3031     // of the corresponding compare because they may have ended up with
3032     // different line numbers and we want to avoid awkward line stepping while
3033     // debugging. Eg. if the compare has got a line number inside the loop.
3034     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3035   }
3036 
3037   BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3038   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3039   ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3040 
3041   // Get ready to start creating new instructions into the vectorized body.
3042   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3043 
3044   // Save the state.
3045   LoopVectorPreHeader = Lp->getLoopPreheader();
3046   LoopScalarPreHeader = ScalarPH;
3047   LoopMiddleBlock = MiddleBlock;
3048   LoopExitBlock = ExitBlock;
3049   LoopVectorBody = VecBody;
3050   LoopScalarBody = OldBasicBlock;
3051 
3052   Optional<MDNode *> VectorizedLoopID =
3053       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3054                                       LLVMLoopVectorizeFollowupVectorized});
3055   if (VectorizedLoopID.hasValue()) {
3056     Lp->setLoopID(VectorizedLoopID.getValue());
3057 
3058     // Do not setAlreadyVectorized if loop attributes have been defined
3059     // explicitly.
3060     return LoopVectorPreHeader;
3061   }
3062 
3063   // Keep all loop hints from the original loop on the vector loop (we'll
3064   // replace the vectorizer-specific hints below).
3065   if (MDNode *LID = OrigLoop->getLoopID())
3066     Lp->setLoopID(LID);
3067 
3068   LoopVectorizeHints Hints(Lp, true, *ORE);
3069   Hints.setAlreadyVectorized();
3070 
3071   return LoopVectorPreHeader;
3072 }
3073 
3074 // Fix up external users of the induction variable. At this point, we are
3075 // in LCSSA form, with all external PHIs that use the IV having one input value,
3076 // coming from the remainder loop. We need those PHIs to also have a correct
3077 // value for the IV when arriving directly from the middle block.
3078 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3079                                        const InductionDescriptor &II,
3080                                        Value *CountRoundDown, Value *EndValue,
3081                                        BasicBlock *MiddleBlock) {
3082   // There are two kinds of external IV usages - those that use the value
3083   // computed in the last iteration (the PHI) and those that use the penultimate
3084   // value (the value that feeds into the phi from the loop latch).
3085   // We allow both, but they, obviously, have different values.
3086 
3087   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3088 
3089   DenseMap<Value *, Value *> MissingVals;
3090 
3091   // An external user of the last iteration's value should see the value that
3092   // the remainder loop uses to initialize its own IV.
3093   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3094   for (User *U : PostInc->users()) {
3095     Instruction *UI = cast<Instruction>(U);
3096     if (!OrigLoop->contains(UI)) {
3097       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3098       MissingVals[UI] = EndValue;
3099     }
3100   }
3101 
3102   // An external user of the penultimate value need to see EndValue - Step.
3103   // The simplest way to get this is to recompute it from the constituent SCEVs,
3104   // that is Start + (Step * (CRD - 1)).
3105   for (User *U : OrigPhi->users()) {
3106     auto *UI = cast<Instruction>(U);
3107     if (!OrigLoop->contains(UI)) {
3108       const DataLayout &DL =
3109           OrigLoop->getHeader()->getModule()->getDataLayout();
3110       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3111 
3112       IRBuilder<> B(MiddleBlock->getTerminator());
3113       Value *CountMinusOne = B.CreateSub(
3114           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3115       Value *CMO =
3116           !II.getStep()->getType()->isIntegerTy()
3117               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3118                              II.getStep()->getType())
3119               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3120       CMO->setName("cast.cmo");
3121       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3122       Escape->setName("ind.escape");
3123       MissingVals[UI] = Escape;
3124     }
3125   }
3126 
3127   for (auto &I : MissingVals) {
3128     PHINode *PHI = cast<PHINode>(I.first);
3129     // One corner case we have to handle is two IVs "chasing" each-other,
3130     // that is %IV2 = phi [...], [ %IV1, %latch ]
3131     // In this case, if IV1 has an external use, we need to avoid adding both
3132     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3133     // don't already have an incoming value for the middle block.
3134     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3135       PHI->addIncoming(I.second, MiddleBlock);
3136   }
3137 }
3138 
3139 namespace {
3140 
3141 struct CSEDenseMapInfo {
3142   static bool canHandle(const Instruction *I) {
3143     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3144            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3145   }
3146 
3147   static inline Instruction *getEmptyKey() {
3148     return DenseMapInfo<Instruction *>::getEmptyKey();
3149   }
3150 
3151   static inline Instruction *getTombstoneKey() {
3152     return DenseMapInfo<Instruction *>::getTombstoneKey();
3153   }
3154 
3155   static unsigned getHashValue(const Instruction *I) {
3156     assert(canHandle(I) && "Unknown instruction!");
3157     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3158                                                            I->value_op_end()));
3159   }
3160 
3161   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3162     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3163         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3164       return LHS == RHS;
3165     return LHS->isIdenticalTo(RHS);
3166   }
3167 };
3168 
3169 } // end anonymous namespace
3170 
3171 ///Perform cse of induction variable instructions.
3172 static void cse(BasicBlock *BB) {
3173   // Perform simple cse.
3174   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3175   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3176     Instruction *In = &*I++;
3177 
3178     if (!CSEDenseMapInfo::canHandle(In))
3179       continue;
3180 
3181     // Check if we can replace this instruction with any of the
3182     // visited instructions.
3183     if (Instruction *V = CSEMap.lookup(In)) {
3184       In->replaceAllUsesWith(V);
3185       In->eraseFromParent();
3186       continue;
3187     }
3188 
3189     CSEMap[In] = In;
3190   }
3191 }
3192 
3193 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3194                                                        unsigned VF,
3195                                                        bool &NeedToScalarize) {
3196   Function *F = CI->getCalledFunction();
3197   StringRef FnName = CI->getCalledFunction()->getName();
3198   Type *ScalarRetTy = CI->getType();
3199   SmallVector<Type *, 4> Tys, ScalarTys;
3200   for (auto &ArgOp : CI->arg_operands())
3201     ScalarTys.push_back(ArgOp->getType());
3202 
3203   // Estimate cost of scalarized vector call. The source operands are assumed
3204   // to be vectors, so we need to extract individual elements from there,
3205   // execute VF scalar calls, and then gather the result into the vector return
3206   // value.
3207   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3208   if (VF == 1)
3209     return ScalarCallCost;
3210 
3211   // Compute corresponding vector type for return value and arguments.
3212   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3213   for (Type *ScalarTy : ScalarTys)
3214     Tys.push_back(ToVectorTy(ScalarTy, VF));
3215 
3216   // Compute costs of unpacking argument values for the scalar calls and
3217   // packing the return values to a vector.
3218   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3219 
3220   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3221 
3222   // If we can't emit a vector call for this function, then the currently found
3223   // cost is the cost we need to return.
3224   NeedToScalarize = true;
3225   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3226     return Cost;
3227 
3228   // If the corresponding vector cost is cheaper, return its cost.
3229   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3230   if (VectorCallCost < Cost) {
3231     NeedToScalarize = false;
3232     return VectorCallCost;
3233   }
3234   return Cost;
3235 }
3236 
3237 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3238                                                             unsigned VF) {
3239   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3240   assert(ID && "Expected intrinsic call!");
3241 
3242   FastMathFlags FMF;
3243   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3244     FMF = FPMO->getFastMathFlags();
3245 
3246   SmallVector<Value *, 4> Operands(CI->arg_operands());
3247   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3248 }
3249 
3250 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3251   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3252   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3253   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3254 }
3255 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3256   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3257   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3258   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3259 }
3260 
3261 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3262   // For every instruction `I` in MinBWs, truncate the operands, create a
3263   // truncated version of `I` and reextend its result. InstCombine runs
3264   // later and will remove any ext/trunc pairs.
3265   SmallPtrSet<Value *, 4> Erased;
3266   for (const auto &KV : Cost->getMinimalBitwidths()) {
3267     // If the value wasn't vectorized, we must maintain the original scalar
3268     // type. The absence of the value from VectorLoopValueMap indicates that it
3269     // wasn't vectorized.
3270     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3271       continue;
3272     for (unsigned Part = 0; Part < UF; ++Part) {
3273       Value *I = getOrCreateVectorValue(KV.first, Part);
3274       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3275           !isa<Instruction>(I))
3276         continue;
3277       Type *OriginalTy = I->getType();
3278       Type *ScalarTruncatedTy =
3279           IntegerType::get(OriginalTy->getContext(), KV.second);
3280       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3281                                           OriginalTy->getVectorNumElements());
3282       if (TruncatedTy == OriginalTy)
3283         continue;
3284 
3285       IRBuilder<> B(cast<Instruction>(I));
3286       auto ShrinkOperand = [&](Value *V) -> Value * {
3287         if (auto *ZI = dyn_cast<ZExtInst>(V))
3288           if (ZI->getSrcTy() == TruncatedTy)
3289             return ZI->getOperand(0);
3290         return B.CreateZExtOrTrunc(V, TruncatedTy);
3291       };
3292 
3293       // The actual instruction modification depends on the instruction type,
3294       // unfortunately.
3295       Value *NewI = nullptr;
3296       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3297         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3298                              ShrinkOperand(BO->getOperand(1)));
3299 
3300         // Any wrapping introduced by shrinking this operation shouldn't be
3301         // considered undefined behavior. So, we can't unconditionally copy
3302         // arithmetic wrapping flags to NewI.
3303         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3304       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3305         NewI =
3306             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3307                          ShrinkOperand(CI->getOperand(1)));
3308       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3309         NewI = B.CreateSelect(SI->getCondition(),
3310                               ShrinkOperand(SI->getTrueValue()),
3311                               ShrinkOperand(SI->getFalseValue()));
3312       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3313         switch (CI->getOpcode()) {
3314         default:
3315           llvm_unreachable("Unhandled cast!");
3316         case Instruction::Trunc:
3317           NewI = ShrinkOperand(CI->getOperand(0));
3318           break;
3319         case Instruction::SExt:
3320           NewI = B.CreateSExtOrTrunc(
3321               CI->getOperand(0),
3322               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3323           break;
3324         case Instruction::ZExt:
3325           NewI = B.CreateZExtOrTrunc(
3326               CI->getOperand(0),
3327               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3328           break;
3329         }
3330       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3331         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3332         auto *O0 = B.CreateZExtOrTrunc(
3333             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3334         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3335         auto *O1 = B.CreateZExtOrTrunc(
3336             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3337 
3338         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3339       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3340         // Don't do anything with the operands, just extend the result.
3341         continue;
3342       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3343         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3344         auto *O0 = B.CreateZExtOrTrunc(
3345             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3346         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3347         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3348       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3349         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3350         auto *O0 = B.CreateZExtOrTrunc(
3351             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3352         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3353       } else {
3354         // If we don't know what to do, be conservative and don't do anything.
3355         continue;
3356       }
3357 
3358       // Lastly, extend the result.
3359       NewI->takeName(cast<Instruction>(I));
3360       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3361       I->replaceAllUsesWith(Res);
3362       cast<Instruction>(I)->eraseFromParent();
3363       Erased.insert(I);
3364       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3365     }
3366   }
3367 
3368   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3369   for (const auto &KV : Cost->getMinimalBitwidths()) {
3370     // If the value wasn't vectorized, we must maintain the original scalar
3371     // type. The absence of the value from VectorLoopValueMap indicates that it
3372     // wasn't vectorized.
3373     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3374       continue;
3375     for (unsigned Part = 0; Part < UF; ++Part) {
3376       Value *I = getOrCreateVectorValue(KV.first, Part);
3377       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3378       if (Inst && Inst->use_empty()) {
3379         Value *NewI = Inst->getOperand(0);
3380         Inst->eraseFromParent();
3381         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3382       }
3383     }
3384   }
3385 }
3386 
3387 void InnerLoopVectorizer::fixVectorizedLoop() {
3388   // Insert truncates and extends for any truncated instructions as hints to
3389   // InstCombine.
3390   if (VF > 1)
3391     truncateToMinimalBitwidths();
3392 
3393   // Fix widened non-induction PHIs by setting up the PHI operands.
3394   if (OrigPHIsToFix.size()) {
3395     assert(EnableVPlanNativePath &&
3396            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3397     fixNonInductionPHIs();
3398   }
3399 
3400   // At this point every instruction in the original loop is widened to a
3401   // vector form. Now we need to fix the recurrences in the loop. These PHI
3402   // nodes are currently empty because we did not want to introduce cycles.
3403   // This is the second stage of vectorizing recurrences.
3404   fixCrossIterationPHIs();
3405 
3406   // Update the dominator tree.
3407   //
3408   // FIXME: After creating the structure of the new loop, the dominator tree is
3409   //        no longer up-to-date, and it remains that way until we update it
3410   //        here. An out-of-date dominator tree is problematic for SCEV,
3411   //        because SCEVExpander uses it to guide code generation. The
3412   //        vectorizer use SCEVExpanders in several places. Instead, we should
3413   //        keep the dominator tree up-to-date as we go.
3414   updateAnalysis();
3415 
3416   // Fix-up external users of the induction variables.
3417   for (auto &Entry : *Legal->getInductionVars())
3418     fixupIVUsers(Entry.first, Entry.second,
3419                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3420                  IVEndValues[Entry.first], LoopMiddleBlock);
3421 
3422   fixLCSSAPHIs();
3423   for (Instruction *PI : PredicatedInstructions)
3424     sinkScalarOperands(&*PI);
3425 
3426   // Remove redundant induction instructions.
3427   cse(LoopVectorBody);
3428 }
3429 
3430 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3431   // In order to support recurrences we need to be able to vectorize Phi nodes.
3432   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3433   // stage #2: We now need to fix the recurrences by adding incoming edges to
3434   // the currently empty PHI nodes. At this point every instruction in the
3435   // original loop is widened to a vector form so we can use them to construct
3436   // the incoming edges.
3437   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3438     // Handle first-order recurrences and reductions that need to be fixed.
3439     if (Legal->isFirstOrderRecurrence(&Phi))
3440       fixFirstOrderRecurrence(&Phi);
3441     else if (Legal->isReductionVariable(&Phi))
3442       fixReduction(&Phi);
3443   }
3444 }
3445 
3446 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3447   // This is the second phase of vectorizing first-order recurrences. An
3448   // overview of the transformation is described below. Suppose we have the
3449   // following loop.
3450   //
3451   //   for (int i = 0; i < n; ++i)
3452   //     b[i] = a[i] - a[i - 1];
3453   //
3454   // There is a first-order recurrence on "a". For this loop, the shorthand
3455   // scalar IR looks like:
3456   //
3457   //   scalar.ph:
3458   //     s_init = a[-1]
3459   //     br scalar.body
3460   //
3461   //   scalar.body:
3462   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3463   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3464   //     s2 = a[i]
3465   //     b[i] = s2 - s1
3466   //     br cond, scalar.body, ...
3467   //
3468   // In this example, s1 is a recurrence because it's value depends on the
3469   // previous iteration. In the first phase of vectorization, we created a
3470   // temporary value for s1. We now complete the vectorization and produce the
3471   // shorthand vector IR shown below (for VF = 4, UF = 1).
3472   //
3473   //   vector.ph:
3474   //     v_init = vector(..., ..., ..., a[-1])
3475   //     br vector.body
3476   //
3477   //   vector.body
3478   //     i = phi [0, vector.ph], [i+4, vector.body]
3479   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3480   //     v2 = a[i, i+1, i+2, i+3];
3481   //     v3 = vector(v1(3), v2(0, 1, 2))
3482   //     b[i, i+1, i+2, i+3] = v2 - v3
3483   //     br cond, vector.body, middle.block
3484   //
3485   //   middle.block:
3486   //     x = v2(3)
3487   //     br scalar.ph
3488   //
3489   //   scalar.ph:
3490   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3491   //     br scalar.body
3492   //
3493   // After execution completes the vector loop, we extract the next value of
3494   // the recurrence (x) to use as the initial value in the scalar loop.
3495 
3496   // Get the original loop preheader and single loop latch.
3497   auto *Preheader = OrigLoop->getLoopPreheader();
3498   auto *Latch = OrigLoop->getLoopLatch();
3499 
3500   // Get the initial and previous values of the scalar recurrence.
3501   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3502   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3503 
3504   // Create a vector from the initial value.
3505   auto *VectorInit = ScalarInit;
3506   if (VF > 1) {
3507     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3508     VectorInit = Builder.CreateInsertElement(
3509         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3510         Builder.getInt32(VF - 1), "vector.recur.init");
3511   }
3512 
3513   // We constructed a temporary phi node in the first phase of vectorization.
3514   // This phi node will eventually be deleted.
3515   Builder.SetInsertPoint(
3516       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3517 
3518   // Create a phi node for the new recurrence. The current value will either be
3519   // the initial value inserted into a vector or loop-varying vector value.
3520   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3521   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3522 
3523   // Get the vectorized previous value of the last part UF - 1. It appears last
3524   // among all unrolled iterations, due to the order of their construction.
3525   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3526 
3527   // Set the insertion point after the previous value if it is an instruction.
3528   // Note that the previous value may have been constant-folded so it is not
3529   // guaranteed to be an instruction in the vector loop. Also, if the previous
3530   // value is a phi node, we should insert after all the phi nodes to avoid
3531   // breaking basic block verification.
3532   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
3533       isa<PHINode>(PreviousLastPart))
3534     Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3535   else
3536     Builder.SetInsertPoint(
3537         &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
3538 
3539   // We will construct a vector for the recurrence by combining the values for
3540   // the current and previous iterations. This is the required shuffle mask.
3541   SmallVector<Constant *, 8> ShuffleMask(VF);
3542   ShuffleMask[0] = Builder.getInt32(VF - 1);
3543   for (unsigned I = 1; I < VF; ++I)
3544     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3545 
3546   // The vector from which to take the initial value for the current iteration
3547   // (actual or unrolled). Initially, this is the vector phi node.
3548   Value *Incoming = VecPhi;
3549 
3550   // Shuffle the current and previous vector and update the vector parts.
3551   for (unsigned Part = 0; Part < UF; ++Part) {
3552     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3553     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3554     auto *Shuffle =
3555         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3556                                              ConstantVector::get(ShuffleMask))
3557                : Incoming;
3558     PhiPart->replaceAllUsesWith(Shuffle);
3559     cast<Instruction>(PhiPart)->eraseFromParent();
3560     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3561     Incoming = PreviousPart;
3562   }
3563 
3564   // Fix the latch value of the new recurrence in the vector loop.
3565   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3566 
3567   // Extract the last vector element in the middle block. This will be the
3568   // initial value for the recurrence when jumping to the scalar loop.
3569   auto *ExtractForScalar = Incoming;
3570   if (VF > 1) {
3571     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3572     ExtractForScalar = Builder.CreateExtractElement(
3573         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3574   }
3575   // Extract the second last element in the middle block if the
3576   // Phi is used outside the loop. We need to extract the phi itself
3577   // and not the last element (the phi update in the current iteration). This
3578   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3579   // when the scalar loop is not run at all.
3580   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3581   if (VF > 1)
3582     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3583         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3584   // When loop is unrolled without vectorizing, initialize
3585   // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
3586   // `Incoming`. This is analogous to the vectorized case above: extracting the
3587   // second last element when VF > 1.
3588   else if (UF > 1)
3589     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3590 
3591   // Fix the initial value of the original recurrence in the scalar loop.
3592   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3593   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3594   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3595     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3596     Start->addIncoming(Incoming, BB);
3597   }
3598 
3599   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3600   Phi->setName("scalar.recur");
3601 
3602   // Finally, fix users of the recurrence outside the loop. The users will need
3603   // either the last value of the scalar recurrence or the last value of the
3604   // vector recurrence we extracted in the middle block. Since the loop is in
3605   // LCSSA form, we just need to find all the phi nodes for the original scalar
3606   // recurrence in the exit block, and then add an edge for the middle block.
3607   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3608     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3609       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3610     }
3611   }
3612 }
3613 
3614 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3615   Constant *Zero = Builder.getInt32(0);
3616 
3617   // Get it's reduction variable descriptor.
3618   assert(Legal->isReductionVariable(Phi) &&
3619          "Unable to find the reduction variable");
3620   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3621 
3622   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3623   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3624   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3625   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3626     RdxDesc.getMinMaxRecurrenceKind();
3627   setDebugLocFromInst(Builder, ReductionStartValue);
3628 
3629   // We need to generate a reduction vector from the incoming scalar.
3630   // To do so, we need to generate the 'identity' vector and override
3631   // one of the elements with the incoming scalar reduction. We need
3632   // to do it in the vector-loop preheader.
3633   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3634 
3635   // This is the vector-clone of the value that leaves the loop.
3636   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3637 
3638   // Find the reduction identity variable. Zero for addition, or, xor,
3639   // one for multiplication, -1 for And.
3640   Value *Identity;
3641   Value *VectorStart;
3642   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3643       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3644     // MinMax reduction have the start value as their identify.
3645     if (VF == 1) {
3646       VectorStart = Identity = ReductionStartValue;
3647     } else {
3648       VectorStart = Identity =
3649         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3650     }
3651   } else {
3652     // Handle other reduction kinds:
3653     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3654         RK, VecTy->getScalarType());
3655     if (VF == 1) {
3656       Identity = Iden;
3657       // This vector is the Identity vector where the first element is the
3658       // incoming scalar reduction.
3659       VectorStart = ReductionStartValue;
3660     } else {
3661       Identity = ConstantVector::getSplat(VF, Iden);
3662 
3663       // This vector is the Identity vector where the first element is the
3664       // incoming scalar reduction.
3665       VectorStart =
3666         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3667     }
3668   }
3669 
3670   // Fix the vector-loop phi.
3671 
3672   // Reductions do not have to start at zero. They can start with
3673   // any loop invariant values.
3674   BasicBlock *Latch = OrigLoop->getLoopLatch();
3675   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3676   for (unsigned Part = 0; Part < UF; ++Part) {
3677     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3678     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3679     // Make sure to add the reduction stat value only to the
3680     // first unroll part.
3681     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3682     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3683     cast<PHINode>(VecRdxPhi)
3684       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3685   }
3686 
3687   // Before each round, move the insertion point right between
3688   // the PHIs and the values we are going to write.
3689   // This allows us to write both PHINodes and the extractelement
3690   // instructions.
3691   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3692 
3693   setDebugLocFromInst(Builder, LoopExitInst);
3694 
3695   // If tail is folded by masking, the vector value to leave the loop should be
3696   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3697   // instead of the former.
3698   if (Cost->foldTailByMasking()) {
3699     for (unsigned Part = 0; Part < UF; ++Part) {
3700       Value *VecLoopExitInst =
3701           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3702       Value *Sel = nullptr;
3703       for (User *U : VecLoopExitInst->users()) {
3704         if (isa<SelectInst>(U)) {
3705           assert(!Sel && "Reduction exit feeding two selects");
3706           Sel = U;
3707         } else
3708           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3709       }
3710       assert(Sel && "Reduction exit feeds no select");
3711       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3712     }
3713   }
3714 
3715   // If the vector reduction can be performed in a smaller type, we truncate
3716   // then extend the loop exit value to enable InstCombine to evaluate the
3717   // entire expression in the smaller type.
3718   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3719     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3720     Builder.SetInsertPoint(
3721         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3722     VectorParts RdxParts(UF);
3723     for (unsigned Part = 0; Part < UF; ++Part) {
3724       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3725       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3726       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3727                                         : Builder.CreateZExt(Trunc, VecTy);
3728       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3729            UI != RdxParts[Part]->user_end();)
3730         if (*UI != Trunc) {
3731           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3732           RdxParts[Part] = Extnd;
3733         } else {
3734           ++UI;
3735         }
3736     }
3737     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3738     for (unsigned Part = 0; Part < UF; ++Part) {
3739       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3740       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3741     }
3742   }
3743 
3744   // Reduce all of the unrolled parts into a single vector.
3745   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3746   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3747 
3748   // The middle block terminator has already been assigned a DebugLoc here (the
3749   // OrigLoop's single latch terminator). We want the whole middle block to
3750   // appear to execute on this line because: (a) it is all compiler generated,
3751   // (b) these instructions are always executed after evaluating the latch
3752   // conditional branch, and (c) other passes may add new predecessors which
3753   // terminate on this line. This is the easiest way to ensure we don't
3754   // accidentally cause an extra step back into the loop while debugging.
3755   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3756   for (unsigned Part = 1; Part < UF; ++Part) {
3757     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3758     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3759       // Floating point operations had to be 'fast' to enable the reduction.
3760       ReducedPartRdx = addFastMathFlag(
3761           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3762                               ReducedPartRdx, "bin.rdx"),
3763           RdxDesc.getFastMathFlags());
3764     else
3765       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3766                                       RdxPart);
3767   }
3768 
3769   if (VF > 1) {
3770     bool NoNaN = Legal->hasFunNoNaNAttr();
3771     ReducedPartRdx =
3772         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3773     // If the reduction can be performed in a smaller type, we need to extend
3774     // the reduction to the wider type before we branch to the original loop.
3775     if (Phi->getType() != RdxDesc.getRecurrenceType())
3776       ReducedPartRdx =
3777         RdxDesc.isSigned()
3778         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3779         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3780   }
3781 
3782   // Create a phi node that merges control-flow from the backedge-taken check
3783   // block and the middle block.
3784   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3785                                         LoopScalarPreHeader->getTerminator());
3786   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3787     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3788   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3789 
3790   // Now, we need to fix the users of the reduction variable
3791   // inside and outside of the scalar remainder loop.
3792   // We know that the loop is in LCSSA form. We need to update the
3793   // PHI nodes in the exit blocks.
3794   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3795     // All PHINodes need to have a single entry edge, or two if
3796     // we already fixed them.
3797     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3798 
3799     // We found a reduction value exit-PHI. Update it with the
3800     // incoming bypass edge.
3801     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3802       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3803   } // end of the LCSSA phi scan.
3804 
3805     // Fix the scalar loop reduction variable with the incoming reduction sum
3806     // from the vector body and from the backedge value.
3807   int IncomingEdgeBlockIdx =
3808     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3809   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3810   // Pick the other block.
3811   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3812   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3813   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3814 }
3815 
3816 void InnerLoopVectorizer::fixLCSSAPHIs() {
3817   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3818     if (LCSSAPhi.getNumIncomingValues() == 1) {
3819       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3820       // Non-instruction incoming values will have only one value.
3821       unsigned LastLane = 0;
3822       if (isa<Instruction>(IncomingValue))
3823           LastLane = Cost->isUniformAfterVectorization(
3824                          cast<Instruction>(IncomingValue), VF)
3825                          ? 0
3826                          : VF - 1;
3827       // Can be a loop invariant incoming value or the last scalar value to be
3828       // extracted from the vectorized loop.
3829       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3830       Value *lastIncomingValue =
3831           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3832       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3833     }
3834   }
3835 }
3836 
3837 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3838   // The basic block and loop containing the predicated instruction.
3839   auto *PredBB = PredInst->getParent();
3840   auto *VectorLoop = LI->getLoopFor(PredBB);
3841 
3842   // Initialize a worklist with the operands of the predicated instruction.
3843   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3844 
3845   // Holds instructions that we need to analyze again. An instruction may be
3846   // reanalyzed if we don't yet know if we can sink it or not.
3847   SmallVector<Instruction *, 8> InstsToReanalyze;
3848 
3849   // Returns true if a given use occurs in the predicated block. Phi nodes use
3850   // their operands in their corresponding predecessor blocks.
3851   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3852     auto *I = cast<Instruction>(U.getUser());
3853     BasicBlock *BB = I->getParent();
3854     if (auto *Phi = dyn_cast<PHINode>(I))
3855       BB = Phi->getIncomingBlock(
3856           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3857     return BB == PredBB;
3858   };
3859 
3860   // Iteratively sink the scalarized operands of the predicated instruction
3861   // into the block we created for it. When an instruction is sunk, it's
3862   // operands are then added to the worklist. The algorithm ends after one pass
3863   // through the worklist doesn't sink a single instruction.
3864   bool Changed;
3865   do {
3866     // Add the instructions that need to be reanalyzed to the worklist, and
3867     // reset the changed indicator.
3868     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3869     InstsToReanalyze.clear();
3870     Changed = false;
3871 
3872     while (!Worklist.empty()) {
3873       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3874 
3875       // We can't sink an instruction if it is a phi node, is already in the
3876       // predicated block, is not in the loop, or may have side effects.
3877       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3878           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3879         continue;
3880 
3881       // It's legal to sink the instruction if all its uses occur in the
3882       // predicated block. Otherwise, there's nothing to do yet, and we may
3883       // need to reanalyze the instruction.
3884       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3885         InstsToReanalyze.push_back(I);
3886         continue;
3887       }
3888 
3889       // Move the instruction to the beginning of the predicated block, and add
3890       // it's operands to the worklist.
3891       I->moveBefore(&*PredBB->getFirstInsertionPt());
3892       Worklist.insert(I->op_begin(), I->op_end());
3893 
3894       // The sinking may have enabled other instructions to be sunk, so we will
3895       // need to iterate.
3896       Changed = true;
3897     }
3898   } while (Changed);
3899 }
3900 
3901 void InnerLoopVectorizer::fixNonInductionPHIs() {
3902   for (PHINode *OrigPhi : OrigPHIsToFix) {
3903     PHINode *NewPhi =
3904         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3905     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3906 
3907     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3908         predecessors(OrigPhi->getParent()));
3909     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3910         predecessors(NewPhi->getParent()));
3911     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3912            "Scalar and Vector BB should have the same number of predecessors");
3913 
3914     // The insertion point in Builder may be invalidated by the time we get
3915     // here. Force the Builder insertion point to something valid so that we do
3916     // not run into issues during insertion point restore in
3917     // getOrCreateVectorValue calls below.
3918     Builder.SetInsertPoint(NewPhi);
3919 
3920     // The predecessor order is preserved and we can rely on mapping between
3921     // scalar and vector block predecessors.
3922     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3923       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3924 
3925       // When looking up the new scalar/vector values to fix up, use incoming
3926       // values from original phi.
3927       Value *ScIncV =
3928           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3929 
3930       // Scalar incoming value may need a broadcast
3931       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3932       NewPhi->addIncoming(NewIncV, NewPredBB);
3933     }
3934   }
3935 }
3936 
3937 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
3938                                               unsigned VF) {
3939   PHINode *P = cast<PHINode>(PN);
3940   if (EnableVPlanNativePath) {
3941     // Currently we enter here in the VPlan-native path for non-induction
3942     // PHIs where all control flow is uniform. We simply widen these PHIs.
3943     // Create a vector phi with no operands - the vector phi operands will be
3944     // set at the end of vector code generation.
3945     Type *VecTy =
3946         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3947     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3948     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3949     OrigPHIsToFix.push_back(P);
3950 
3951     return;
3952   }
3953 
3954   assert(PN->getParent() == OrigLoop->getHeader() &&
3955          "Non-header phis should have been handled elsewhere");
3956 
3957   // In order to support recurrences we need to be able to vectorize Phi nodes.
3958   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3959   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3960   // this value when we vectorize all of the instructions that use the PHI.
3961   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
3962     for (unsigned Part = 0; Part < UF; ++Part) {
3963       // This is phase one of vectorizing PHIs.
3964       Type *VecTy =
3965           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3966       Value *EntryPart = PHINode::Create(
3967           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
3968       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
3969     }
3970     return;
3971   }
3972 
3973   setDebugLocFromInst(Builder, P);
3974 
3975   // This PHINode must be an induction variable.
3976   // Make sure that we know about it.
3977   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
3978 
3979   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
3980   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
3981 
3982   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
3983   // which can be found from the original scalar operations.
3984   switch (II.getKind()) {
3985   case InductionDescriptor::IK_NoInduction:
3986     llvm_unreachable("Unknown induction");
3987   case InductionDescriptor::IK_IntInduction:
3988   case InductionDescriptor::IK_FpInduction:
3989     llvm_unreachable("Integer/fp induction is handled elsewhere.");
3990   case InductionDescriptor::IK_PtrInduction: {
3991     // Handle the pointer induction variable case.
3992     assert(P->getType()->isPointerTy() && "Unexpected type.");
3993     // This is the normalized GEP that starts counting at zero.
3994     Value *PtrInd = Induction;
3995     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
3996     // Determine the number of scalars we need to generate for each unroll
3997     // iteration. If the instruction is uniform, we only need to generate the
3998     // first lane. Otherwise, we generate all VF values.
3999     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4000     // These are the scalar results. Notice that we don't generate vector GEPs
4001     // because scalar GEPs result in better code.
4002     for (unsigned Part = 0; Part < UF; ++Part) {
4003       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4004         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4005         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4006         Value *SclrGep =
4007             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4008         SclrGep->setName("next.gep");
4009         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4010       }
4011     }
4012     return;
4013   }
4014   }
4015 }
4016 
4017 /// A helper function for checking whether an integer division-related
4018 /// instruction may divide by zero (in which case it must be predicated if
4019 /// executed conditionally in the scalar code).
4020 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4021 /// Non-zero divisors that are non compile-time constants will not be
4022 /// converted into multiplication, so we will still end up scalarizing
4023 /// the division, but can do so w/o predication.
4024 static bool mayDivideByZero(Instruction &I) {
4025   assert((I.getOpcode() == Instruction::UDiv ||
4026           I.getOpcode() == Instruction::SDiv ||
4027           I.getOpcode() == Instruction::URem ||
4028           I.getOpcode() == Instruction::SRem) &&
4029          "Unexpected instruction");
4030   Value *Divisor = I.getOperand(1);
4031   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4032   return !CInt || CInt->isZero();
4033 }
4034 
4035 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4036   switch (I.getOpcode()) {
4037   case Instruction::Br:
4038   case Instruction::PHI:
4039     llvm_unreachable("This instruction is handled by a different recipe.");
4040   case Instruction::GetElementPtr: {
4041     // Construct a vector GEP by widening the operands of the scalar GEP as
4042     // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4043     // results in a vector of pointers when at least one operand of the GEP
4044     // is vector-typed. Thus, to keep the representation compact, we only use
4045     // vector-typed operands for loop-varying values.
4046     auto *GEP = cast<GetElementPtrInst>(&I);
4047 
4048     if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
4049       // If we are vectorizing, but the GEP has only loop-invariant operands,
4050       // the GEP we build (by only using vector-typed operands for
4051       // loop-varying values) would be a scalar pointer. Thus, to ensure we
4052       // produce a vector of pointers, we need to either arbitrarily pick an
4053       // operand to broadcast, or broadcast a clone of the original GEP.
4054       // Here, we broadcast a clone of the original.
4055       //
4056       // TODO: If at some point we decide to scalarize instructions having
4057       //       loop-invariant operands, this special case will no longer be
4058       //       required. We would add the scalarization decision to
4059       //       collectLoopScalars() and teach getVectorValue() to broadcast
4060       //       the lane-zero scalar value.
4061       auto *Clone = Builder.Insert(GEP->clone());
4062       for (unsigned Part = 0; Part < UF; ++Part) {
4063         Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4064         VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
4065         addMetadata(EntryPart, GEP);
4066       }
4067     } else {
4068       // If the GEP has at least one loop-varying operand, we are sure to
4069       // produce a vector of pointers. But if we are only unrolling, we want
4070       // to produce a scalar GEP for each unroll part. Thus, the GEP we
4071       // produce with the code below will be scalar (if VF == 1) or vector
4072       // (otherwise). Note that for the unroll-only case, we still maintain
4073       // values in the vector mapping with initVector, as we do for other
4074       // instructions.
4075       for (unsigned Part = 0; Part < UF; ++Part) {
4076         // The pointer operand of the new GEP. If it's loop-invariant, we
4077         // won't broadcast it.
4078         auto *Ptr =
4079             OrigLoop->isLoopInvariant(GEP->getPointerOperand())
4080                 ? GEP->getPointerOperand()
4081                 : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4082 
4083         // Collect all the indices for the new GEP. If any index is
4084         // loop-invariant, we won't broadcast it.
4085         SmallVector<Value *, 4> Indices;
4086         for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
4087           if (OrigLoop->isLoopInvariant(U.get()))
4088             Indices.push_back(U.get());
4089           else
4090             Indices.push_back(getOrCreateVectorValue(U.get(), Part));
4091         }
4092 
4093         // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4094         // but it should be a vector, otherwise.
4095         auto *NewGEP =
4096             GEP->isInBounds()
4097                 ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4098                                             Indices)
4099                 : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4100         assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4101                "NewGEP is not a pointer vector");
4102         VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
4103         addMetadata(NewGEP, GEP);
4104       }
4105     }
4106 
4107     break;
4108   }
4109   case Instruction::UDiv:
4110   case Instruction::SDiv:
4111   case Instruction::SRem:
4112   case Instruction::URem:
4113   case Instruction::Add:
4114   case Instruction::FAdd:
4115   case Instruction::Sub:
4116   case Instruction::FSub:
4117   case Instruction::FNeg:
4118   case Instruction::Mul:
4119   case Instruction::FMul:
4120   case Instruction::FDiv:
4121   case Instruction::FRem:
4122   case Instruction::Shl:
4123   case Instruction::LShr:
4124   case Instruction::AShr:
4125   case Instruction::And:
4126   case Instruction::Or:
4127   case Instruction::Xor: {
4128     // Just widen unops and binops.
4129     setDebugLocFromInst(Builder, &I);
4130 
4131     for (unsigned Part = 0; Part < UF; ++Part) {
4132       SmallVector<Value *, 2> Ops;
4133       for (Value *Op : I.operands())
4134         Ops.push_back(getOrCreateVectorValue(Op, Part));
4135 
4136       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4137 
4138       if (auto *VecOp = dyn_cast<Instruction>(V))
4139         VecOp->copyIRFlags(&I);
4140 
4141       // Use this vector value for all users of the original instruction.
4142       VectorLoopValueMap.setVectorValue(&I, Part, V);
4143       addMetadata(V, &I);
4144     }
4145 
4146     break;
4147   }
4148   case Instruction::Select: {
4149     // Widen selects.
4150     // If the selector is loop invariant we can create a select
4151     // instruction with a scalar condition. Otherwise, use vector-select.
4152     auto *SE = PSE.getSE();
4153     bool InvariantCond =
4154         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4155     setDebugLocFromInst(Builder, &I);
4156 
4157     // The condition can be loop invariant  but still defined inside the
4158     // loop. This means that we can't just use the original 'cond' value.
4159     // We have to take the 'vectorized' value and pick the first lane.
4160     // Instcombine will make this a no-op.
4161 
4162     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4163 
4164     for (unsigned Part = 0; Part < UF; ++Part) {
4165       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4166       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4167       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4168       Value *Sel =
4169           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4170       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4171       addMetadata(Sel, &I);
4172     }
4173 
4174     break;
4175   }
4176 
4177   case Instruction::ICmp:
4178   case Instruction::FCmp: {
4179     // Widen compares. Generate vector compares.
4180     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4181     auto *Cmp = cast<CmpInst>(&I);
4182     setDebugLocFromInst(Builder, Cmp);
4183     for (unsigned Part = 0; Part < UF; ++Part) {
4184       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4185       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4186       Value *C = nullptr;
4187       if (FCmp) {
4188         // Propagate fast math flags.
4189         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4190         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4191         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4192       } else {
4193         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4194       }
4195       VectorLoopValueMap.setVectorValue(&I, Part, C);
4196       addMetadata(C, &I);
4197     }
4198 
4199     break;
4200   }
4201 
4202   case Instruction::ZExt:
4203   case Instruction::SExt:
4204   case Instruction::FPToUI:
4205   case Instruction::FPToSI:
4206   case Instruction::FPExt:
4207   case Instruction::PtrToInt:
4208   case Instruction::IntToPtr:
4209   case Instruction::SIToFP:
4210   case Instruction::UIToFP:
4211   case Instruction::Trunc:
4212   case Instruction::FPTrunc:
4213   case Instruction::BitCast: {
4214     auto *CI = cast<CastInst>(&I);
4215     setDebugLocFromInst(Builder, CI);
4216 
4217     /// Vectorize casts.
4218     Type *DestTy =
4219         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4220 
4221     for (unsigned Part = 0; Part < UF; ++Part) {
4222       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4223       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4224       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4225       addMetadata(Cast, &I);
4226     }
4227     break;
4228   }
4229 
4230   case Instruction::Call: {
4231     // Ignore dbg intrinsics.
4232     if (isa<DbgInfoIntrinsic>(I))
4233       break;
4234     setDebugLocFromInst(Builder, &I);
4235 
4236     Module *M = I.getParent()->getParent()->getParent();
4237     auto *CI = cast<CallInst>(&I);
4238 
4239     StringRef FnName = CI->getCalledFunction()->getName();
4240     Function *F = CI->getCalledFunction();
4241     Type *RetTy = ToVectorTy(CI->getType(), VF);
4242     SmallVector<Type *, 4> Tys;
4243     for (Value *ArgOperand : CI->arg_operands())
4244       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4245 
4246     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4247 
4248     // The flag shows whether we use Intrinsic or a usual Call for vectorized
4249     // version of the instruction.
4250     // Is it beneficial to perform intrinsic call compared to lib call?
4251     bool NeedToScalarize;
4252     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4253     bool UseVectorIntrinsic =
4254         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4255     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4256            "Instruction should be scalarized elsewhere.");
4257 
4258     for (unsigned Part = 0; Part < UF; ++Part) {
4259       SmallVector<Value *, 4> Args;
4260       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4261         Value *Arg = CI->getArgOperand(i);
4262         // Some intrinsics have a scalar argument - don't replace it with a
4263         // vector.
4264         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4265           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4266         Args.push_back(Arg);
4267       }
4268 
4269       Function *VectorF;
4270       if (UseVectorIntrinsic) {
4271         // Use vector version of the intrinsic.
4272         Type *TysForDecl[] = {CI->getType()};
4273         if (VF > 1)
4274           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4275         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4276       } else {
4277         // Use vector version of the library call.
4278         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4279         assert(!VFnName.empty() && "Vector function name is empty.");
4280         VectorF = M->getFunction(VFnName);
4281         if (!VectorF) {
4282           // Generate a declaration
4283           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4284           VectorF =
4285               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4286           VectorF->copyAttributesFrom(F);
4287         }
4288       }
4289       assert(VectorF && "Can't create vector function.");
4290 
4291       SmallVector<OperandBundleDef, 1> OpBundles;
4292       CI->getOperandBundlesAsDefs(OpBundles);
4293       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4294 
4295       if (isa<FPMathOperator>(V))
4296         V->copyFastMathFlags(CI);
4297 
4298       VectorLoopValueMap.setVectorValue(&I, Part, V);
4299       addMetadata(V, &I);
4300     }
4301 
4302     break;
4303   }
4304 
4305   default:
4306     // This instruction is not vectorized by simple widening.
4307     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4308     llvm_unreachable("Unhandled instruction!");
4309   } // end of switch.
4310 }
4311 
4312 void InnerLoopVectorizer::updateAnalysis() {
4313   // Forget the original basic block.
4314   PSE.getSE()->forgetLoop(OrigLoop);
4315 
4316   // DT is not kept up-to-date for outer loop vectorization
4317   if (EnableVPlanNativePath)
4318     return;
4319 
4320   // Update the dominator tree information.
4321   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4322          "Entry does not dominate exit.");
4323 
4324   DT->addNewBlock(LoopMiddleBlock,
4325                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4326   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4327   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4328   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4329   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4330 }
4331 
4332 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4333   // We should not collect Scalars more than once per VF. Right now, this
4334   // function is called from collectUniformsAndScalars(), which already does
4335   // this check. Collecting Scalars for VF=1 does not make any sense.
4336   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4337          "This function should not be visited twice for the same VF");
4338 
4339   SmallSetVector<Instruction *, 8> Worklist;
4340 
4341   // These sets are used to seed the analysis with pointers used by memory
4342   // accesses that will remain scalar.
4343   SmallSetVector<Instruction *, 8> ScalarPtrs;
4344   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4345 
4346   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4347   // The pointer operands of loads and stores will be scalar as long as the
4348   // memory access is not a gather or scatter operation. The value operand of a
4349   // store will remain scalar if the store is scalarized.
4350   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4351     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4352     assert(WideningDecision != CM_Unknown &&
4353            "Widening decision should be ready at this moment");
4354     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4355       if (Ptr == Store->getValueOperand())
4356         return WideningDecision == CM_Scalarize;
4357     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4358            "Ptr is neither a value or pointer operand");
4359     return WideningDecision != CM_GatherScatter;
4360   };
4361 
4362   // A helper that returns true if the given value is a bitcast or
4363   // getelementptr instruction contained in the loop.
4364   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4365     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4366             isa<GetElementPtrInst>(V)) &&
4367            !TheLoop->isLoopInvariant(V);
4368   };
4369 
4370   // A helper that evaluates a memory access's use of a pointer. If the use
4371   // will be a scalar use, and the pointer is only used by memory accesses, we
4372   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4373   // PossibleNonScalarPtrs.
4374   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4375     // We only care about bitcast and getelementptr instructions contained in
4376     // the loop.
4377     if (!isLoopVaryingBitCastOrGEP(Ptr))
4378       return;
4379 
4380     // If the pointer has already been identified as scalar (e.g., if it was
4381     // also identified as uniform), there's nothing to do.
4382     auto *I = cast<Instruction>(Ptr);
4383     if (Worklist.count(I))
4384       return;
4385 
4386     // If the use of the pointer will be a scalar use, and all users of the
4387     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4388     // place the pointer in PossibleNonScalarPtrs.
4389     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4390           return isa<LoadInst>(U) || isa<StoreInst>(U);
4391         }))
4392       ScalarPtrs.insert(I);
4393     else
4394       PossibleNonScalarPtrs.insert(I);
4395   };
4396 
4397   // We seed the scalars analysis with three classes of instructions: (1)
4398   // instructions marked uniform-after-vectorization, (2) bitcast and
4399   // getelementptr instructions used by memory accesses requiring a scalar use,
4400   // and (3) pointer induction variables and their update instructions (we
4401   // currently only scalarize these).
4402   //
4403   // (1) Add to the worklist all instructions that have been identified as
4404   // uniform-after-vectorization.
4405   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4406 
4407   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4408   // memory accesses requiring a scalar use. The pointer operands of loads and
4409   // stores will be scalar as long as the memory accesses is not a gather or
4410   // scatter operation. The value operand of a store will remain scalar if the
4411   // store is scalarized.
4412   for (auto *BB : TheLoop->blocks())
4413     for (auto &I : *BB) {
4414       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4415         evaluatePtrUse(Load, Load->getPointerOperand());
4416       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4417         evaluatePtrUse(Store, Store->getPointerOperand());
4418         evaluatePtrUse(Store, Store->getValueOperand());
4419       }
4420     }
4421   for (auto *I : ScalarPtrs)
4422     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4423       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4424       Worklist.insert(I);
4425     }
4426 
4427   // (3) Add to the worklist all pointer induction variables and their update
4428   // instructions.
4429   //
4430   // TODO: Once we are able to vectorize pointer induction variables we should
4431   //       no longer insert them into the worklist here.
4432   auto *Latch = TheLoop->getLoopLatch();
4433   for (auto &Induction : *Legal->getInductionVars()) {
4434     auto *Ind = Induction.first;
4435     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4436     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4437       continue;
4438     Worklist.insert(Ind);
4439     Worklist.insert(IndUpdate);
4440     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4441     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4442                       << "\n");
4443   }
4444 
4445   // Insert the forced scalars.
4446   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4447   // induction variable when the PHI user is scalarized.
4448   auto ForcedScalar = ForcedScalars.find(VF);
4449   if (ForcedScalar != ForcedScalars.end())
4450     for (auto *I : ForcedScalar->second)
4451       Worklist.insert(I);
4452 
4453   // Expand the worklist by looking through any bitcasts and getelementptr
4454   // instructions we've already identified as scalar. This is similar to the
4455   // expansion step in collectLoopUniforms(); however, here we're only
4456   // expanding to include additional bitcasts and getelementptr instructions.
4457   unsigned Idx = 0;
4458   while (Idx != Worklist.size()) {
4459     Instruction *Dst = Worklist[Idx++];
4460     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4461       continue;
4462     auto *Src = cast<Instruction>(Dst->getOperand(0));
4463     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4464           auto *J = cast<Instruction>(U);
4465           return !TheLoop->contains(J) || Worklist.count(J) ||
4466                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4467                   isScalarUse(J, Src));
4468         })) {
4469       Worklist.insert(Src);
4470       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4471     }
4472   }
4473 
4474   // An induction variable will remain scalar if all users of the induction
4475   // variable and induction variable update remain scalar.
4476   for (auto &Induction : *Legal->getInductionVars()) {
4477     auto *Ind = Induction.first;
4478     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4479 
4480     // We already considered pointer induction variables, so there's no reason
4481     // to look at their users again.
4482     //
4483     // TODO: Once we are able to vectorize pointer induction variables we
4484     //       should no longer skip over them here.
4485     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4486       continue;
4487 
4488     // Determine if all users of the induction variable are scalar after
4489     // vectorization.
4490     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4491       auto *I = cast<Instruction>(U);
4492       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4493     });
4494     if (!ScalarInd)
4495       continue;
4496 
4497     // Determine if all users of the induction variable update instruction are
4498     // scalar after vectorization.
4499     auto ScalarIndUpdate =
4500         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4501           auto *I = cast<Instruction>(U);
4502           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4503         });
4504     if (!ScalarIndUpdate)
4505       continue;
4506 
4507     // The induction variable and its update instruction will remain scalar.
4508     Worklist.insert(Ind);
4509     Worklist.insert(IndUpdate);
4510     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4511     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4512                       << "\n");
4513   }
4514 
4515   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4516 }
4517 
4518 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4519   if (!blockNeedsPredication(I->getParent()))
4520     return false;
4521   switch(I->getOpcode()) {
4522   default:
4523     break;
4524   case Instruction::Load:
4525   case Instruction::Store: {
4526     if (!Legal->isMaskRequired(I))
4527       return false;
4528     auto *Ptr = getLoadStorePointerOperand(I);
4529     auto *Ty = getMemInstValueType(I);
4530     // We have already decided how to vectorize this instruction, get that
4531     // result.
4532     if (VF > 1) {
4533       InstWidening WideningDecision = getWideningDecision(I, VF);
4534       assert(WideningDecision != CM_Unknown &&
4535              "Widening decision should be ready at this moment");
4536       return WideningDecision == CM_Scalarize;
4537     }
4538     return isa<LoadInst>(I) ?
4539         !(isLegalMaskedLoad(Ty, Ptr)  || isLegalMaskedGather(Ty))
4540       : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
4541   }
4542   case Instruction::UDiv:
4543   case Instruction::SDiv:
4544   case Instruction::SRem:
4545   case Instruction::URem:
4546     return mayDivideByZero(*I);
4547   }
4548   return false;
4549 }
4550 
4551 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4552                                                                unsigned VF) {
4553   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4554   assert(getWideningDecision(I, VF) == CM_Unknown &&
4555          "Decision should not be set yet.");
4556   auto *Group = getInterleavedAccessGroup(I);
4557   assert(Group && "Must have a group.");
4558 
4559   // If the instruction's allocated size doesn't equal it's type size, it
4560   // requires padding and will be scalarized.
4561   auto &DL = I->getModule()->getDataLayout();
4562   auto *ScalarTy = getMemInstValueType(I);
4563   if (hasIrregularType(ScalarTy, DL, VF))
4564     return false;
4565 
4566   // Check if masking is required.
4567   // A Group may need masking for one of two reasons: it resides in a block that
4568   // needs predication, or it was decided to use masking to deal with gaps.
4569   bool PredicatedAccessRequiresMasking =
4570       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4571   bool AccessWithGapsRequiresMasking =
4572       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4573   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4574     return true;
4575 
4576   // If masked interleaving is required, we expect that the user/target had
4577   // enabled it, because otherwise it either wouldn't have been created or
4578   // it should have been invalidated by the CostModel.
4579   assert(useMaskedInterleavedAccesses(TTI) &&
4580          "Masked interleave-groups for predicated accesses are not enabled.");
4581 
4582   auto *Ty = getMemInstValueType(I);
4583   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
4584                           : TTI.isLegalMaskedStore(Ty);
4585 }
4586 
4587 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4588                                                                unsigned VF) {
4589   // Get and ensure we have a valid memory instruction.
4590   LoadInst *LI = dyn_cast<LoadInst>(I);
4591   StoreInst *SI = dyn_cast<StoreInst>(I);
4592   assert((LI || SI) && "Invalid memory instruction");
4593 
4594   auto *Ptr = getLoadStorePointerOperand(I);
4595 
4596   // In order to be widened, the pointer should be consecutive, first of all.
4597   if (!Legal->isConsecutivePtr(Ptr))
4598     return false;
4599 
4600   // If the instruction is a store located in a predicated block, it will be
4601   // scalarized.
4602   if (isScalarWithPredication(I))
4603     return false;
4604 
4605   // If the instruction's allocated size doesn't equal it's type size, it
4606   // requires padding and will be scalarized.
4607   auto &DL = I->getModule()->getDataLayout();
4608   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4609   if (hasIrregularType(ScalarTy, DL, VF))
4610     return false;
4611 
4612   return true;
4613 }
4614 
4615 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4616   // We should not collect Uniforms more than once per VF. Right now,
4617   // this function is called from collectUniformsAndScalars(), which
4618   // already does this check. Collecting Uniforms for VF=1 does not make any
4619   // sense.
4620 
4621   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4622          "This function should not be visited twice for the same VF");
4623 
4624   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4625   // not analyze again.  Uniforms.count(VF) will return 1.
4626   Uniforms[VF].clear();
4627 
4628   // We now know that the loop is vectorizable!
4629   // Collect instructions inside the loop that will remain uniform after
4630   // vectorization.
4631 
4632   // Global values, params and instructions outside of current loop are out of
4633   // scope.
4634   auto isOutOfScope = [&](Value *V) -> bool {
4635     Instruction *I = dyn_cast<Instruction>(V);
4636     return (!I || !TheLoop->contains(I));
4637   };
4638 
4639   SetVector<Instruction *> Worklist;
4640   BasicBlock *Latch = TheLoop->getLoopLatch();
4641 
4642   // Start with the conditional branch. If the branch condition is an
4643   // instruction contained in the loop that is only used by the branch, it is
4644   // uniform.
4645   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4646   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
4647     Worklist.insert(Cmp);
4648     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
4649   }
4650 
4651   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4652   // are pointers that are treated like consecutive pointers during
4653   // vectorization. The pointer operands of interleaved accesses are an
4654   // example.
4655   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4656 
4657   // Holds pointer operands of instructions that are possibly non-uniform.
4658   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4659 
4660   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4661     InstWidening WideningDecision = getWideningDecision(I, VF);
4662     assert(WideningDecision != CM_Unknown &&
4663            "Widening decision should be ready at this moment");
4664 
4665     return (WideningDecision == CM_Widen ||
4666             WideningDecision == CM_Widen_Reverse ||
4667             WideningDecision == CM_Interleave);
4668   };
4669   // Iterate over the instructions in the loop, and collect all
4670   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4671   // that a consecutive-like pointer operand will be scalarized, we collect it
4672   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4673   // getelementptr instruction can be used by both vectorized and scalarized
4674   // memory instructions. For example, if a loop loads and stores from the same
4675   // location, but the store is conditional, the store will be scalarized, and
4676   // the getelementptr won't remain uniform.
4677   for (auto *BB : TheLoop->blocks())
4678     for (auto &I : *BB) {
4679       // If there's no pointer operand, there's nothing to do.
4680       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4681       if (!Ptr)
4682         continue;
4683 
4684       // True if all users of Ptr are memory accesses that have Ptr as their
4685       // pointer operand.
4686       auto UsersAreMemAccesses =
4687           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4688             return getLoadStorePointerOperand(U) == Ptr;
4689           });
4690 
4691       // Ensure the memory instruction will not be scalarized or used by
4692       // gather/scatter, making its pointer operand non-uniform. If the pointer
4693       // operand is used by any instruction other than a memory access, we
4694       // conservatively assume the pointer operand may be non-uniform.
4695       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4696         PossibleNonUniformPtrs.insert(Ptr);
4697 
4698       // If the memory instruction will be vectorized and its pointer operand
4699       // is consecutive-like, or interleaving - the pointer operand should
4700       // remain uniform.
4701       else
4702         ConsecutiveLikePtrs.insert(Ptr);
4703     }
4704 
4705   // Add to the Worklist all consecutive and consecutive-like pointers that
4706   // aren't also identified as possibly non-uniform.
4707   for (auto *V : ConsecutiveLikePtrs)
4708     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
4709       LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
4710       Worklist.insert(V);
4711     }
4712 
4713   // Expand Worklist in topological order: whenever a new instruction
4714   // is added , its users should be already inside Worklist.  It ensures
4715   // a uniform instruction will only be used by uniform instructions.
4716   unsigned idx = 0;
4717   while (idx != Worklist.size()) {
4718     Instruction *I = Worklist[idx++];
4719 
4720     for (auto OV : I->operand_values()) {
4721       // isOutOfScope operands cannot be uniform instructions.
4722       if (isOutOfScope(OV))
4723         continue;
4724       // First order recurrence Phi's should typically be considered
4725       // non-uniform.
4726       auto *OP = dyn_cast<PHINode>(OV);
4727       if (OP && Legal->isFirstOrderRecurrence(OP))
4728         continue;
4729       // If all the users of the operand are uniform, then add the
4730       // operand into the uniform worklist.
4731       auto *OI = cast<Instruction>(OV);
4732       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4733             auto *J = cast<Instruction>(U);
4734             return Worklist.count(J) ||
4735                    (OI == getLoadStorePointerOperand(J) &&
4736                     isUniformDecision(J, VF));
4737           })) {
4738         Worklist.insert(OI);
4739         LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
4740       }
4741     }
4742   }
4743 
4744   // Returns true if Ptr is the pointer operand of a memory access instruction
4745   // I, and I is known to not require scalarization.
4746   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4747     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4748   };
4749 
4750   // For an instruction to be added into Worklist above, all its users inside
4751   // the loop should also be in Worklist. However, this condition cannot be
4752   // true for phi nodes that form a cyclic dependence. We must process phi
4753   // nodes separately. An induction variable will remain uniform if all users
4754   // of the induction variable and induction variable update remain uniform.
4755   // The code below handles both pointer and non-pointer induction variables.
4756   for (auto &Induction : *Legal->getInductionVars()) {
4757     auto *Ind = Induction.first;
4758     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4759 
4760     // Determine if all users of the induction variable are uniform after
4761     // vectorization.
4762     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4763       auto *I = cast<Instruction>(U);
4764       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4765              isVectorizedMemAccessUse(I, Ind);
4766     });
4767     if (!UniformInd)
4768       continue;
4769 
4770     // Determine if all users of the induction variable update instruction are
4771     // uniform after vectorization.
4772     auto UniformIndUpdate =
4773         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4774           auto *I = cast<Instruction>(U);
4775           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4776                  isVectorizedMemAccessUse(I, IndUpdate);
4777         });
4778     if (!UniformIndUpdate)
4779       continue;
4780 
4781     // The induction variable and its update instruction will remain uniform.
4782     Worklist.insert(Ind);
4783     Worklist.insert(IndUpdate);
4784     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
4785     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
4786                       << "\n");
4787   }
4788 
4789   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4790 }
4791 
4792 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4793   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4794 
4795   if (Legal->getRuntimePointerChecking()->Need) {
4796     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4797         "runtime pointer checks needed. Enable vectorization of this "
4798         "loop with '#pragma clang loop vectorize(enable)' when "
4799         "compiling with -Os/-Oz",
4800         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4801     return true;
4802   }
4803 
4804   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4805     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4806         "runtime SCEV checks needed. Enable vectorization of this "
4807         "loop with '#pragma clang loop vectorize(enable)' when "
4808         "compiling with -Os/-Oz",
4809         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4810     return true;
4811   }
4812 
4813   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4814   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4815     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4816         "runtime stride == 1 checks needed. Enable vectorization of "
4817         "this loop with '#pragma clang loop vectorize(enable)' when "
4818         "compiling with -Os/-Oz",
4819         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4820     return true;
4821   }
4822 
4823   return false;
4824 }
4825 
4826 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4827   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4828     // TODO: It may by useful to do since it's still likely to be dynamically
4829     // uniform if the target can skip.
4830     reportVectorizationFailure(
4831         "Not inserting runtime ptr check for divergent target",
4832         "runtime pointer checks needed. Not enabled for divergent target",
4833         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4834     return None;
4835   }
4836 
4837   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4838   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4839   if (TC == 1) {
4840     reportVectorizationFailure("Single iteration (non) loop",
4841         "loop trip count is one, irrelevant for vectorization",
4842         "SingleIterationLoop", ORE, TheLoop);
4843     return None;
4844   }
4845 
4846   switch (ScalarEpilogueStatus) {
4847   case CM_ScalarEpilogueAllowed:
4848     return computeFeasibleMaxVF(TC);
4849   case CM_ScalarEpilogueNotNeededUsePredicate:
4850     LLVM_DEBUG(
4851         dbgs() << "LV: vector predicate hint/switch found.\n"
4852                << "LV: Not allowing scalar epilogue, creating predicated "
4853                << "vector loop.\n");
4854     break;
4855   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4856     // fallthrough as a special case of OptForSize
4857   case CM_ScalarEpilogueNotAllowedOptSize:
4858     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4859       LLVM_DEBUG(
4860           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4861     else
4862       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4863                         << "count.\n");
4864 
4865     // Bail if runtime checks are required, which are not good when optimising
4866     // for size.
4867     if (runtimeChecksRequired())
4868       return None;
4869     break;
4870   }
4871 
4872   // Now try the tail folding
4873 
4874   // Invalidate interleave groups that require an epilogue if we can't mask
4875   // the interleave-group.
4876   if (!useMaskedInterleavedAccesses(TTI))
4877     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4878 
4879   unsigned MaxVF = computeFeasibleMaxVF(TC);
4880   if (TC > 0 && TC % MaxVF == 0) {
4881     // Accept MaxVF if we do not have a tail.
4882     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4883     return MaxVF;
4884   }
4885 
4886   // If we don't know the precise trip count, or if the trip count that we
4887   // found modulo the vectorization factor is not zero, try to fold the tail
4888   // by masking.
4889   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4890   if (Legal->prepareToFoldTailByMasking()) {
4891     FoldTailByMasking = true;
4892     return MaxVF;
4893   }
4894 
4895   if (TC == 0) {
4896     reportVectorizationFailure(
4897         "Unable to calculate the loop count due to complex control flow",
4898         "unable to calculate the loop count due to complex control flow",
4899         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4900     return None;
4901   }
4902 
4903   reportVectorizationFailure(
4904       "Cannot optimize for size and vectorize at the same time.",
4905       "cannot optimize for size and vectorize at the same time. "
4906       "Enable vectorization of this loop with '#pragma clang loop "
4907       "vectorize(enable)' when compiling with -Os/-Oz",
4908       "NoTailLoopWithOptForSize", ORE, TheLoop);
4909   return None;
4910 }
4911 
4912 unsigned
4913 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4914   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4915   unsigned SmallestType, WidestType;
4916   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4917   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4918 
4919   // Get the maximum safe dependence distance in bits computed by LAA.
4920   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4921   // the memory accesses that is most restrictive (involved in the smallest
4922   // dependence distance).
4923   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4924 
4925   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4926 
4927   unsigned MaxVectorSize = WidestRegister / WidestType;
4928 
4929   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4930                     << " / " << WidestType << " bits.\n");
4931   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4932                     << WidestRegister << " bits.\n");
4933 
4934   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4935                                  " into one vector!");
4936   if (MaxVectorSize == 0) {
4937     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4938     MaxVectorSize = 1;
4939     return MaxVectorSize;
4940   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4941              isPowerOf2_32(ConstTripCount)) {
4942     // We need to clamp the VF to be the ConstTripCount. There is no point in
4943     // choosing a higher viable VF as done in the loop below.
4944     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4945                       << ConstTripCount << "\n");
4946     MaxVectorSize = ConstTripCount;
4947     return MaxVectorSize;
4948   }
4949 
4950   unsigned MaxVF = MaxVectorSize;
4951   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
4952       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
4953     // Collect all viable vectorization factors larger than the default MaxVF
4954     // (i.e. MaxVectorSize).
4955     SmallVector<unsigned, 8> VFs;
4956     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
4957     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
4958       VFs.push_back(VS);
4959 
4960     // For each VF calculate its register usage.
4961     auto RUs = calculateRegisterUsage(VFs);
4962 
4963     // Select the largest VF which doesn't require more registers than existing
4964     // ones.
4965     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
4966     for (int i = RUs.size() - 1; i >= 0; --i) {
4967       if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
4968         MaxVF = VFs[i];
4969         break;
4970       }
4971     }
4972     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
4973       if (MaxVF < MinVF) {
4974         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4975                           << ") with target's minimum: " << MinVF << '\n');
4976         MaxVF = MinVF;
4977       }
4978     }
4979   }
4980   return MaxVF;
4981 }
4982 
4983 VectorizationFactor
4984 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
4985   float Cost = expectedCost(1).first;
4986   const float ScalarCost = Cost;
4987   unsigned Width = 1;
4988   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
4989 
4990   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
4991   if (ForceVectorization && MaxVF > 1) {
4992     // Ignore scalar width, because the user explicitly wants vectorization.
4993     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4994     // evaluation.
4995     Cost = std::numeric_limits<float>::max();
4996   }
4997 
4998   for (unsigned i = 2; i <= MaxVF; i *= 2) {
4999     // Notice that the vector loop needs to be executed less times, so
5000     // we need to divide the cost of the vector loops by the width of
5001     // the vector elements.
5002     VectorizationCostTy C = expectedCost(i);
5003     float VectorCost = C.first / (float)i;
5004     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5005                       << " costs: " << (int)VectorCost << ".\n");
5006     if (!C.second && !ForceVectorization) {
5007       LLVM_DEBUG(
5008           dbgs() << "LV: Not considering vector loop of width " << i
5009                  << " because it will not generate any vector instructions.\n");
5010       continue;
5011     }
5012     if (VectorCost < Cost) {
5013       Cost = VectorCost;
5014       Width = i;
5015     }
5016   }
5017 
5018   if (!EnableCondStoresVectorization && NumPredStores) {
5019     reportVectorizationFailure("There are conditional stores.",
5020         "store that is conditionally executed prevents vectorization",
5021         "ConditionalStore", ORE, TheLoop);
5022     Width = 1;
5023     Cost = ScalarCost;
5024   }
5025 
5026   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5027              << "LV: Vectorization seems to be not beneficial, "
5028              << "but was forced by a user.\n");
5029   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5030   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5031   return Factor;
5032 }
5033 
5034 std::pair<unsigned, unsigned>
5035 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5036   unsigned MinWidth = -1U;
5037   unsigned MaxWidth = 8;
5038   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5039 
5040   // For each block.
5041   for (BasicBlock *BB : TheLoop->blocks()) {
5042     // For each instruction in the loop.
5043     for (Instruction &I : BB->instructionsWithoutDebug()) {
5044       Type *T = I.getType();
5045 
5046       // Skip ignored values.
5047       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5048         continue;
5049 
5050       // Only examine Loads, Stores and PHINodes.
5051       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5052         continue;
5053 
5054       // Examine PHI nodes that are reduction variables. Update the type to
5055       // account for the recurrence type.
5056       if (auto *PN = dyn_cast<PHINode>(&I)) {
5057         if (!Legal->isReductionVariable(PN))
5058           continue;
5059         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5060         T = RdxDesc.getRecurrenceType();
5061       }
5062 
5063       // Examine the stored values.
5064       if (auto *ST = dyn_cast<StoreInst>(&I))
5065         T = ST->getValueOperand()->getType();
5066 
5067       // Ignore loaded pointer types and stored pointer types that are not
5068       // vectorizable.
5069       //
5070       // FIXME: The check here attempts to predict whether a load or store will
5071       //        be vectorized. We only know this for certain after a VF has
5072       //        been selected. Here, we assume that if an access can be
5073       //        vectorized, it will be. We should also look at extending this
5074       //        optimization to non-pointer types.
5075       //
5076       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5077           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5078         continue;
5079 
5080       MinWidth = std::min(MinWidth,
5081                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5082       MaxWidth = std::max(MaxWidth,
5083                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5084     }
5085   }
5086 
5087   return {MinWidth, MaxWidth};
5088 }
5089 
5090 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5091                                                            unsigned LoopCost) {
5092   // -- The interleave heuristics --
5093   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5094   // There are many micro-architectural considerations that we can't predict
5095   // at this level. For example, frontend pressure (on decode or fetch) due to
5096   // code size, or the number and capabilities of the execution ports.
5097   //
5098   // We use the following heuristics to select the interleave count:
5099   // 1. If the code has reductions, then we interleave to break the cross
5100   // iteration dependency.
5101   // 2. If the loop is really small, then we interleave to reduce the loop
5102   // overhead.
5103   // 3. We don't interleave if we think that we will spill registers to memory
5104   // due to the increased register pressure.
5105 
5106   if (!isScalarEpilogueAllowed())
5107     return 1;
5108 
5109   // We used the distance for the interleave count.
5110   if (Legal->getMaxSafeDepDistBytes() != -1U)
5111     return 1;
5112 
5113   // Do not interleave loops with a relatively small trip count.
5114   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5115   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
5116     return 1;
5117 
5118   unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
5119   LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5120                     << " registers\n");
5121 
5122   if (VF == 1) {
5123     if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5124       TargetNumRegisters = ForceTargetNumScalarRegs;
5125   } else {
5126     if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5127       TargetNumRegisters = ForceTargetNumVectorRegs;
5128   }
5129 
5130   RegisterUsage R = calculateRegisterUsage({VF})[0];
5131   // We divide by these constants so assume that we have at least one
5132   // instruction that uses at least one register.
5133   R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
5134 
5135   // We calculate the interleave count using the following formula.
5136   // Subtract the number of loop invariants from the number of available
5137   // registers. These registers are used by all of the interleaved instances.
5138   // Next, divide the remaining registers by the number of registers that is
5139   // required by the loop, in order to estimate how many parallel instances
5140   // fit without causing spills. All of this is rounded down if necessary to be
5141   // a power of two. We want power of two interleave count to simplify any
5142   // addressing operations or alignment considerations.
5143   // We also want power of two interleave counts to ensure that the induction
5144   // variable of the vector loop wraps to zero, when tail is folded by masking;
5145   // this currently happens when OptForSize, in which case IC is set to 1 above.
5146   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
5147                               R.MaxLocalUsers);
5148 
5149   // Don't count the induction variable as interleaved.
5150   if (EnableIndVarRegisterHeur)
5151     IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
5152                        std::max(1U, (R.MaxLocalUsers - 1)));
5153 
5154   // Clamp the interleave ranges to reasonable counts.
5155   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5156 
5157   // Check if the user has overridden the max.
5158   if (VF == 1) {
5159     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5160       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5161   } else {
5162     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5163       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5164   }
5165 
5166   // If the trip count is constant, limit the interleave count to be less than
5167   // the trip count divided by VF.
5168   if (TC > 0) {
5169     assert(TC >= VF && "VF exceeds trip count?");
5170     if ((TC / VF) < MaxInterleaveCount)
5171       MaxInterleaveCount = (TC / VF);
5172   }
5173 
5174   // If we did not calculate the cost for VF (because the user selected the VF)
5175   // then we calculate the cost of VF here.
5176   if (LoopCost == 0)
5177     LoopCost = expectedCost(VF).first;
5178 
5179   assert(LoopCost && "Non-zero loop cost expected");
5180 
5181   // Clamp the calculated IC to be between the 1 and the max interleave count
5182   // that the target and trip count allows.
5183   if (IC > MaxInterleaveCount)
5184     IC = MaxInterleaveCount;
5185   else if (IC < 1)
5186     IC = 1;
5187 
5188   // Interleave if we vectorized this loop and there is a reduction that could
5189   // benefit from interleaving.
5190   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5191     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5192     return IC;
5193   }
5194 
5195   // Note that if we've already vectorized the loop we will have done the
5196   // runtime check and so interleaving won't require further checks.
5197   bool InterleavingRequiresRuntimePointerCheck =
5198       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5199 
5200   // We want to interleave small loops in order to reduce the loop overhead and
5201   // potentially expose ILP opportunities.
5202   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5203   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5204     // We assume that the cost overhead is 1 and we use the cost model
5205     // to estimate the cost of the loop and interleave until the cost of the
5206     // loop overhead is about 5% of the cost of the loop.
5207     unsigned SmallIC =
5208         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5209 
5210     // Interleave until store/load ports (estimated by max interleave count) are
5211     // saturated.
5212     unsigned NumStores = Legal->getNumStores();
5213     unsigned NumLoads = Legal->getNumLoads();
5214     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5215     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5216 
5217     // If we have a scalar reduction (vector reductions are already dealt with
5218     // by this point), we can increase the critical path length if the loop
5219     // we're interleaving is inside another loop. Limit, by default to 2, so the
5220     // critical path only gets increased by one reduction operation.
5221     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5222       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5223       SmallIC = std::min(SmallIC, F);
5224       StoresIC = std::min(StoresIC, F);
5225       LoadsIC = std::min(LoadsIC, F);
5226     }
5227 
5228     if (EnableLoadStoreRuntimeInterleave &&
5229         std::max(StoresIC, LoadsIC) > SmallIC) {
5230       LLVM_DEBUG(
5231           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5232       return std::max(StoresIC, LoadsIC);
5233     }
5234 
5235     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5236     return SmallIC;
5237   }
5238 
5239   // Interleave if this is a large loop (small loops are already dealt with by
5240   // this point) that could benefit from interleaving.
5241   bool HasReductions = !Legal->getReductionVars()->empty();
5242   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5243     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5244     return IC;
5245   }
5246 
5247   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5248   return 1;
5249 }
5250 
5251 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5252 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5253   // This function calculates the register usage by measuring the highest number
5254   // of values that are alive at a single location. Obviously, this is a very
5255   // rough estimation. We scan the loop in a topological order in order and
5256   // assign a number to each instruction. We use RPO to ensure that defs are
5257   // met before their users. We assume that each instruction that has in-loop
5258   // users starts an interval. We record every time that an in-loop value is
5259   // used, so we have a list of the first and last occurrences of each
5260   // instruction. Next, we transpose this data structure into a multi map that
5261   // holds the list of intervals that *end* at a specific location. This multi
5262   // map allows us to perform a linear search. We scan the instructions linearly
5263   // and record each time that a new interval starts, by placing it in a set.
5264   // If we find this value in the multi-map then we remove it from the set.
5265   // The max register usage is the maximum size of the set.
5266   // We also search for instructions that are defined outside the loop, but are
5267   // used inside the loop. We need this number separately from the max-interval
5268   // usage number because when we unroll, loop-invariant values do not take
5269   // more register.
5270   LoopBlocksDFS DFS(TheLoop);
5271   DFS.perform(LI);
5272 
5273   RegisterUsage RU;
5274 
5275   // Each 'key' in the map opens a new interval. The values
5276   // of the map are the index of the 'last seen' usage of the
5277   // instruction that is the key.
5278   using IntervalMap = DenseMap<Instruction *, unsigned>;
5279 
5280   // Maps instruction to its index.
5281   SmallVector<Instruction *, 64> IdxToInstr;
5282   // Marks the end of each interval.
5283   IntervalMap EndPoint;
5284   // Saves the list of instruction indices that are used in the loop.
5285   SmallPtrSet<Instruction *, 8> Ends;
5286   // Saves the list of values that are used in the loop but are
5287   // defined outside the loop, such as arguments and constants.
5288   SmallPtrSet<Value *, 8> LoopInvariants;
5289 
5290   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5291     for (Instruction &I : BB->instructionsWithoutDebug()) {
5292       IdxToInstr.push_back(&I);
5293 
5294       // Save the end location of each USE.
5295       for (Value *U : I.operands()) {
5296         auto *Instr = dyn_cast<Instruction>(U);
5297 
5298         // Ignore non-instruction values such as arguments, constants, etc.
5299         if (!Instr)
5300           continue;
5301 
5302         // If this instruction is outside the loop then record it and continue.
5303         if (!TheLoop->contains(Instr)) {
5304           LoopInvariants.insert(Instr);
5305           continue;
5306         }
5307 
5308         // Overwrite previous end points.
5309         EndPoint[Instr] = IdxToInstr.size();
5310         Ends.insert(Instr);
5311       }
5312     }
5313   }
5314 
5315   // Saves the list of intervals that end with the index in 'key'.
5316   using InstrList = SmallVector<Instruction *, 2>;
5317   DenseMap<unsigned, InstrList> TransposeEnds;
5318 
5319   // Transpose the EndPoints to a list of values that end at each index.
5320   for (auto &Interval : EndPoint)
5321     TransposeEnds[Interval.second].push_back(Interval.first);
5322 
5323   SmallPtrSet<Instruction *, 8> OpenIntervals;
5324 
5325   // Get the size of the widest register.
5326   unsigned MaxSafeDepDist = -1U;
5327   if (Legal->getMaxSafeDepDistBytes() != -1U)
5328     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5329   unsigned WidestRegister =
5330       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5331   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5332 
5333   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5334   SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
5335 
5336   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5337 
5338   // A lambda that gets the register usage for the given type and VF.
5339   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5340     if (Ty->isTokenTy())
5341       return 0U;
5342     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5343     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5344   };
5345 
5346   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5347     Instruction *I = IdxToInstr[i];
5348 
5349     // Remove all of the instructions that end at this location.
5350     InstrList &List = TransposeEnds[i];
5351     for (Instruction *ToRemove : List)
5352       OpenIntervals.erase(ToRemove);
5353 
5354     // Ignore instructions that are never used within the loop.
5355     if (Ends.find(I) == Ends.end())
5356       continue;
5357 
5358     // Skip ignored values.
5359     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5360       continue;
5361 
5362     // For each VF find the maximum usage of registers.
5363     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5364       if (VFs[j] == 1) {
5365         MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
5366         continue;
5367       }
5368       collectUniformsAndScalars(VFs[j]);
5369       // Count the number of live intervals.
5370       unsigned RegUsage = 0;
5371       for (auto Inst : OpenIntervals) {
5372         // Skip ignored values for VF > 1.
5373         if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
5374             isScalarAfterVectorization(Inst, VFs[j]))
5375           continue;
5376         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
5377       }
5378       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
5379     }
5380 
5381     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5382                       << OpenIntervals.size() << '\n');
5383 
5384     // Add the current instruction to the list of open intervals.
5385     OpenIntervals.insert(I);
5386   }
5387 
5388   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5389     unsigned Invariant = 0;
5390     if (VFs[i] == 1)
5391       Invariant = LoopInvariants.size();
5392     else {
5393       for (auto Inst : LoopInvariants)
5394         Invariant += GetRegUsage(Inst->getType(), VFs[i]);
5395     }
5396 
5397     LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
5398     LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
5399     LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
5400                       << '\n');
5401 
5402     RU.LoopInvariantRegs = Invariant;
5403     RU.MaxLocalUsers = MaxUsages[i];
5404     RUs[i] = RU;
5405   }
5406 
5407   return RUs;
5408 }
5409 
5410 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5411   // TODO: Cost model for emulated masked load/store is completely
5412   // broken. This hack guides the cost model to use an artificially
5413   // high enough value to practically disable vectorization with such
5414   // operations, except where previously deployed legality hack allowed
5415   // using very low cost values. This is to avoid regressions coming simply
5416   // from moving "masked load/store" check from legality to cost model.
5417   // Masked Load/Gather emulation was previously never allowed.
5418   // Limited number of Masked Store/Scatter emulation was allowed.
5419   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5420   return isa<LoadInst>(I) ||
5421          (isa<StoreInst>(I) &&
5422           NumPredStores > NumberOfStoresToPredicate);
5423 }
5424 
5425 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5426   // If we aren't vectorizing the loop, or if we've already collected the
5427   // instructions to scalarize, there's nothing to do. Collection may already
5428   // have occurred if we have a user-selected VF and are now computing the
5429   // expected cost for interleaving.
5430   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5431     return;
5432 
5433   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5434   // not profitable to scalarize any instructions, the presence of VF in the
5435   // map will indicate that we've analyzed it already.
5436   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5437 
5438   // Find all the instructions that are scalar with predication in the loop and
5439   // determine if it would be better to not if-convert the blocks they are in.
5440   // If so, we also record the instructions to scalarize.
5441   for (BasicBlock *BB : TheLoop->blocks()) {
5442     if (!blockNeedsPredication(BB))
5443       continue;
5444     for (Instruction &I : *BB)
5445       if (isScalarWithPredication(&I)) {
5446         ScalarCostsTy ScalarCosts;
5447         // Do not apply discount logic if hacked cost is needed
5448         // for emulated masked memrefs.
5449         if (!useEmulatedMaskMemRefHack(&I) &&
5450             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5451           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5452         // Remember that BB will remain after vectorization.
5453         PredicatedBBsAfterVectorization.insert(BB);
5454       }
5455   }
5456 }
5457 
5458 int LoopVectorizationCostModel::computePredInstDiscount(
5459     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5460     unsigned VF) {
5461   assert(!isUniformAfterVectorization(PredInst, VF) &&
5462          "Instruction marked uniform-after-vectorization will be predicated");
5463 
5464   // Initialize the discount to zero, meaning that the scalar version and the
5465   // vector version cost the same.
5466   int Discount = 0;
5467 
5468   // Holds instructions to analyze. The instructions we visit are mapped in
5469   // ScalarCosts. Those instructions are the ones that would be scalarized if
5470   // we find that the scalar version costs less.
5471   SmallVector<Instruction *, 8> Worklist;
5472 
5473   // Returns true if the given instruction can be scalarized.
5474   auto canBeScalarized = [&](Instruction *I) -> bool {
5475     // We only attempt to scalarize instructions forming a single-use chain
5476     // from the original predicated block that would otherwise be vectorized.
5477     // Although not strictly necessary, we give up on instructions we know will
5478     // already be scalar to avoid traversing chains that are unlikely to be
5479     // beneficial.
5480     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5481         isScalarAfterVectorization(I, VF))
5482       return false;
5483 
5484     // If the instruction is scalar with predication, it will be analyzed
5485     // separately. We ignore it within the context of PredInst.
5486     if (isScalarWithPredication(I))
5487       return false;
5488 
5489     // If any of the instruction's operands are uniform after vectorization,
5490     // the instruction cannot be scalarized. This prevents, for example, a
5491     // masked load from being scalarized.
5492     //
5493     // We assume we will only emit a value for lane zero of an instruction
5494     // marked uniform after vectorization, rather than VF identical values.
5495     // Thus, if we scalarize an instruction that uses a uniform, we would
5496     // create uses of values corresponding to the lanes we aren't emitting code
5497     // for. This behavior can be changed by allowing getScalarValue to clone
5498     // the lane zero values for uniforms rather than asserting.
5499     for (Use &U : I->operands())
5500       if (auto *J = dyn_cast<Instruction>(U.get()))
5501         if (isUniformAfterVectorization(J, VF))
5502           return false;
5503 
5504     // Otherwise, we can scalarize the instruction.
5505     return true;
5506   };
5507 
5508   // Compute the expected cost discount from scalarizing the entire expression
5509   // feeding the predicated instruction. We currently only consider expressions
5510   // that are single-use instruction chains.
5511   Worklist.push_back(PredInst);
5512   while (!Worklist.empty()) {
5513     Instruction *I = Worklist.pop_back_val();
5514 
5515     // If we've already analyzed the instruction, there's nothing to do.
5516     if (ScalarCosts.find(I) != ScalarCosts.end())
5517       continue;
5518 
5519     // Compute the cost of the vector instruction. Note that this cost already
5520     // includes the scalarization overhead of the predicated instruction.
5521     unsigned VectorCost = getInstructionCost(I, VF).first;
5522 
5523     // Compute the cost of the scalarized instruction. This cost is the cost of
5524     // the instruction as if it wasn't if-converted and instead remained in the
5525     // predicated block. We will scale this cost by block probability after
5526     // computing the scalarization overhead.
5527     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5528 
5529     // Compute the scalarization overhead of needed insertelement instructions
5530     // and phi nodes.
5531     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5532       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5533                                                  true, false);
5534       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5535     }
5536 
5537     // Compute the scalarization overhead of needed extractelement
5538     // instructions. For each of the instruction's operands, if the operand can
5539     // be scalarized, add it to the worklist; otherwise, account for the
5540     // overhead.
5541     for (Use &U : I->operands())
5542       if (auto *J = dyn_cast<Instruction>(U.get())) {
5543         assert(VectorType::isValidElementType(J->getType()) &&
5544                "Instruction has non-scalar type");
5545         if (canBeScalarized(J))
5546           Worklist.push_back(J);
5547         else if (needsExtract(J, VF))
5548           ScalarCost += TTI.getScalarizationOverhead(
5549                               ToVectorTy(J->getType(),VF), false, true);
5550       }
5551 
5552     // Scale the total scalar cost by block probability.
5553     ScalarCost /= getReciprocalPredBlockProb();
5554 
5555     // Compute the discount. A non-negative discount means the vector version
5556     // of the instruction costs more, and scalarizing would be beneficial.
5557     Discount += VectorCost - ScalarCost;
5558     ScalarCosts[I] = ScalarCost;
5559   }
5560 
5561   return Discount;
5562 }
5563 
5564 LoopVectorizationCostModel::VectorizationCostTy
5565 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5566   VectorizationCostTy Cost;
5567 
5568   // For each block.
5569   for (BasicBlock *BB : TheLoop->blocks()) {
5570     VectorizationCostTy BlockCost;
5571 
5572     // For each instruction in the old loop.
5573     for (Instruction &I : BB->instructionsWithoutDebug()) {
5574       // Skip ignored values.
5575       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5576           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5577         continue;
5578 
5579       VectorizationCostTy C = getInstructionCost(&I, VF);
5580 
5581       // Check if we should override the cost.
5582       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5583         C.first = ForceTargetInstructionCost;
5584 
5585       BlockCost.first += C.first;
5586       BlockCost.second |= C.second;
5587       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5588                         << " for VF " << VF << " For instruction: " << I
5589                         << '\n');
5590     }
5591 
5592     // If we are vectorizing a predicated block, it will have been
5593     // if-converted. This means that the block's instructions (aside from
5594     // stores and instructions that may divide by zero) will now be
5595     // unconditionally executed. For the scalar case, we may not always execute
5596     // the predicated block. Thus, scale the block's cost by the probability of
5597     // executing it.
5598     if (VF == 1 && blockNeedsPredication(BB))
5599       BlockCost.first /= getReciprocalPredBlockProb();
5600 
5601     Cost.first += BlockCost.first;
5602     Cost.second |= BlockCost.second;
5603   }
5604 
5605   return Cost;
5606 }
5607 
5608 /// Gets Address Access SCEV after verifying that the access pattern
5609 /// is loop invariant except the induction variable dependence.
5610 ///
5611 /// This SCEV can be sent to the Target in order to estimate the address
5612 /// calculation cost.
5613 static const SCEV *getAddressAccessSCEV(
5614               Value *Ptr,
5615               LoopVectorizationLegality *Legal,
5616               PredicatedScalarEvolution &PSE,
5617               const Loop *TheLoop) {
5618 
5619   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5620   if (!Gep)
5621     return nullptr;
5622 
5623   // We are looking for a gep with all loop invariant indices except for one
5624   // which should be an induction variable.
5625   auto SE = PSE.getSE();
5626   unsigned NumOperands = Gep->getNumOperands();
5627   for (unsigned i = 1; i < NumOperands; ++i) {
5628     Value *Opd = Gep->getOperand(i);
5629     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5630         !Legal->isInductionVariable(Opd))
5631       return nullptr;
5632   }
5633 
5634   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5635   return PSE.getSCEV(Ptr);
5636 }
5637 
5638 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5639   return Legal->hasStride(I->getOperand(0)) ||
5640          Legal->hasStride(I->getOperand(1));
5641 }
5642 
5643 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5644                                                                  unsigned VF) {
5645   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5646   Type *ValTy = getMemInstValueType(I);
5647   auto SE = PSE.getSE();
5648 
5649   unsigned Alignment = getLoadStoreAlignment(I);
5650   unsigned AS = getLoadStoreAddressSpace(I);
5651   Value *Ptr = getLoadStorePointerOperand(I);
5652   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5653 
5654   // Figure out whether the access is strided and get the stride value
5655   // if it's known in compile time
5656   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5657 
5658   // Get the cost of the scalar memory instruction and address computation.
5659   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5660 
5661   // Don't pass *I here, since it is scalar but will actually be part of a
5662   // vectorized loop where the user of it is a vectorized instruction.
5663   Cost += VF *
5664           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
5665                               AS);
5666 
5667   // Get the overhead of the extractelement and insertelement instructions
5668   // we might create due to scalarization.
5669   Cost += getScalarizationOverhead(I, VF);
5670 
5671   // If we have a predicated store, it may not be executed for each vector
5672   // lane. Scale the cost by the probability of executing the predicated
5673   // block.
5674   if (isPredicatedInst(I)) {
5675     Cost /= getReciprocalPredBlockProb();
5676 
5677     if (useEmulatedMaskMemRefHack(I))
5678       // Artificially setting to a high enough value to practically disable
5679       // vectorization with such operations.
5680       Cost = 3000000;
5681   }
5682 
5683   return Cost;
5684 }
5685 
5686 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5687                                                              unsigned VF) {
5688   Type *ValTy = getMemInstValueType(I);
5689   Type *VectorTy = ToVectorTy(ValTy, VF);
5690   unsigned Alignment = getLoadStoreAlignment(I);
5691   Value *Ptr = getLoadStorePointerOperand(I);
5692   unsigned AS = getLoadStoreAddressSpace(I);
5693   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5694 
5695   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5696          "Stride should be 1 or -1 for consecutive memory access");
5697   unsigned Cost = 0;
5698   if (Legal->isMaskRequired(I))
5699     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
5700   else
5701     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5702 
5703   bool Reverse = ConsecutiveStride < 0;
5704   if (Reverse)
5705     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5706   return Cost;
5707 }
5708 
5709 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5710                                                          unsigned VF) {
5711   Type *ValTy = getMemInstValueType(I);
5712   Type *VectorTy = ToVectorTy(ValTy, VF);
5713   unsigned Alignment = getLoadStoreAlignment(I);
5714   unsigned AS = getLoadStoreAddressSpace(I);
5715   if (isa<LoadInst>(I)) {
5716     return TTI.getAddressComputationCost(ValTy) +
5717            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5718            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5719   }
5720   StoreInst *SI = cast<StoreInst>(I);
5721 
5722   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5723   return TTI.getAddressComputationCost(ValTy) +
5724          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5725          (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
5726                                                Instruction::ExtractElement,
5727                                                VectorTy, VF - 1));
5728 }
5729 
5730 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5731                                                           unsigned VF) {
5732   Type *ValTy = getMemInstValueType(I);
5733   Type *VectorTy = ToVectorTy(ValTy, VF);
5734   unsigned Alignment = getLoadStoreAlignment(I);
5735   Value *Ptr = getLoadStorePointerOperand(I);
5736 
5737   return TTI.getAddressComputationCost(VectorTy) +
5738          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5739                                     Legal->isMaskRequired(I), Alignment);
5740 }
5741 
5742 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5743                                                             unsigned VF) {
5744   Type *ValTy = getMemInstValueType(I);
5745   Type *VectorTy = ToVectorTy(ValTy, VF);
5746   unsigned AS = getLoadStoreAddressSpace(I);
5747 
5748   auto Group = getInterleavedAccessGroup(I);
5749   assert(Group && "Fail to get an interleaved access group.");
5750 
5751   unsigned InterleaveFactor = Group->getFactor();
5752   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5753 
5754   // Holds the indices of existing members in an interleaved load group.
5755   // An interleaved store group doesn't need this as it doesn't allow gaps.
5756   SmallVector<unsigned, 4> Indices;
5757   if (isa<LoadInst>(I)) {
5758     for (unsigned i = 0; i < InterleaveFactor; i++)
5759       if (Group->getMember(i))
5760         Indices.push_back(i);
5761   }
5762 
5763   // Calculate the cost of the whole interleaved group.
5764   bool UseMaskForGaps =
5765       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5766   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5767       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5768       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5769 
5770   if (Group->isReverse()) {
5771     // TODO: Add support for reversed masked interleaved access.
5772     assert(!Legal->isMaskRequired(I) &&
5773            "Reverse masked interleaved access not supported.");
5774     Cost += Group->getNumMembers() *
5775             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5776   }
5777   return Cost;
5778 }
5779 
5780 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5781                                                               unsigned VF) {
5782   // Calculate scalar cost only. Vectorization cost should be ready at this
5783   // moment.
5784   if (VF == 1) {
5785     Type *ValTy = getMemInstValueType(I);
5786     unsigned Alignment = getLoadStoreAlignment(I);
5787     unsigned AS = getLoadStoreAddressSpace(I);
5788 
5789     return TTI.getAddressComputationCost(ValTy) +
5790            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5791   }
5792   return getWideningCost(I, VF);
5793 }
5794 
5795 LoopVectorizationCostModel::VectorizationCostTy
5796 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5797   // If we know that this instruction will remain uniform, check the cost of
5798   // the scalar version.
5799   if (isUniformAfterVectorization(I, VF))
5800     VF = 1;
5801 
5802   if (VF > 1 && isProfitableToScalarize(I, VF))
5803     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5804 
5805   // Forced scalars do not have any scalarization overhead.
5806   auto ForcedScalar = ForcedScalars.find(VF);
5807   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5808     auto InstSet = ForcedScalar->second;
5809     if (InstSet.find(I) != InstSet.end())
5810       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5811   }
5812 
5813   Type *VectorTy;
5814   unsigned C = getInstructionCost(I, VF, VectorTy);
5815 
5816   bool TypeNotScalarized =
5817       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5818   return VectorizationCostTy(C, TypeNotScalarized);
5819 }
5820 
5821 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5822                                                               unsigned VF) {
5823 
5824   if (VF == 1)
5825     return 0;
5826 
5827   unsigned Cost = 0;
5828   Type *RetTy = ToVectorTy(I->getType(), VF);
5829   if (!RetTy->isVoidTy() &&
5830       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5831     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5832 
5833   // Some targets keep addresses scalar.
5834   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5835     return Cost;
5836 
5837   // Some targets support efficient element stores.
5838   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5839     return Cost;
5840 
5841   // Collect operands to consider.
5842   CallInst *CI = dyn_cast<CallInst>(I);
5843   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5844 
5845   // Skip operands that do not require extraction/scalarization and do not incur
5846   // any overhead.
5847   return Cost + TTI.getOperandsScalarizationOverhead(
5848                     filterExtractingOperands(Ops, VF), VF);
5849 }
5850 
5851 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5852   if (VF == 1)
5853     return;
5854   NumPredStores = 0;
5855   for (BasicBlock *BB : TheLoop->blocks()) {
5856     // For each instruction in the old loop.
5857     for (Instruction &I : *BB) {
5858       Value *Ptr =  getLoadStorePointerOperand(&I);
5859       if (!Ptr)
5860         continue;
5861 
5862       // TODO: We should generate better code and update the cost model for
5863       // predicated uniform stores. Today they are treated as any other
5864       // predicated store (see added test cases in
5865       // invariant-store-vectorization.ll).
5866       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5867         NumPredStores++;
5868 
5869       if (Legal->isUniform(Ptr) &&
5870           // Conditional loads and stores should be scalarized and predicated.
5871           // isScalarWithPredication cannot be used here since masked
5872           // gather/scatters are not considered scalar with predication.
5873           !Legal->blockNeedsPredication(I.getParent())) {
5874         // TODO: Avoid replicating loads and stores instead of
5875         // relying on instcombine to remove them.
5876         // Load: Scalar load + broadcast
5877         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5878         unsigned Cost = getUniformMemOpCost(&I, VF);
5879         setWideningDecision(&I, VF, CM_Scalarize, Cost);
5880         continue;
5881       }
5882 
5883       // We assume that widening is the best solution when possible.
5884       if (memoryInstructionCanBeWidened(&I, VF)) {
5885         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5886         int ConsecutiveStride =
5887                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5888         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5889                "Expected consecutive stride.");
5890         InstWidening Decision =
5891             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5892         setWideningDecision(&I, VF, Decision, Cost);
5893         continue;
5894       }
5895 
5896       // Choose between Interleaving, Gather/Scatter or Scalarization.
5897       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
5898       unsigned NumAccesses = 1;
5899       if (isAccessInterleaved(&I)) {
5900         auto Group = getInterleavedAccessGroup(&I);
5901         assert(Group && "Fail to get an interleaved access group.");
5902 
5903         // Make one decision for the whole group.
5904         if (getWideningDecision(&I, VF) != CM_Unknown)
5905           continue;
5906 
5907         NumAccesses = Group->getNumMembers();
5908         if (interleavedAccessCanBeWidened(&I, VF))
5909           InterleaveCost = getInterleaveGroupCost(&I, VF);
5910       }
5911 
5912       unsigned GatherScatterCost =
5913           isLegalGatherOrScatter(&I)
5914               ? getGatherScatterCost(&I, VF) * NumAccesses
5915               : std::numeric_limits<unsigned>::max();
5916 
5917       unsigned ScalarizationCost =
5918           getMemInstScalarizationCost(&I, VF) * NumAccesses;
5919 
5920       // Choose better solution for the current VF,
5921       // write down this decision and use it during vectorization.
5922       unsigned Cost;
5923       InstWidening Decision;
5924       if (InterleaveCost <= GatherScatterCost &&
5925           InterleaveCost < ScalarizationCost) {
5926         Decision = CM_Interleave;
5927         Cost = InterleaveCost;
5928       } else if (GatherScatterCost < ScalarizationCost) {
5929         Decision = CM_GatherScatter;
5930         Cost = GatherScatterCost;
5931       } else {
5932         Decision = CM_Scalarize;
5933         Cost = ScalarizationCost;
5934       }
5935       // If the instructions belongs to an interleave group, the whole group
5936       // receives the same decision. The whole group receives the cost, but
5937       // the cost will actually be assigned to one instruction.
5938       if (auto Group = getInterleavedAccessGroup(&I))
5939         setWideningDecision(Group, VF, Decision, Cost);
5940       else
5941         setWideningDecision(&I, VF, Decision, Cost);
5942     }
5943   }
5944 
5945   // Make sure that any load of address and any other address computation
5946   // remains scalar unless there is gather/scatter support. This avoids
5947   // inevitable extracts into address registers, and also has the benefit of
5948   // activating LSR more, since that pass can't optimize vectorized
5949   // addresses.
5950   if (TTI.prefersVectorizedAddressing())
5951     return;
5952 
5953   // Start with all scalar pointer uses.
5954   SmallPtrSet<Instruction *, 8> AddrDefs;
5955   for (BasicBlock *BB : TheLoop->blocks())
5956     for (Instruction &I : *BB) {
5957       Instruction *PtrDef =
5958         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5959       if (PtrDef && TheLoop->contains(PtrDef) &&
5960           getWideningDecision(&I, VF) != CM_GatherScatter)
5961         AddrDefs.insert(PtrDef);
5962     }
5963 
5964   // Add all instructions used to generate the addresses.
5965   SmallVector<Instruction *, 4> Worklist;
5966   for (auto *I : AddrDefs)
5967     Worklist.push_back(I);
5968   while (!Worklist.empty()) {
5969     Instruction *I = Worklist.pop_back_val();
5970     for (auto &Op : I->operands())
5971       if (auto *InstOp = dyn_cast<Instruction>(Op))
5972         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
5973             AddrDefs.insert(InstOp).second)
5974           Worklist.push_back(InstOp);
5975   }
5976 
5977   for (auto *I : AddrDefs) {
5978     if (isa<LoadInst>(I)) {
5979       // Setting the desired widening decision should ideally be handled in
5980       // by cost functions, but since this involves the task of finding out
5981       // if the loaded register is involved in an address computation, it is
5982       // instead changed here when we know this is the case.
5983       InstWidening Decision = getWideningDecision(I, VF);
5984       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5985         // Scalarize a widened load of address.
5986         setWideningDecision(I, VF, CM_Scalarize,
5987                             (VF * getMemoryInstructionCost(I, 1)));
5988       else if (auto Group = getInterleavedAccessGroup(I)) {
5989         // Scalarize an interleave group of address loads.
5990         for (unsigned I = 0; I < Group->getFactor(); ++I) {
5991           if (Instruction *Member = Group->getMember(I))
5992             setWideningDecision(Member, VF, CM_Scalarize,
5993                                 (VF * getMemoryInstructionCost(Member, 1)));
5994         }
5995       }
5996     } else
5997       // Make sure I gets scalarized and a cost estimate without
5998       // scalarization overhead.
5999       ForcedScalars[VF].insert(I);
6000   }
6001 }
6002 
6003 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6004                                                         unsigned VF,
6005                                                         Type *&VectorTy) {
6006   Type *RetTy = I->getType();
6007   if (canTruncateToMinimalBitwidth(I, VF))
6008     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6009   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6010   auto SE = PSE.getSE();
6011 
6012   // TODO: We need to estimate the cost of intrinsic calls.
6013   switch (I->getOpcode()) {
6014   case Instruction::GetElementPtr:
6015     // We mark this instruction as zero-cost because the cost of GEPs in
6016     // vectorized code depends on whether the corresponding memory instruction
6017     // is scalarized or not. Therefore, we handle GEPs with the memory
6018     // instruction cost.
6019     return 0;
6020   case Instruction::Br: {
6021     // In cases of scalarized and predicated instructions, there will be VF
6022     // predicated blocks in the vectorized loop. Each branch around these
6023     // blocks requires also an extract of its vector compare i1 element.
6024     bool ScalarPredicatedBB = false;
6025     BranchInst *BI = cast<BranchInst>(I);
6026     if (VF > 1 && BI->isConditional() &&
6027         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6028              PredicatedBBsAfterVectorization.end() ||
6029          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6030              PredicatedBBsAfterVectorization.end()))
6031       ScalarPredicatedBB = true;
6032 
6033     if (ScalarPredicatedBB) {
6034       // Return cost for branches around scalarized and predicated blocks.
6035       Type *Vec_i1Ty =
6036           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6037       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6038               (TTI.getCFInstrCost(Instruction::Br) * VF));
6039     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6040       // The back-edge branch will remain, as will all scalar branches.
6041       return TTI.getCFInstrCost(Instruction::Br);
6042     else
6043       // This branch will be eliminated by if-conversion.
6044       return 0;
6045     // Note: We currently assume zero cost for an unconditional branch inside
6046     // a predicated block since it will become a fall-through, although we
6047     // may decide in the future to call TTI for all branches.
6048   }
6049   case Instruction::PHI: {
6050     auto *Phi = cast<PHINode>(I);
6051 
6052     // First-order recurrences are replaced by vector shuffles inside the loop.
6053     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6054     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6055       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6056                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6057 
6058     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6059     // converted into select instructions. We require N - 1 selects per phi
6060     // node, where N is the number of incoming values.
6061     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6062       return (Phi->getNumIncomingValues() - 1) *
6063              TTI.getCmpSelInstrCost(
6064                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6065                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6066 
6067     return TTI.getCFInstrCost(Instruction::PHI);
6068   }
6069   case Instruction::UDiv:
6070   case Instruction::SDiv:
6071   case Instruction::URem:
6072   case Instruction::SRem:
6073     // If we have a predicated instruction, it may not be executed for each
6074     // vector lane. Get the scalarization cost and scale this amount by the
6075     // probability of executing the predicated block. If the instruction is not
6076     // predicated, we fall through to the next case.
6077     if (VF > 1 && isScalarWithPredication(I)) {
6078       unsigned Cost = 0;
6079 
6080       // These instructions have a non-void type, so account for the phi nodes
6081       // that we will create. This cost is likely to be zero. The phi node
6082       // cost, if any, should be scaled by the block probability because it
6083       // models a copy at the end of each predicated block.
6084       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6085 
6086       // The cost of the non-predicated instruction.
6087       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6088 
6089       // The cost of insertelement and extractelement instructions needed for
6090       // scalarization.
6091       Cost += getScalarizationOverhead(I, VF);
6092 
6093       // Scale the cost by the probability of executing the predicated blocks.
6094       // This assumes the predicated block for each vector lane is equally
6095       // likely.
6096       return Cost / getReciprocalPredBlockProb();
6097     }
6098     LLVM_FALLTHROUGH;
6099   case Instruction::Add:
6100   case Instruction::FAdd:
6101   case Instruction::Sub:
6102   case Instruction::FSub:
6103   case Instruction::Mul:
6104   case Instruction::FMul:
6105   case Instruction::FDiv:
6106   case Instruction::FRem:
6107   case Instruction::Shl:
6108   case Instruction::LShr:
6109   case Instruction::AShr:
6110   case Instruction::And:
6111   case Instruction::Or:
6112   case Instruction::Xor: {
6113     // Since we will replace the stride by 1 the multiplication should go away.
6114     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6115       return 0;
6116     // Certain instructions can be cheaper to vectorize if they have a constant
6117     // second vector operand. One example of this are shifts on x86.
6118     Value *Op2 = I->getOperand(1);
6119     TargetTransformInfo::OperandValueProperties Op2VP;
6120     TargetTransformInfo::OperandValueKind Op2VK =
6121         TTI.getOperandInfo(Op2, Op2VP);
6122     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6123       Op2VK = TargetTransformInfo::OK_UniformValue;
6124 
6125     SmallVector<const Value *, 4> Operands(I->operand_values());
6126     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6127     return N * TTI.getArithmeticInstrCost(
6128                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6129                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands);
6130   }
6131   case Instruction::FNeg: {
6132     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6133     return N * TTI.getArithmeticInstrCost(
6134                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6135                    TargetTransformInfo::OK_AnyValue,
6136                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6137                    I->getOperand(0));
6138   }
6139   case Instruction::Select: {
6140     SelectInst *SI = cast<SelectInst>(I);
6141     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6142     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6143     Type *CondTy = SI->getCondition()->getType();
6144     if (!ScalarCond)
6145       CondTy = VectorType::get(CondTy, VF);
6146 
6147     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6148   }
6149   case Instruction::ICmp:
6150   case Instruction::FCmp: {
6151     Type *ValTy = I->getOperand(0)->getType();
6152     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6153     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6154       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6155     VectorTy = ToVectorTy(ValTy, VF);
6156     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6157   }
6158   case Instruction::Store:
6159   case Instruction::Load: {
6160     unsigned Width = VF;
6161     if (Width > 1) {
6162       InstWidening Decision = getWideningDecision(I, Width);
6163       assert(Decision != CM_Unknown &&
6164              "CM decision should be taken at this point");
6165       if (Decision == CM_Scalarize)
6166         Width = 1;
6167     }
6168     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6169     return getMemoryInstructionCost(I, VF);
6170   }
6171   case Instruction::ZExt:
6172   case Instruction::SExt:
6173   case Instruction::FPToUI:
6174   case Instruction::FPToSI:
6175   case Instruction::FPExt:
6176   case Instruction::PtrToInt:
6177   case Instruction::IntToPtr:
6178   case Instruction::SIToFP:
6179   case Instruction::UIToFP:
6180   case Instruction::Trunc:
6181   case Instruction::FPTrunc:
6182   case Instruction::BitCast: {
6183     // We optimize the truncation of induction variables having constant
6184     // integer steps. The cost of these truncations is the same as the scalar
6185     // operation.
6186     if (isOptimizableIVTruncate(I, VF)) {
6187       auto *Trunc = cast<TruncInst>(I);
6188       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6189                                   Trunc->getSrcTy(), Trunc);
6190     }
6191 
6192     Type *SrcScalarTy = I->getOperand(0)->getType();
6193     Type *SrcVecTy =
6194         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6195     if (canTruncateToMinimalBitwidth(I, VF)) {
6196       // This cast is going to be shrunk. This may remove the cast or it might
6197       // turn it into slightly different cast. For example, if MinBW == 16,
6198       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6199       //
6200       // Calculate the modified src and dest types.
6201       Type *MinVecTy = VectorTy;
6202       if (I->getOpcode() == Instruction::Trunc) {
6203         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6204         VectorTy =
6205             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6206       } else if (I->getOpcode() == Instruction::ZExt ||
6207                  I->getOpcode() == Instruction::SExt) {
6208         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6209         VectorTy =
6210             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6211       }
6212     }
6213 
6214     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6215     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6216   }
6217   case Instruction::Call: {
6218     bool NeedToScalarize;
6219     CallInst *CI = cast<CallInst>(I);
6220     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6221     if (getVectorIntrinsicIDForCall(CI, TLI))
6222       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6223     return CallCost;
6224   }
6225   default:
6226     // The cost of executing VF copies of the scalar instruction. This opcode
6227     // is unknown. Assume that it is the same as 'mul'.
6228     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6229            getScalarizationOverhead(I, VF);
6230   } // end of switch.
6231 }
6232 
6233 char LoopVectorize::ID = 0;
6234 
6235 static const char lv_name[] = "Loop Vectorization";
6236 
6237 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6238 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6239 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6240 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6241 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6242 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6243 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6244 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6245 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6246 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6247 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6248 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6249 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6250 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6251 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6252 
6253 namespace llvm {
6254 
6255 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6256 
6257 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6258                               bool VectorizeOnlyWhenForced) {
6259   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6260 }
6261 
6262 } // end namespace llvm
6263 
6264 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6265   // Check if the pointer operand of a load or store instruction is
6266   // consecutive.
6267   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6268     return Legal->isConsecutivePtr(Ptr);
6269   return false;
6270 }
6271 
6272 void LoopVectorizationCostModel::collectValuesToIgnore() {
6273   // Ignore ephemeral values.
6274   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6275 
6276   // Ignore type-promoting instructions we identified during reduction
6277   // detection.
6278   for (auto &Reduction : *Legal->getReductionVars()) {
6279     RecurrenceDescriptor &RedDes = Reduction.second;
6280     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6281     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6282   }
6283   // Ignore type-casting instructions we identified during induction
6284   // detection.
6285   for (auto &Induction : *Legal->getInductionVars()) {
6286     InductionDescriptor &IndDes = Induction.second;
6287     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6288     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6289   }
6290 }
6291 
6292 // TODO: we could return a pair of values that specify the max VF and
6293 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6294 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6295 // doesn't have a cost model that can choose which plan to execute if
6296 // more than one is generated.
6297 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6298                                  LoopVectorizationCostModel &CM) {
6299   unsigned WidestType;
6300   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6301   return WidestVectorRegBits / WidestType;
6302 }
6303 
6304 VectorizationFactor
6305 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6306   unsigned VF = UserVF;
6307   // Outer loop handling: They may require CFG and instruction level
6308   // transformations before even evaluating whether vectorization is profitable.
6309   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6310   // the vectorization pipeline.
6311   if (!OrigLoop->empty()) {
6312     // If the user doesn't provide a vectorization factor, determine a
6313     // reasonable one.
6314     if (!UserVF) {
6315       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6316       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6317 
6318       // Make sure we have a VF > 1 for stress testing.
6319       if (VPlanBuildStressTest && VF < 2) {
6320         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6321                           << "overriding computed VF.\n");
6322         VF = 4;
6323       }
6324     }
6325     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6326     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6327     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6328                       << " to build VPlans.\n");
6329     buildVPlans(VF, VF);
6330 
6331     // For VPlan build stress testing, we bail out after VPlan construction.
6332     if (VPlanBuildStressTest)
6333       return VectorizationFactor::Disabled();
6334 
6335     return {VF, 0};
6336   }
6337 
6338   LLVM_DEBUG(
6339       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6340                 "VPlan-native path.\n");
6341   return VectorizationFactor::Disabled();
6342 }
6343 
6344 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6345   assert(OrigLoop->empty() && "Inner loop expected.");
6346   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6347   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
6348     return None;
6349 
6350   // Invalidate interleave groups if all blocks of loop will be predicated.
6351   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6352       !useMaskedInterleavedAccesses(*TTI)) {
6353     LLVM_DEBUG(
6354         dbgs()
6355         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6356            "which requires masked-interleaved support.\n");
6357     CM.InterleaveInfo.reset();
6358   }
6359 
6360   if (UserVF) {
6361     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6362     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6363     // Collect the instructions (and their associated costs) that will be more
6364     // profitable to scalarize.
6365     CM.selectUserVectorizationFactor(UserVF);
6366     buildVPlansWithVPRecipes(UserVF, UserVF);
6367     LLVM_DEBUG(printPlans(dbgs()));
6368     return {{UserVF, 0}};
6369   }
6370 
6371   unsigned MaxVF = MaybeMaxVF.getValue();
6372   assert(MaxVF != 0 && "MaxVF is zero.");
6373 
6374   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6375     // Collect Uniform and Scalar instructions after vectorization with VF.
6376     CM.collectUniformsAndScalars(VF);
6377 
6378     // Collect the instructions (and their associated costs) that will be more
6379     // profitable to scalarize.
6380     if (VF > 1)
6381       CM.collectInstsToScalarize(VF);
6382   }
6383 
6384   buildVPlansWithVPRecipes(1, MaxVF);
6385   LLVM_DEBUG(printPlans(dbgs()));
6386   if (MaxVF == 1)
6387     return VectorizationFactor::Disabled();
6388 
6389   // Select the optimal vectorization factor.
6390   return CM.selectVectorizationFactor(MaxVF);
6391 }
6392 
6393 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6394   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6395                     << '\n');
6396   BestVF = VF;
6397   BestUF = UF;
6398 
6399   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6400     return !Plan->hasVF(VF);
6401   });
6402   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6403 }
6404 
6405 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6406                                            DominatorTree *DT) {
6407   // Perform the actual loop transformation.
6408 
6409   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6410   VPCallbackILV CallbackILV(ILV);
6411 
6412   VPTransformState State{BestVF, BestUF,      LI,
6413                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6414                          &ILV,   CallbackILV};
6415   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6416   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6417 
6418   //===------------------------------------------------===//
6419   //
6420   // Notice: any optimization or new instruction that go
6421   // into the code below should also be implemented in
6422   // the cost-model.
6423   //
6424   //===------------------------------------------------===//
6425 
6426   // 2. Copy and widen instructions from the old loop into the new loop.
6427   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6428   VPlans.front()->execute(&State);
6429 
6430   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6431   //    predication, updating analyses.
6432   ILV.fixVectorizedLoop();
6433 }
6434 
6435 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6436     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6437   BasicBlock *Latch = OrigLoop->getLoopLatch();
6438 
6439   // We create new control-flow for the vectorized loop, so the original
6440   // condition will be dead after vectorization if it's only used by the
6441   // branch.
6442   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6443   if (Cmp && Cmp->hasOneUse())
6444     DeadInstructions.insert(Cmp);
6445 
6446   // We create new "steps" for induction variable updates to which the original
6447   // induction variables map. An original update instruction will be dead if
6448   // all its users except the induction variable are dead.
6449   for (auto &Induction : *Legal->getInductionVars()) {
6450     PHINode *Ind = Induction.first;
6451     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6452     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6453           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6454                                  DeadInstructions.end();
6455         }))
6456       DeadInstructions.insert(IndUpdate);
6457 
6458     // We record as "Dead" also the type-casting instructions we had identified
6459     // during induction analysis. We don't need any handling for them in the
6460     // vectorized loop because we have proven that, under a proper runtime
6461     // test guarding the vectorized loop, the value of the phi, and the casted
6462     // value of the phi, are the same. The last instruction in this casting chain
6463     // will get its scalar/vector/widened def from the scalar/vector/widened def
6464     // of the respective phi node. Any other casts in the induction def-use chain
6465     // have no other uses outside the phi update chain, and will be ignored.
6466     InductionDescriptor &IndDes = Induction.second;
6467     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6468     DeadInstructions.insert(Casts.begin(), Casts.end());
6469   }
6470 }
6471 
6472 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6473 
6474 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6475 
6476 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6477                                         Instruction::BinaryOps BinOp) {
6478   // When unrolling and the VF is 1, we only need to add a simple scalar.
6479   Type *Ty = Val->getType();
6480   assert(!Ty->isVectorTy() && "Val must be a scalar");
6481 
6482   if (Ty->isFloatingPointTy()) {
6483     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6484 
6485     // Floating point operations had to be 'fast' to enable the unrolling.
6486     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6487     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6488   }
6489   Constant *C = ConstantInt::get(Ty, StartIdx);
6490   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6491 }
6492 
6493 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6494   SmallVector<Metadata *, 4> MDs;
6495   // Reserve first location for self reference to the LoopID metadata node.
6496   MDs.push_back(nullptr);
6497   bool IsUnrollMetadata = false;
6498   MDNode *LoopID = L->getLoopID();
6499   if (LoopID) {
6500     // First find existing loop unrolling disable metadata.
6501     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6502       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6503       if (MD) {
6504         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6505         IsUnrollMetadata =
6506             S && S->getString().startswith("llvm.loop.unroll.disable");
6507       }
6508       MDs.push_back(LoopID->getOperand(i));
6509     }
6510   }
6511 
6512   if (!IsUnrollMetadata) {
6513     // Add runtime unroll disable metadata.
6514     LLVMContext &Context = L->getHeader()->getContext();
6515     SmallVector<Metadata *, 1> DisableOperands;
6516     DisableOperands.push_back(
6517         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6518     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6519     MDs.push_back(DisableNode);
6520     MDNode *NewLoopID = MDNode::get(Context, MDs);
6521     // Set operand 0 to refer to the loop id itself.
6522     NewLoopID->replaceOperandWith(0, NewLoopID);
6523     L->setLoopID(NewLoopID);
6524   }
6525 }
6526 
6527 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6528     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6529   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6530   bool PredicateAtRangeStart = Predicate(Range.Start);
6531 
6532   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6533     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6534       Range.End = TmpVF;
6535       break;
6536     }
6537 
6538   return PredicateAtRangeStart;
6539 }
6540 
6541 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6542 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6543 /// of VF's starting at a given VF and extending it as much as possible. Each
6544 /// vectorization decision can potentially shorten this sub-range during
6545 /// buildVPlan().
6546 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6547   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6548     VFRange SubRange = {VF, MaxVF + 1};
6549     VPlans.push_back(buildVPlan(SubRange));
6550     VF = SubRange.End;
6551   }
6552 }
6553 
6554 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6555                                          VPlanPtr &Plan) {
6556   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6557 
6558   // Look for cached value.
6559   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6560   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6561   if (ECEntryIt != EdgeMaskCache.end())
6562     return ECEntryIt->second;
6563 
6564   VPValue *SrcMask = createBlockInMask(Src, Plan);
6565 
6566   // The terminator has to be a branch inst!
6567   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6568   assert(BI && "Unexpected terminator found");
6569 
6570   if (!BI->isConditional())
6571     return EdgeMaskCache[Edge] = SrcMask;
6572 
6573   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6574   assert(EdgeMask && "No Edge Mask found for condition");
6575 
6576   if (BI->getSuccessor(0) != Dst)
6577     EdgeMask = Builder.createNot(EdgeMask);
6578 
6579   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6580     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6581 
6582   return EdgeMaskCache[Edge] = EdgeMask;
6583 }
6584 
6585 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6586   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6587 
6588   // Look for cached value.
6589   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6590   if (BCEntryIt != BlockMaskCache.end())
6591     return BCEntryIt->second;
6592 
6593   // All-one mask is modelled as no-mask following the convention for masked
6594   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6595   VPValue *BlockMask = nullptr;
6596 
6597   if (OrigLoop->getHeader() == BB) {
6598     if (!CM.blockNeedsPredication(BB))
6599       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6600 
6601     // Introduce the early-exit compare IV <= BTC to form header block mask.
6602     // This is used instead of IV < TC because TC may wrap, unlike BTC.
6603     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6604     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6605     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6606     return BlockMaskCache[BB] = BlockMask;
6607   }
6608 
6609   // This is the block mask. We OR all incoming edges.
6610   for (auto *Predecessor : predecessors(BB)) {
6611     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6612     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6613       return BlockMaskCache[BB] = EdgeMask;
6614 
6615     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6616       BlockMask = EdgeMask;
6617       continue;
6618     }
6619 
6620     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6621   }
6622 
6623   return BlockMaskCache[BB] = BlockMask;
6624 }
6625 
6626 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
6627                                                            VFRange &Range,
6628                                                            VPlanPtr &Plan) {
6629   const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I);
6630   if (!IG)
6631     return nullptr;
6632 
6633   // Now check if IG is relevant for VF's in the given range.
6634   auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
6635     return [=](unsigned VF) -> bool {
6636       return (VF >= 2 && // Query is illegal for VF == 1
6637               CM.getWideningDecision(I, VF) ==
6638                   LoopVectorizationCostModel::CM_Interleave);
6639     };
6640   };
6641   if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range))
6642     return nullptr;
6643 
6644   // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
6645   // range. If it's the primary member of the IG construct a VPInterleaveRecipe.
6646   // Otherwise, it's an adjunct member of the IG, do not construct any Recipe.
6647   assert(I == IG->getInsertPos() &&
6648          "Generating a recipe for an adjunct member of an interleave group");
6649 
6650   VPValue *Mask = nullptr;
6651   if (Legal->isMaskRequired(I))
6652     Mask = createBlockInMask(I->getParent(), Plan);
6653 
6654   return new VPInterleaveRecipe(IG, Mask);
6655 }
6656 
6657 VPWidenMemoryInstructionRecipe *
6658 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6659                                   VPlanPtr &Plan) {
6660   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6661     return nullptr;
6662 
6663   auto willWiden = [&](unsigned VF) -> bool {
6664     if (VF == 1)
6665       return false;
6666     if (CM.isScalarAfterVectorization(I, VF) ||
6667         CM.isProfitableToScalarize(I, VF))
6668       return false;
6669     LoopVectorizationCostModel::InstWidening Decision =
6670         CM.getWideningDecision(I, VF);
6671     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6672            "CM decision should be taken at this point.");
6673     assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
6674            "Interleave memory opportunity should be caught earlier.");
6675     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6676   };
6677 
6678   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6679     return nullptr;
6680 
6681   VPValue *Mask = nullptr;
6682   if (Legal->isMaskRequired(I))
6683     Mask = createBlockInMask(I->getParent(), Plan);
6684 
6685   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6686 }
6687 
6688 VPWidenIntOrFpInductionRecipe *
6689 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6690   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6691     // Check if this is an integer or fp induction. If so, build the recipe that
6692     // produces its scalar and vector values.
6693     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6694     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6695         II.getKind() == InductionDescriptor::IK_FpInduction)
6696       return new VPWidenIntOrFpInductionRecipe(Phi);
6697 
6698     return nullptr;
6699   }
6700 
6701   // Optimize the special case where the source is a constant integer
6702   // induction variable. Notice that we can only optimize the 'trunc' case
6703   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6704   // (c) other casts depend on pointer size.
6705 
6706   // Determine whether \p K is a truncation based on an induction variable that
6707   // can be optimized.
6708   auto isOptimizableIVTruncate =
6709       [&](Instruction *K) -> std::function<bool(unsigned)> {
6710     return
6711         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6712   };
6713 
6714   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6715                                isOptimizableIVTruncate(I), Range))
6716     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6717                                              cast<TruncInst>(I));
6718   return nullptr;
6719 }
6720 
6721 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6722   PHINode *Phi = dyn_cast<PHINode>(I);
6723   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6724     return nullptr;
6725 
6726   // We know that all PHIs in non-header blocks are converted into selects, so
6727   // we don't have to worry about the insertion order and we can just use the
6728   // builder. At this point we generate the predication tree. There may be
6729   // duplications since this is a simple recursive scan, but future
6730   // optimizations will clean it up.
6731 
6732   SmallVector<VPValue *, 2> Masks;
6733   unsigned NumIncoming = Phi->getNumIncomingValues();
6734   for (unsigned In = 0; In < NumIncoming; In++) {
6735     VPValue *EdgeMask =
6736       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6737     assert((EdgeMask || NumIncoming == 1) &&
6738            "Multiple predecessors with one having a full mask");
6739     if (EdgeMask)
6740       Masks.push_back(EdgeMask);
6741   }
6742   return new VPBlendRecipe(Phi, Masks);
6743 }
6744 
6745 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6746                                  VFRange &Range) {
6747 
6748   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6749       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6750 
6751   if (IsPredicated)
6752     return false;
6753 
6754   auto IsVectorizableOpcode = [](unsigned Opcode) {
6755     switch (Opcode) {
6756     case Instruction::Add:
6757     case Instruction::And:
6758     case Instruction::AShr:
6759     case Instruction::BitCast:
6760     case Instruction::Br:
6761     case Instruction::Call:
6762     case Instruction::FAdd:
6763     case Instruction::FCmp:
6764     case Instruction::FDiv:
6765     case Instruction::FMul:
6766     case Instruction::FNeg:
6767     case Instruction::FPExt:
6768     case Instruction::FPToSI:
6769     case Instruction::FPToUI:
6770     case Instruction::FPTrunc:
6771     case Instruction::FRem:
6772     case Instruction::FSub:
6773     case Instruction::GetElementPtr:
6774     case Instruction::ICmp:
6775     case Instruction::IntToPtr:
6776     case Instruction::Load:
6777     case Instruction::LShr:
6778     case Instruction::Mul:
6779     case Instruction::Or:
6780     case Instruction::PHI:
6781     case Instruction::PtrToInt:
6782     case Instruction::SDiv:
6783     case Instruction::Select:
6784     case Instruction::SExt:
6785     case Instruction::Shl:
6786     case Instruction::SIToFP:
6787     case Instruction::SRem:
6788     case Instruction::Store:
6789     case Instruction::Sub:
6790     case Instruction::Trunc:
6791     case Instruction::UDiv:
6792     case Instruction::UIToFP:
6793     case Instruction::URem:
6794     case Instruction::Xor:
6795     case Instruction::ZExt:
6796       return true;
6797     }
6798     return false;
6799   };
6800 
6801   if (!IsVectorizableOpcode(I->getOpcode()))
6802     return false;
6803 
6804   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6805     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6806     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6807                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6808       return false;
6809   }
6810 
6811   auto willWiden = [&](unsigned VF) -> bool {
6812     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6813                              CM.isProfitableToScalarize(I, VF)))
6814       return false;
6815     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6816       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6817       // The following case may be scalarized depending on the VF.
6818       // The flag shows whether we use Intrinsic or a usual Call for vectorized
6819       // version of the instruction.
6820       // Is it beneficial to perform intrinsic call compared to lib call?
6821       bool NeedToScalarize;
6822       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6823       bool UseVectorIntrinsic =
6824           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6825       return UseVectorIntrinsic || !NeedToScalarize;
6826     }
6827     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6828       assert(CM.getWideningDecision(I, VF) ==
6829                  LoopVectorizationCostModel::CM_Scalarize &&
6830              "Memory widening decisions should have been taken care by now");
6831       return false;
6832     }
6833     return true;
6834   };
6835 
6836   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6837     return false;
6838 
6839   // Success: widen this instruction. We optimize the common case where
6840   // consecutive instructions can be represented by a single recipe.
6841   if (!VPBB->empty()) {
6842     VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back());
6843     if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
6844       return true;
6845   }
6846 
6847   VPBB->appendRecipe(new VPWidenRecipe(I));
6848   return true;
6849 }
6850 
6851 VPBasicBlock *VPRecipeBuilder::handleReplication(
6852     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6853     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6854     VPlanPtr &Plan) {
6855   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6856       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6857       Range);
6858 
6859   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6860       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6861 
6862   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6863 
6864   // Find if I uses a predicated instruction. If so, it will use its scalar
6865   // value. Avoid hoisting the insert-element which packs the scalar value into
6866   // a vector value, as that happens iff all users use the vector value.
6867   for (auto &Op : I->operands())
6868     if (auto *PredInst = dyn_cast<Instruction>(Op))
6869       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6870         PredInst2Recipe[PredInst]->setAlsoPack(false);
6871 
6872   // Finalize the recipe for Instr, first if it is not predicated.
6873   if (!IsPredicated) {
6874     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6875     VPBB->appendRecipe(Recipe);
6876     return VPBB;
6877   }
6878   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6879   assert(VPBB->getSuccessors().empty() &&
6880          "VPBB has successors when handling predicated replication.");
6881   // Record predicated instructions for above packing optimizations.
6882   PredInst2Recipe[I] = Recipe;
6883   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6884   VPBlockUtils::insertBlockAfter(Region, VPBB);
6885   auto *RegSucc = new VPBasicBlock();
6886   VPBlockUtils::insertBlockAfter(RegSucc, Region);
6887   return RegSucc;
6888 }
6889 
6890 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6891                                                       VPRecipeBase *PredRecipe,
6892                                                       VPlanPtr &Plan) {
6893   // Instructions marked for predication are replicated and placed under an
6894   // if-then construct to prevent side-effects.
6895 
6896   // Generate recipes to compute the block mask for this region.
6897   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6898 
6899   // Build the triangular if-then region.
6900   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6901   assert(Instr->getParent() && "Predicated instruction not in any basic block");
6902   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6903   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6904   auto *PHIRecipe =
6905       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6906   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6907   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6908   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6909 
6910   // Note: first set Entry as region entry and then connect successors starting
6911   // from it in order, to propagate the "parent" of each VPBasicBlock.
6912   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6913   VPBlockUtils::connectBlocks(Pred, Exit);
6914 
6915   return Region;
6916 }
6917 
6918 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
6919                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
6920   VPRecipeBase *Recipe = nullptr;
6921   // Check if Instr should belong to an interleave memory recipe, or already
6922   // does. In the latter case Instr is irrelevant.
6923   if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
6924     VPBB->appendRecipe(Recipe);
6925     return true;
6926   }
6927 
6928   // Check if Instr is a memory operation that should be widened.
6929   if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) {
6930     VPBB->appendRecipe(Recipe);
6931     return true;
6932   }
6933 
6934   // Check if Instr should form some PHI recipe.
6935   if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
6936     VPBB->appendRecipe(Recipe);
6937     return true;
6938   }
6939   if ((Recipe = tryToBlend(Instr, Plan))) {
6940     VPBB->appendRecipe(Recipe);
6941     return true;
6942   }
6943   if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
6944     VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
6945     return true;
6946   }
6947 
6948   // Check if Instr is to be widened by a general VPWidenRecipe, after
6949   // having first checked for specific widening recipes that deal with
6950   // Interleave Groups, Inductions and Phi nodes.
6951   if (tryToWiden(Instr, VPBB, Range))
6952     return true;
6953 
6954   return false;
6955 }
6956 
6957 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
6958                                                         unsigned MaxVF) {
6959   assert(OrigLoop->empty() && "Inner loop expected.");
6960 
6961   // Collect conditions feeding internal conditional branches; they need to be
6962   // represented in VPlan for it to model masking.
6963   SmallPtrSet<Value *, 1> NeedDef;
6964 
6965   auto *Latch = OrigLoop->getLoopLatch();
6966   for (BasicBlock *BB : OrigLoop->blocks()) {
6967     if (BB == Latch)
6968       continue;
6969     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
6970     if (Branch && Branch->isConditional())
6971       NeedDef.insert(Branch->getCondition());
6972   }
6973 
6974   // If the tail is to be folded by masking, the primary induction variable
6975   // needs to be represented in VPlan for it to model early-exit masking.
6976   // Also, both the Phi and the live-out instruction of each reduction are
6977   // required in order to introduce a select between them in VPlan.
6978   if (CM.foldTailByMasking()) {
6979     NeedDef.insert(Legal->getPrimaryInduction());
6980     for (auto &Reduction : *Legal->getReductionVars()) {
6981       NeedDef.insert(Reduction.first);
6982       NeedDef.insert(Reduction.second.getLoopExitInstr());
6983     }
6984   }
6985 
6986   // Collect instructions from the original loop that will become trivially dead
6987   // in the vectorized loop. We don't need to vectorize these instructions. For
6988   // example, original induction update instructions can become dead because we
6989   // separately emit induction "steps" when generating code for the new loop.
6990   // Similarly, we create a new latch condition when setting up the structure
6991   // of the new loop, so the old one can become dead.
6992   SmallPtrSet<Instruction *, 4> DeadInstructions;
6993   collectTriviallyDeadInstructions(DeadInstructions);
6994 
6995   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6996     VFRange SubRange = {VF, MaxVF + 1};
6997     VPlans.push_back(
6998         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
6999     VF = SubRange.End;
7000   }
7001 }
7002 
7003 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7004     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7005     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7006   // Hold a mapping from predicated instructions to their recipes, in order to
7007   // fix their AlsoPack behavior if a user is determined to replicate and use a
7008   // scalar instead of vector value.
7009   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7010 
7011   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7012   DenseMap<Instruction *, Instruction *> SinkAfterInverse;
7013 
7014   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7015   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7016   auto Plan = std::make_unique<VPlan>(VPBB);
7017 
7018   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7019   // Represent values that will have defs inside VPlan.
7020   for (Value *V : NeedDef)
7021     Plan->addVPValue(V);
7022 
7023   // Scan the body of the loop in a topological order to visit each basic block
7024   // after having visited its predecessor basic blocks.
7025   LoopBlocksDFS DFS(OrigLoop);
7026   DFS.perform(LI);
7027 
7028   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7029     // Relevant instructions from basic block BB will be grouped into VPRecipe
7030     // ingredients and fill a new VPBasicBlock.
7031     unsigned VPBBsForBB = 0;
7032     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7033     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7034     VPBB = FirstVPBBForBB;
7035     Builder.setInsertPoint(VPBB);
7036 
7037     std::vector<Instruction *> Ingredients;
7038 
7039     // Organize the ingredients to vectorize from current basic block in the
7040     // right order.
7041     for (Instruction &I : BB->instructionsWithoutDebug()) {
7042       Instruction *Instr = &I;
7043 
7044       // First filter out irrelevant instructions, to ensure no recipes are
7045       // built for them.
7046       if (isa<BranchInst>(Instr) ||
7047           DeadInstructions.find(Instr) != DeadInstructions.end())
7048         continue;
7049 
7050       // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct
7051       // member of the IG, do not construct any Recipe for it.
7052       const InterleaveGroup<Instruction> *IG =
7053           CM.getInterleavedAccessGroup(Instr);
7054       if (IG && Instr != IG->getInsertPos() &&
7055           Range.Start >= 2 && // Query is illegal for VF == 1
7056           CM.getWideningDecision(Instr, Range.Start) ==
7057               LoopVectorizationCostModel::CM_Interleave) {
7058         auto SinkCandidate = SinkAfterInverse.find(Instr);
7059         if (SinkCandidate != SinkAfterInverse.end())
7060           Ingredients.push_back(SinkCandidate->second);
7061         continue;
7062       }
7063 
7064       // Move instructions to handle first-order recurrences, step 1: avoid
7065       // handling this instruction until after we've handled the instruction it
7066       // should follow.
7067       auto SAIt = SinkAfter.find(Instr);
7068       if (SAIt != SinkAfter.end()) {
7069         LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after"
7070                           << *SAIt->second
7071                           << " to vectorize a 1st order recurrence.\n");
7072         SinkAfterInverse[SAIt->second] = Instr;
7073         continue;
7074       }
7075 
7076       Ingredients.push_back(Instr);
7077 
7078       // Move instructions to handle first-order recurrences, step 2: push the
7079       // instruction to be sunk at its insertion point.
7080       auto SAInvIt = SinkAfterInverse.find(Instr);
7081       if (SAInvIt != SinkAfterInverse.end())
7082         Ingredients.push_back(SAInvIt->second);
7083     }
7084 
7085     // Introduce each ingredient into VPlan.
7086     for (Instruction *Instr : Ingredients) {
7087       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7088         continue;
7089 
7090       // Otherwise, if all widening options failed, Instruction is to be
7091       // replicated. This may create a successor for VPBB.
7092       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7093           Instr, Range, VPBB, PredInst2Recipe, Plan);
7094       if (NextVPBB != VPBB) {
7095         VPBB = NextVPBB;
7096         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7097                                     : "");
7098       }
7099     }
7100   }
7101 
7102   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7103   // may also be empty, such as the last one VPBB, reflecting original
7104   // basic-blocks with no recipes.
7105   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7106   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7107   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7108   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7109   delete PreEntry;
7110 
7111   // Finally, if tail is folded by masking, introduce selects between the phi
7112   // and the live-out instruction of each reduction, at the end of the latch.
7113   if (CM.foldTailByMasking()) {
7114     Builder.setInsertPoint(VPBB);
7115     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7116     for (auto &Reduction : *Legal->getReductionVars()) {
7117       VPValue *Phi = Plan->getVPValue(Reduction.first);
7118       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7119       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7120     }
7121   }
7122 
7123   std::string PlanName;
7124   raw_string_ostream RSO(PlanName);
7125   unsigned VF = Range.Start;
7126   Plan->addVF(VF);
7127   RSO << "Initial VPlan for VF={" << VF;
7128   for (VF *= 2; VF < Range.End; VF *= 2) {
7129     Plan->addVF(VF);
7130     RSO << "," << VF;
7131   }
7132   RSO << "},UF>=1";
7133   RSO.flush();
7134   Plan->setName(PlanName);
7135 
7136   return Plan;
7137 }
7138 
7139 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7140   // Outer loop handling: They may require CFG and instruction level
7141   // transformations before even evaluating whether vectorization is profitable.
7142   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7143   // the vectorization pipeline.
7144   assert(!OrigLoop->empty());
7145   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7146 
7147   // Create new empty VPlan
7148   auto Plan = std::make_unique<VPlan>();
7149 
7150   // Build hierarchical CFG
7151   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7152   HCFGBuilder.buildHierarchicalCFG();
7153 
7154   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7155     Plan->addVF(VF);
7156 
7157   if (EnableVPlanPredication) {
7158     VPlanPredicator VPP(*Plan);
7159     VPP.predicate();
7160 
7161     // Avoid running transformation to recipes until masked code generation in
7162     // VPlan-native path is in place.
7163     return Plan;
7164   }
7165 
7166   SmallPtrSet<Instruction *, 1> DeadInstructions;
7167   VPlanHCFGTransforms::VPInstructionsToVPRecipes(
7168       Plan, Legal->getInductionVars(), DeadInstructions);
7169 
7170   return Plan;
7171 }
7172 
7173 Value* LoopVectorizationPlanner::VPCallbackILV::
7174 getOrCreateVectorValues(Value *V, unsigned Part) {
7175       return ILV.getOrCreateVectorValue(V, Part);
7176 }
7177 
7178 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7179   O << " +\n"
7180     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7181   IG->getInsertPos()->printAsOperand(O, false);
7182   if (User) {
7183     O << ", ";
7184     User->getOperand(0)->printAsOperand(O);
7185   }
7186   O << "\\l\"";
7187   for (unsigned i = 0; i < IG->getFactor(); ++i)
7188     if (Instruction *I = IG->getMember(i))
7189       O << " +\n"
7190         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7191 }
7192 
7193 void VPWidenRecipe::execute(VPTransformState &State) {
7194   for (auto &Instr : make_range(Begin, End))
7195     State.ILV->widenInstruction(Instr);
7196 }
7197 
7198 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7199   assert(!State.Instance && "Int or FP induction being replicated.");
7200   State.ILV->widenIntOrFpInduction(IV, Trunc);
7201 }
7202 
7203 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7204   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7205 }
7206 
7207 void VPBlendRecipe::execute(VPTransformState &State) {
7208   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7209   // We know that all PHIs in non-header blocks are converted into
7210   // selects, so we don't have to worry about the insertion order and we
7211   // can just use the builder.
7212   // At this point we generate the predication tree. There may be
7213   // duplications since this is a simple recursive scan, but future
7214   // optimizations will clean it up.
7215 
7216   unsigned NumIncoming = Phi->getNumIncomingValues();
7217 
7218   assert((User || NumIncoming == 1) &&
7219          "Multiple predecessors with predecessors having a full mask");
7220   // Generate a sequence of selects of the form:
7221   // SELECT(Mask3, In3,
7222   //      SELECT(Mask2, In2,
7223   //                   ( ...)))
7224   InnerLoopVectorizer::VectorParts Entry(State.UF);
7225   for (unsigned In = 0; In < NumIncoming; ++In) {
7226     for (unsigned Part = 0; Part < State.UF; ++Part) {
7227       // We might have single edge PHIs (blocks) - use an identity
7228       // 'select' for the first PHI operand.
7229       Value *In0 =
7230           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7231       if (In == 0)
7232         Entry[Part] = In0; // Initialize with the first incoming value.
7233       else {
7234         // Select between the current value and the previous incoming edge
7235         // based on the incoming mask.
7236         Value *Cond = State.get(User->getOperand(In), Part);
7237         Entry[Part] =
7238             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7239       }
7240     }
7241   }
7242   for (unsigned Part = 0; Part < State.UF; ++Part)
7243     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7244 }
7245 
7246 void VPInterleaveRecipe::execute(VPTransformState &State) {
7247   assert(!State.Instance && "Interleave group being replicated.");
7248   if (!User)
7249     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7250 
7251   // Last (and currently only) operand is a mask.
7252   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7253   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7254   for (unsigned Part = 0; Part < State.UF; ++Part)
7255     MaskValues[Part] = State.get(Mask, Part);
7256   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7257 }
7258 
7259 void VPReplicateRecipe::execute(VPTransformState &State) {
7260   if (State.Instance) { // Generate a single instance.
7261     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7262     // Insert scalar instance packing it into a vector.
7263     if (AlsoPack && State.VF > 1) {
7264       // If we're constructing lane 0, initialize to start from undef.
7265       if (State.Instance->Lane == 0) {
7266         Value *Undef =
7267             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7268         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7269       }
7270       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7271     }
7272     return;
7273   }
7274 
7275   // Generate scalar instances for all VF lanes of all UF parts, unless the
7276   // instruction is uniform inwhich case generate only the first lane for each
7277   // of the UF parts.
7278   unsigned EndLane = IsUniform ? 1 : State.VF;
7279   for (unsigned Part = 0; Part < State.UF; ++Part)
7280     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7281       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7282 }
7283 
7284 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7285   assert(State.Instance && "Branch on Mask works only on single instance.");
7286 
7287   unsigned Part = State.Instance->Part;
7288   unsigned Lane = State.Instance->Lane;
7289 
7290   Value *ConditionBit = nullptr;
7291   if (!User) // Block in mask is all-one.
7292     ConditionBit = State.Builder.getTrue();
7293   else {
7294     VPValue *BlockInMask = User->getOperand(0);
7295     ConditionBit = State.get(BlockInMask, Part);
7296     if (ConditionBit->getType()->isVectorTy())
7297       ConditionBit = State.Builder.CreateExtractElement(
7298           ConditionBit, State.Builder.getInt32(Lane));
7299   }
7300 
7301   // Replace the temporary unreachable terminator with a new conditional branch,
7302   // whose two destinations will be set later when they are created.
7303   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7304   assert(isa<UnreachableInst>(CurrentTerminator) &&
7305          "Expected to replace unreachable terminator with conditional branch.");
7306   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7307   CondBr->setSuccessor(0, nullptr);
7308   ReplaceInstWithInst(CurrentTerminator, CondBr);
7309 }
7310 
7311 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7312   assert(State.Instance && "Predicated instruction PHI works per instance.");
7313   Instruction *ScalarPredInst = cast<Instruction>(
7314       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7315   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7316   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7317   assert(PredicatingBB && "Predicated block has no single predecessor.");
7318 
7319   // By current pack/unpack logic we need to generate only a single phi node: if
7320   // a vector value for the predicated instruction exists at this point it means
7321   // the instruction has vector users only, and a phi for the vector value is
7322   // needed. In this case the recipe of the predicated instruction is marked to
7323   // also do that packing, thereby "hoisting" the insert-element sequence.
7324   // Otherwise, a phi node for the scalar value is needed.
7325   unsigned Part = State.Instance->Part;
7326   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7327     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7328     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7329     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7330     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7331     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7332     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7333   } else {
7334     Type *PredInstType = PredInst->getType();
7335     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7336     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7337     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7338     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7339   }
7340 }
7341 
7342 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7343   if (!User)
7344     return State.ILV->vectorizeMemoryInstruction(&Instr);
7345 
7346   // Last (and currently only) operand is a mask.
7347   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7348   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7349   for (unsigned Part = 0; Part < State.UF; ++Part)
7350     MaskValues[Part] = State.get(Mask, Part);
7351   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7352 }
7353 
7354 static ScalarEpilogueLowering
7355 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7356                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
7357   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7358   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7359       (F->hasOptSize() ||
7360        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
7361     SEL = CM_ScalarEpilogueNotAllowedOptSize;
7362   else if (PreferPredicateOverEpilog || Hints.getPredicate())
7363     SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7364 
7365   return SEL;
7366 }
7367 
7368 // Process the loop in the VPlan-native vectorization path. This path builds
7369 // VPlan upfront in the vectorization pipeline, which allows to apply
7370 // VPlan-to-VPlan transformations from the very beginning without modifying the
7371 // input LLVM IR.
7372 static bool processLoopInVPlanNativePath(
7373     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7374     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7375     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7376     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7377     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7378 
7379   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7380   Function *F = L->getHeader()->getParent();
7381   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7382   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7383 
7384   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7385                                 &Hints, IAI);
7386   // Use the planner for outer loop vectorization.
7387   // TODO: CM is not used at this point inside the planner. Turn CM into an
7388   // optional argument if we don't need it in the future.
7389   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM);
7390 
7391   // Get user vectorization factor.
7392   const unsigned UserVF = Hints.getWidth();
7393 
7394   // Plan how to best vectorize, return the best VF and its cost.
7395   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7396 
7397   // If we are stress testing VPlan builds, do not attempt to generate vector
7398   // code. Masked vector code generation support will follow soon.
7399   // Also, do not attempt to vectorize if no vector code will be produced.
7400   if (VPlanBuildStressTest || EnableVPlanPredication ||
7401       VectorizationFactor::Disabled() == VF)
7402     return false;
7403 
7404   LVP.setBestPlan(VF.Width, 1);
7405 
7406   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7407                          &CM);
7408   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7409                     << L->getHeader()->getParent()->getName() << "\"\n");
7410   LVP.executePlan(LB, DT);
7411 
7412   // Mark the loop as already vectorized to avoid vectorizing again.
7413   Hints.setAlreadyVectorized();
7414 
7415   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7416   return true;
7417 }
7418 
7419 bool LoopVectorizePass::processLoop(Loop *L) {
7420   assert((EnableVPlanNativePath || L->empty()) &&
7421          "VPlan-native path is not enabled. Only process inner loops.");
7422 
7423 #ifndef NDEBUG
7424   const std::string DebugLocStr = getDebugLocString(L);
7425 #endif /* NDEBUG */
7426 
7427   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7428                     << L->getHeader()->getParent()->getName() << "\" from "
7429                     << DebugLocStr << "\n");
7430 
7431   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7432 
7433   LLVM_DEBUG(
7434       dbgs() << "LV: Loop hints:"
7435              << " force="
7436              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7437                      ? "disabled"
7438                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7439                             ? "enabled"
7440                             : "?"))
7441              << " width=" << Hints.getWidth()
7442              << " unroll=" << Hints.getInterleave() << "\n");
7443 
7444   // Function containing loop
7445   Function *F = L->getHeader()->getParent();
7446 
7447   // Looking at the diagnostic output is the only way to determine if a loop
7448   // was vectorized (other than looking at the IR or machine code), so it
7449   // is important to generate an optimization remark for each loop. Most of
7450   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7451   // generated as OptimizationRemark and OptimizationRemarkMissed are
7452   // less verbose reporting vectorized loops and unvectorized loops that may
7453   // benefit from vectorization, respectively.
7454 
7455   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7456     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7457     return false;
7458   }
7459 
7460   PredicatedScalarEvolution PSE(*SE, *L);
7461 
7462   // Check if it is legal to vectorize the loop.
7463   LoopVectorizationRequirements Requirements(*ORE);
7464   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7465                                 &Requirements, &Hints, DB, AC);
7466   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7467     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7468     Hints.emitRemarkWithHints();
7469     return false;
7470   }
7471 
7472   // Check the function attributes and profiles to find out if this function
7473   // should be optimized for size.
7474   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
7475 
7476   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7477   // here. They may require CFG and instruction level transformations before
7478   // even evaluating whether vectorization is profitable. Since we cannot modify
7479   // the incoming IR, we need to build VPlan upfront in the vectorization
7480   // pipeline.
7481   if (!L->empty())
7482     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7483                                         ORE, BFI, PSI, Hints);
7484 
7485   assert(L->empty() && "Inner loop expected.");
7486   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7487   // count by optimizing for size, to minimize overheads.
7488   // Prefer constant trip counts over profile data, over upper bound estimate.
7489   unsigned ExpectedTC = 0;
7490   bool HasExpectedTC = false;
7491   if (const SCEVConstant *ConstExits =
7492       dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) {
7493     const APInt &ExitsCount = ConstExits->getAPInt();
7494     // We are interested in small values for ExpectedTC. Skip over those that
7495     // can't fit an unsigned.
7496     if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) {
7497       ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1;
7498       HasExpectedTC = true;
7499     }
7500   }
7501   // ExpectedTC may be large because it's bound by a variable. Check
7502   // profiling information to validate we should vectorize.
7503   if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
7504     auto EstimatedTC = getLoopEstimatedTripCount(L);
7505     if (EstimatedTC) {
7506       ExpectedTC = *EstimatedTC;
7507       HasExpectedTC = true;
7508     }
7509   }
7510   if (!HasExpectedTC) {
7511     ExpectedTC = SE->getSmallConstantMaxTripCount(L);
7512     HasExpectedTC = (ExpectedTC > 0);
7513   }
7514 
7515   if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
7516     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7517                       << "This loop is worth vectorizing only if no scalar "
7518                       << "iteration overheads are incurred.");
7519     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7520       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7521     else {
7522       LLVM_DEBUG(dbgs() << "\n");
7523       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7524     }
7525   }
7526 
7527   // Check the function attributes to see if implicit floats are allowed.
7528   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7529   // an integer loop and the vector instructions selected are purely integer
7530   // vector instructions?
7531   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7532     reportVectorizationFailure(
7533         "Can't vectorize when the NoImplicitFloat attribute is used",
7534         "loop not vectorized due to NoImplicitFloat attribute",
7535         "NoImplicitFloat", ORE, L);
7536     Hints.emitRemarkWithHints();
7537     return false;
7538   }
7539 
7540   // Check if the target supports potentially unsafe FP vectorization.
7541   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7542   // for the target we're vectorizing for, to make sure none of the
7543   // additional fp-math flags can help.
7544   if (Hints.isPotentiallyUnsafe() &&
7545       TTI->isFPVectorizationPotentiallyUnsafe()) {
7546     reportVectorizationFailure(
7547         "Potentially unsafe FP op prevents vectorization",
7548         "loop not vectorized due to unsafe FP support.",
7549         "UnsafeFP", ORE, L);
7550     Hints.emitRemarkWithHints();
7551     return false;
7552   }
7553 
7554   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7555   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7556 
7557   // If an override option has been passed in for interleaved accesses, use it.
7558   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7559     UseInterleaved = EnableInterleavedMemAccesses;
7560 
7561   // Analyze interleaved memory accesses.
7562   if (UseInterleaved) {
7563     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7564   }
7565 
7566   // Use the cost model.
7567   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7568                                 F, &Hints, IAI);
7569   CM.collectValuesToIgnore();
7570 
7571   // Use the planner for vectorization.
7572   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
7573 
7574   // Get user vectorization factor.
7575   unsigned UserVF = Hints.getWidth();
7576 
7577   // Plan how to best vectorize, return the best VF and its cost.
7578   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7579 
7580   VectorizationFactor VF = VectorizationFactor::Disabled();
7581   unsigned IC = 1;
7582   unsigned UserIC = Hints.getInterleave();
7583 
7584   if (MaybeVF) {
7585     VF = *MaybeVF;
7586     // Select the interleave count.
7587     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7588   }
7589 
7590   // Identify the diagnostic messages that should be produced.
7591   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7592   bool VectorizeLoop = true, InterleaveLoop = true;
7593   if (Requirements.doesNotMeet(F, L, Hints)) {
7594     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7595                          "requirements.\n");
7596     Hints.emitRemarkWithHints();
7597     return false;
7598   }
7599 
7600   if (VF.Width == 1) {
7601     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7602     VecDiagMsg = std::make_pair(
7603         "VectorizationNotBeneficial",
7604         "the cost-model indicates that vectorization is not beneficial");
7605     VectorizeLoop = false;
7606   }
7607 
7608   if (!MaybeVF && UserIC > 1) {
7609     // Tell the user interleaving was avoided up-front, despite being explicitly
7610     // requested.
7611     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7612                          "interleaving should be avoided up front\n");
7613     IntDiagMsg = std::make_pair(
7614         "InterleavingAvoided",
7615         "Ignoring UserIC, because interleaving was avoided up front");
7616     InterleaveLoop = false;
7617   } else if (IC == 1 && UserIC <= 1) {
7618     // Tell the user interleaving is not beneficial.
7619     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7620     IntDiagMsg = std::make_pair(
7621         "InterleavingNotBeneficial",
7622         "the cost-model indicates that interleaving is not beneficial");
7623     InterleaveLoop = false;
7624     if (UserIC == 1) {
7625       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7626       IntDiagMsg.second +=
7627           " and is explicitly disabled or interleave count is set to 1";
7628     }
7629   } else if (IC > 1 && UserIC == 1) {
7630     // Tell the user interleaving is beneficial, but it explicitly disabled.
7631     LLVM_DEBUG(
7632         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7633     IntDiagMsg = std::make_pair(
7634         "InterleavingBeneficialButDisabled",
7635         "the cost-model indicates that interleaving is beneficial "
7636         "but is explicitly disabled or interleave count is set to 1");
7637     InterleaveLoop = false;
7638   }
7639 
7640   // Override IC if user provided an interleave count.
7641   IC = UserIC > 0 ? UserIC : IC;
7642 
7643   // Emit diagnostic messages, if any.
7644   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7645   if (!VectorizeLoop && !InterleaveLoop) {
7646     // Do not vectorize or interleaving the loop.
7647     ORE->emit([&]() {
7648       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7649                                       L->getStartLoc(), L->getHeader())
7650              << VecDiagMsg.second;
7651     });
7652     ORE->emit([&]() {
7653       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7654                                       L->getStartLoc(), L->getHeader())
7655              << IntDiagMsg.second;
7656     });
7657     return false;
7658   } else if (!VectorizeLoop && InterleaveLoop) {
7659     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7660     ORE->emit([&]() {
7661       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7662                                         L->getStartLoc(), L->getHeader())
7663              << VecDiagMsg.second;
7664     });
7665   } else if (VectorizeLoop && !InterleaveLoop) {
7666     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7667                       << ") in " << DebugLocStr << '\n');
7668     ORE->emit([&]() {
7669       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7670                                         L->getStartLoc(), L->getHeader())
7671              << IntDiagMsg.second;
7672     });
7673   } else if (VectorizeLoop && InterleaveLoop) {
7674     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7675                       << ") in " << DebugLocStr << '\n');
7676     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7677   }
7678 
7679   LVP.setBestPlan(VF.Width, IC);
7680 
7681   using namespace ore;
7682   bool DisableRuntimeUnroll = false;
7683   MDNode *OrigLoopID = L->getLoopID();
7684 
7685   if (!VectorizeLoop) {
7686     assert(IC > 1 && "interleave count should not be 1 or 0");
7687     // If we decided that it is not legal to vectorize the loop, then
7688     // interleave it.
7689     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7690                                &CM);
7691     LVP.executePlan(Unroller, DT);
7692 
7693     ORE->emit([&]() {
7694       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7695                                 L->getHeader())
7696              << "interleaved loop (interleaved count: "
7697              << NV("InterleaveCount", IC) << ")";
7698     });
7699   } else {
7700     // If we decided that it is *legal* to vectorize the loop, then do it.
7701     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7702                            &LVL, &CM);
7703     LVP.executePlan(LB, DT);
7704     ++LoopsVectorized;
7705 
7706     // Add metadata to disable runtime unrolling a scalar loop when there are
7707     // no runtime checks about strides and memory. A scalar loop that is
7708     // rarely used is not worth unrolling.
7709     if (!LB.areSafetyChecksAdded())
7710       DisableRuntimeUnroll = true;
7711 
7712     // Report the vectorization decision.
7713     ORE->emit([&]() {
7714       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7715                                 L->getHeader())
7716              << "vectorized loop (vectorization width: "
7717              << NV("VectorizationFactor", VF.Width)
7718              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7719     });
7720   }
7721 
7722   Optional<MDNode *> RemainderLoopID =
7723       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7724                                       LLVMLoopVectorizeFollowupEpilogue});
7725   if (RemainderLoopID.hasValue()) {
7726     L->setLoopID(RemainderLoopID.getValue());
7727   } else {
7728     if (DisableRuntimeUnroll)
7729       AddRuntimeUnrollDisableMetaData(L);
7730 
7731     // Mark the loop as already vectorized to avoid vectorizing again.
7732     Hints.setAlreadyVectorized();
7733   }
7734 
7735   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7736   return true;
7737 }
7738 
7739 bool LoopVectorizePass::runImpl(
7740     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7741     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7742     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7743     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7744     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7745   SE = &SE_;
7746   LI = &LI_;
7747   TTI = &TTI_;
7748   DT = &DT_;
7749   BFI = &BFI_;
7750   TLI = TLI_;
7751   AA = &AA_;
7752   AC = &AC_;
7753   GetLAA = &GetLAA_;
7754   DB = &DB_;
7755   ORE = &ORE_;
7756   PSI = PSI_;
7757 
7758   // Don't attempt if
7759   // 1. the target claims to have no vector registers, and
7760   // 2. interleaving won't help ILP.
7761   //
7762   // The second condition is necessary because, even if the target has no
7763   // vector registers, loop vectorization may still enable scalar
7764   // interleaving.
7765   if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
7766     return false;
7767 
7768   bool Changed = false;
7769 
7770   // The vectorizer requires loops to be in simplified form.
7771   // Since simplification may add new inner loops, it has to run before the
7772   // legality and profitability checks. This means running the loop vectorizer
7773   // will simplify all loops, regardless of whether anything end up being
7774   // vectorized.
7775   for (auto &L : *LI)
7776     Changed |=
7777         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7778 
7779   // Build up a worklist of inner-loops to vectorize. This is necessary as
7780   // the act of vectorizing or partially unrolling a loop creates new loops
7781   // and can invalidate iterators across the loops.
7782   SmallVector<Loop *, 8> Worklist;
7783 
7784   for (Loop *L : *LI)
7785     collectSupportedLoops(*L, LI, ORE, Worklist);
7786 
7787   LoopsAnalyzed += Worklist.size();
7788 
7789   // Now walk the identified inner loops.
7790   while (!Worklist.empty()) {
7791     Loop *L = Worklist.pop_back_val();
7792 
7793     // For the inner loops we actually process, form LCSSA to simplify the
7794     // transform.
7795     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7796 
7797     Changed |= processLoop(L);
7798   }
7799 
7800   // Process each loop nest in the function.
7801   return Changed;
7802 }
7803 
7804 PreservedAnalyses LoopVectorizePass::run(Function &F,
7805                                          FunctionAnalysisManager &AM) {
7806     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7807     auto &LI = AM.getResult<LoopAnalysis>(F);
7808     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7809     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7810     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7811     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7812     auto &AA = AM.getResult<AAManager>(F);
7813     auto &AC = AM.getResult<AssumptionAnalysis>(F);
7814     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7815     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7816     MemorySSA *MSSA = EnableMSSALoopDependency
7817                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7818                           : nullptr;
7819 
7820     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7821     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7822         [&](Loop &L) -> const LoopAccessInfo & {
7823       LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7824       return LAM.getResult<LoopAccessAnalysis>(L, AR);
7825     };
7826     const ModuleAnalysisManager &MAM =
7827         AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7828     ProfileSummaryInfo *PSI =
7829         MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7830     bool Changed =
7831         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7832     if (!Changed)
7833       return PreservedAnalyses::all();
7834     PreservedAnalyses PA;
7835 
7836     // We currently do not preserve loopinfo/dominator analyses with outer loop
7837     // vectorization. Until this is addressed, mark these analyses as preserved
7838     // only for non-VPlan-native path.
7839     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7840     if (!EnableVPlanNativePath) {
7841       PA.preserve<LoopAnalysis>();
7842       PA.preserve<DominatorTreeAnalysis>();
7843     }
7844     PA.preserve<BasicAA>();
7845     PA.preserve<GlobalsAA>();
7846     return PA;
7847 }
7848