1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
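//
// For example (purely illustrative), a scalar loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each vector iteration processes VF
// elements at once and the induction variable advances by VF:
//
//   for (i = 0; i < n; i += VF)
//     a[i:i+VF] = b[i:i+VF] + c[i:i+VF];  // wide loads, adds and stores
//
// (leftover iterations are handled by a scalar epilogue loop or by
// predication).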
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/SizeOpts.h"
141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142 #include <algorithm>
143 #include <cassert>
144 #include <cstdint>
145 #include <cstdlib>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
// Indicates that an epilogue is undesired and that predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185     cl::desc("Indicate that an epilogue is undesired, predication should be "
186              "used instead."));
187 
188 static cl::opt<bool> MaximizeBandwidth(
189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));
192 
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
202 
203 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
204     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
206              "below this number"));
207 
208 static cl::opt<unsigned> ForceTargetNumScalarRegs(
209     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
210     cl::desc("A flag that overrides the target's number of scalar registers."));
211 
212 static cl::opt<unsigned> ForceTargetNumVectorRegs(
213     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
214     cl::desc("A flag that overrides the target's number of vector registers."));
215 
216 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
217     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
218     cl::desc("A flag that overrides the target's max interleave factor for "
219              "scalar loops."));
220 
221 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
222     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
223     cl::desc("A flag that overrides the target's max interleave factor for "
224              "vectorized loops."));
225 
226 static cl::opt<unsigned> ForceTargetInstructionCost(
227     "force-target-instruction-cost", cl::init(0), cl::Hidden,
228     cl::desc("A flag that overrides the target's expected cost for "
229              "an instruction to a single constant value. Mostly "
230              "useful for getting consistent testing."));
231 
232 static cl::opt<unsigned> SmallLoopCost(
233     "small-loop-cost", cl::init(20), cl::Hidden,
234     cl::desc(
235         "The cost of a loop that is considered 'small' by the interleaver."));
236 
237 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
238     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
239     cl::desc("Enable the use of the block frequency analysis to access PGO "
240              "heuristics minimizing code growth in cold regions and being more "
241              "aggressive in hot regions."));
242 
243 // Runtime interleave loops for load/store throughput.
244 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
245     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
246     cl::desc(
247         "Enable runtime interleaving until load/store ports are saturated"));
248 
249 /// The number of stores in a loop that are allowed to need predication.
250 static cl::opt<unsigned> NumberOfStoresToPredicate(
251     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
252     cl::desc("Max number of stores to be predicated behind an if."));
253 
254 static cl::opt<bool> EnableIndVarRegisterHeur(
255     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
256     cl::desc("Count the induction variable only once when interleaving"));
257 
258 static cl::opt<bool> EnableCondStoresVectorization(
259     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
260     cl::desc("Enable if predication of stores during vectorization."));
261 
262 static cl::opt<unsigned> MaxNestedScalarReductionIC(
263     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
264     cl::desc("The maximum interleave count to use when interleaving a scalar "
265              "reduction in a nested loop."));
266 
267 cl::opt<bool> EnableVPlanNativePath(
268     "enable-vplan-native-path", cl::init(false), cl::Hidden,
269     cl::desc("Enable VPlan-native vectorization path with "
270              "support for outer loop vectorization."));
271 
272 // FIXME: Remove this switch once we have divergence analysis. Currently we
273 // assume divergent non-backedge branches when this switch is true.
274 cl::opt<bool> EnableVPlanPredication(
275     "enable-vplan-predication", cl::init(false), cl::Hidden,
276     cl::desc("Enable VPlan-native vectorization path predicator with "
277              "support for outer loop vectorization."));
278 
279 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
281 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
282 // verification of the H-CFGs built.
283 static cl::opt<bool> VPlanBuildStressTest(
284     "vplan-build-stress-test", cl::init(false), cl::Hidden,
285     cl::desc(
286         "Build VPlan for every supported loop nest in the function and bail "
287         "out right after the build (stress test the VPlan H-CFG construction "
288         "in the VPlan-native vectorization path)."));
289 
290 cl::opt<bool> llvm::EnableLoopInterleaving(
291     "interleave-loops", cl::init(true), cl::Hidden,
292     cl::desc("Enable loop interleaving in Loop vectorization passes"));
293 cl::opt<bool> llvm::EnableLoopVectorization(
294     "vectorize-loops", cl::init(true), cl::Hidden,
295     cl::desc("Run the Loop vectorization passes"));
296 
297 /// A helper function for converting Scalar types to vector types.
298 /// If the incoming type is void, we return void. If the VF is 1, we return
299 /// the scalar type.
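/// For example, ToVectorTy(i32, 4) yields <4 x i32>, while ToVectorTy(i32, 1)
/// and ToVectorTy(void, 4) return the scalar i32 and void types unchanged.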
300 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
301   if (Scalar->isVoidTy() || VF == 1)
302     return Scalar;
303   return VectorType::get(Scalar, VF);
304 }
305 
306 /// A helper function that returns the type of loaded or stored value.
307 static Type *getMemInstValueType(Value *I) {
308   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
309          "Expected Load or Store instruction");
310   if (auto *LI = dyn_cast<LoadInst>(I))
311     return LI->getType();
312   return cast<StoreInst>(I)->getValueOperand()->getType();
313 }
314 
315 /// A helper function that returns true if the given type is irregular. The
316 /// type is irregular if its allocated size doesn't equal the store size of an
317 /// element of the corresponding vector type at the given vectorization factor.
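/// For example, with typical data layouts i1 has an allocation size of one
/// byte, but the store size of <8 x i1> is a single byte rather than the eight
/// bytes occupied by eight separate i1 values, so i1 is irregular.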
318 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
319   // Determine if an array of VF elements of type Ty is "bitcast compatible"
320   // with a <VF x Ty> vector.
321   if (VF > 1) {
322     auto *VectorTy = VectorType::get(Ty, VF);
323     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
324   }
325 
326   // If the vectorization factor is one, we just check if an array of type Ty
327   // requires padding between elements.
328   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
329 }
330 
331 /// A helper function that returns the reciprocal of the block probability of
332 /// predicated blocks. If we return X, we are assuming the predicated block
333 /// will execute once for every X iterations of the loop header.
334 ///
335 /// TODO: We should use actual block probability here, if available. Currently,
336 ///       we always assume predicated blocks have a 50% chance of executing.
337 static unsigned getReciprocalPredBlockProb() { return 2; }
338 
339 /// A helper function that adds a 'fast' flag to floating-point operations.
340 static Value *addFastMathFlag(Value *V) {
341   if (isa<FPMathOperator>(V))
342     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
343   return V;
344 }
345 
346 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
347   if (isa<FPMathOperator>(V))
348     cast<Instruction>(V)->setFastMathFlags(FMF);
349   return V;
350 }
351 
352 /// A helper function that returns an integer or floating-point constant with
353 /// value C.
354 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
355   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
356                            : ConstantFP::get(Ty, C);
357 }
358 
359 /// Returns "best known" trip count for the specified loop \p L as defined by
360 /// the following procedure:
361 ///   1) Returns exact trip count if it is known.
362 ///   2) Returns expected trip count according to profile data if any.
363 ///   3) Returns upper bound estimate if it is known.
364 ///   4) Returns None if all of the above failed.
365 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
366   // Check if exact trip count is known.
367   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
368     return ExpectedTC;
369 
370   // Check if there is an expected trip count available from profile data.
371   if (LoopVectorizeWithBlockFrequency)
372     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
373       return EstimatedTC;
374 
375   // Check if upper bound estimate is known.
376   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
377     return ExpectedTC;
378 
379   return None;
380 }
381 
382 namespace llvm {
383 
384 /// InnerLoopVectorizer vectorizes loops which contain only one basic
385 /// block to a specified vectorization factor (VF).
386 /// This class performs the widening of scalars into vectors, or multiple
387 /// scalars. This class also implements the following features:
388 /// * It inserts an epilogue loop for handling loops that don't have iteration
389 ///   counts that are known to be a multiple of the vectorization factor.
390 /// * It handles the code generation for reduction variables.
391 /// * Scalarization (implementation using scalars) of un-vectorizable
392 ///   instructions.
393 /// InnerLoopVectorizer does not perform any vectorization-legality
394 /// checks, and relies on the caller to check for the different legality
395 /// aspects. The InnerLoopVectorizer relies on the
396 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
398 class InnerLoopVectorizer {
399 public:
400   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
401                       LoopInfo *LI, DominatorTree *DT,
402                       const TargetLibraryInfo *TLI,
403                       const TargetTransformInfo *TTI, AssumptionCache *AC,
404                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
405                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
406                       LoopVectorizationCostModel *CM)
407       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
408         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
409         Builder(PSE.getSE()->getContext()),
410         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
411   virtual ~InnerLoopVectorizer() = default;
412 
413   /// Create a new empty loop. Unlink the old loop and connect the new one.
414   /// Return the pre-header block of the new loop.
415   BasicBlock *createVectorizedLoopSkeleton();
416 
417   /// Widen a single instruction within the innermost loop.
418   void widenInstruction(Instruction &I);
419 
420   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
421   void fixVectorizedLoop();
422 
423   // Return true if any runtime check is added.
424   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
425 
426   /// A type for vectorized values in the new loop. Each value from the
427   /// original loop, when vectorized, is represented by UF vector values in the
428   /// new unrolled loop, where UF is the unroll factor.
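  /// For example, with UF = 2 and VF = 4, a single i32 value from the original
  /// loop is represented by two <4 x i32> values, one per unrolled part.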
429   using VectorParts = SmallVector<Value *, 2>;
430 
431   /// Vectorize a single GetElementPtrInst based on information gathered and
432   /// decisions taken during planning.
433   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
434                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
435 
436   /// Vectorize a single PHINode in a block. This method handles the induction
437   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
438   /// arbitrary length vectors.
439   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
440 
  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a scalar instance of \p Instr for the vector lane and unroll
  /// part specified by \p Instance.
445   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
446                             bool IfPredicateInstr);
447 
448   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
449   /// is provided, the integer induction variable will first be truncated to
450   /// the corresponding type.
451   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
452 
453   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
454   /// vector or scalar value on-demand if one is not yet available. When
455   /// vectorizing a loop, we visit the definition of an instruction before its
456   /// uses. When visiting the definition, we either vectorize or scalarize the
457   /// instruction, creating an entry for it in the corresponding map. (In some
458   /// cases, such as induction variables, we will create both vector and scalar
459   /// entries.) Then, as we encounter uses of the definition, we derive values
460   /// for each scalar or vector use unless such a value is already available.
461   /// For example, if we scalarize a definition and one of its uses is vector,
462   /// we build the required vector on-demand with an insertelement sequence
463   /// when visiting the use. Otherwise, if the use is scalar, we can use the
464   /// existing scalar definition.
465   ///
466   /// Return a value in the new loop corresponding to \p V from the original
467   /// loop at unroll index \p Part. If the value has already been vectorized,
468   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
469   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
470   /// a new vector value on-demand by inserting the scalar values into a vector
471   /// with an insertelement sequence. If the value has been neither vectorized
472   /// nor scalarized, it must be loop invariant, so we simply broadcast the
473   /// value into a vector.
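  /// For example, if a definition was scalarized for VF = 4 and one of its
  /// users is vectorized, the four lane values of the requested unroll part
  /// are packed into a vector with an insertelement sequence at the use.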
474   Value *getOrCreateVectorValue(Value *V, unsigned Part);
475 
476   /// Return a value in the new loop corresponding to \p V from the original
477   /// loop at unroll and vector indices \p Instance. If the value has been
478   /// vectorized but not scalarized, the necessary extractelement instruction
479   /// will be generated.
480   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
481 
482   /// Construct the vector value of a scalarized value \p V one lane at a time.
483   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
484 
485   /// Try to vectorize the interleaved access group that \p Instr belongs to,
486   /// optionally masking the vector operations if \p BlockInMask is non-null.
487   void vectorizeInterleaveGroup(Instruction *Instr,
488                                 VectorParts *BlockInMask = nullptr);
489 
490   /// Vectorize Load and Store instructions, optionally masking the vector
491   /// operations if \p BlockInMask is non-null.
492   void vectorizeMemoryInstruction(Instruction *Instr,
493                                   VectorParts *BlockInMask = nullptr);
494 
495   /// Set the debug location in the builder using the debug location in
496   /// the instruction.
497   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
498 
499   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
500   void fixNonInductionPHIs(void);
501 
502 protected:
503   friend class LoopVectorizationPlanner;
504 
505   /// A small list of PHINodes.
506   using PhiVector = SmallVector<PHINode *, 4>;
507 
508   /// A type for scalarized values in the new loop. Each value from the
509   /// original loop, when scalarized, is represented by UF x VF scalar values
510   /// in the new unrolled loop, where UF is the unroll factor and VF is the
511   /// vectorization factor.
512   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
513 
514   /// Set up the values of the IVs correctly when exiting the vector loop.
515   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
516                     Value *CountRoundDown, Value *EndValue,
517                     BasicBlock *MiddleBlock);
518 
519   /// Create a new induction variable inside L.
520   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
521                                    Value *Step, Instruction *DL);
522 
523   /// Handle all cross-iteration phis in the header.
524   void fixCrossIterationPHIs();
525 
526   /// Fix a first-order recurrence. This is the second phase of vectorizing
527   /// this phi node.
528   void fixFirstOrderRecurrence(PHINode *Phi);
529 
530   /// Fix a reduction cross-iteration phi. This is the second phase of
531   /// vectorizing this phi node.
532   void fixReduction(PHINode *Phi);
533 
  /// The loop exit block may have single-value PHI nodes with some
  /// incoming value. While vectorizing, we only handled real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
538   void fixLCSSAPHIs();
539 
540   /// Iteratively sink the scalarized operands of a predicated instruction into
541   /// the block that was created for it.
542   void sinkScalarOperands(Instruction *PredInst);
543 
544   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
545   /// represented as.
546   void truncateToMinimalBitwidths();
547 
  /// Insert the new loop into the loop hierarchy and pass manager
  /// and update the analysis passes.
550   void updateAnalysis();
551 
552   /// Create a broadcast instruction. This method generates a broadcast
553   /// instruction (shuffle) for loop invariant values and for the induction
554   /// value. If this is the induction variable then we extend it to N, N+1, ...
555   /// this is needed because each iteration in the loop corresponds to a SIMD
556   /// element.
557   virtual Value *getBroadcastInstrs(Value *V);
558 
  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
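  /// For example, with VF = 4, StartIdx = 0 and Step = 1, a zero input vector
  /// becomes <0, 1, 2, 3>.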
562   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
563                                Instruction::BinaryOps Opcode =
564                                Instruction::BinaryOpsEnd);
565 
566   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
567   /// variable on which to base the steps, \p Step is the size of the step, and
568   /// \p EntryVal is the value from the original loop that maps to the steps.
569   /// Note that \p EntryVal doesn't have to be an induction variable - it
570   /// can also be a truncate instruction.
571   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
572                         const InductionDescriptor &ID);
573 
574   /// Create a vector induction phi node based on an existing scalar one. \p
575   /// EntryVal is the value from the original loop that maps to the vector phi
576   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
577   /// truncate instruction, instead of widening the original IV, we widen a
578   /// version of the IV truncated to \p EntryVal's type.
579   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
580                                        Value *Step, Instruction *EntryVal);
581 
582   /// Returns true if an instruction \p I should be scalarized instead of
583   /// vectorized for the chosen vectorization factor.
584   bool shouldScalarizeInstruction(Instruction *I) const;
585 
586   /// Returns true if we should generate a scalar version of \p IV.
587   bool needsScalarInduction(Instruction *IV) const;
588 
589   /// If there is a cast involved in the induction variable \p ID, which should
590   /// be ignored in the vectorized loop body, this function records the
591   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
592   /// cast. We had already proved that the casted Phi is equal to the uncasted
593   /// Phi in the vectorized loop (under a runtime guard), and therefore
594   /// there is no need to vectorize the cast - the same value can be used in the
595   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
598   ///
599   /// \p EntryVal is the value from the original loop that maps to the vector
600   /// phi node and is used to distinguish what is the IV currently being
601   /// processed - original one (if \p EntryVal is a phi corresponding to the
602   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
604   /// latter case \p EntryVal is a TruncInst and we must not record anything for
605   /// that IV, but it's error-prone to expect callers of this routine to care
606   /// about that, hence this explicit parameter.
607   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
608                                              const Instruction *EntryVal,
609                                              Value *VectorLoopValue,
610                                              unsigned Part,
611                                              unsigned Lane = UINT_MAX);
612 
613   /// Generate a shuffle sequence that will reverse the vector Vec.
614   virtual Value *reverseVector(Value *Vec);
615 
616   /// Returns (and creates if needed) the original loop trip count.
617   Value *getOrCreateTripCount(Loop *NewLoop);
618 
619   /// Returns (and creates if needed) the trip count of the widened loop.
620   Value *getOrCreateVectorTripCount(Loop *NewLoop);
621 
622   /// Returns a bitcasted value to the requested vector type.
623   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
624   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
625                                 const DataLayout &DL);
626 
627   /// Emit a bypass check to see if the vector trip count is zero, including if
628   /// it overflows.
629   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
630 
631   /// Emit a bypass check to see if all of the SCEV assumptions we've
632   /// had to make are correct.
633   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
634 
635   /// Emit bypass checks to check any memory assumptions we may have made.
636   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
637 
638   /// Compute the transformed value of Index at offset StartValue using step
639   /// StepValue.
640   /// For integer induction, returns StartValue + Index * StepValue.
641   /// For pointer induction, returns StartValue[Index * StepValue].
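  /// For example, an integer induction with StartValue 7 and StepValue 3 maps
  /// Index 4 to 7 + 4 * 3 = 19.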
642   /// FIXME: The newly created binary instructions should contain nsw/nuw
643   /// flags, which can be found from the original scalar operations.
644   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
645                               const DataLayout &DL,
646                               const InductionDescriptor &ID) const;
647 
648   /// Add additional metadata to \p To that was not present on \p Orig.
649   ///
650   /// Currently this is used to add the noalias annotations based on the
651   /// inserted memchecks.  Use this for instructions that are *cloned* into the
652   /// vector loop.
653   void addNewMetadata(Instruction *To, const Instruction *Orig);
654 
655   /// Add metadata from one instruction to another.
656   ///
657   /// This includes both the original MDs from \p From and additional ones (\see
658   /// addNewMetadata).  Use this for *newly created* instructions in the vector
659   /// loop.
660   void addMetadata(Instruction *To, Instruction *From);
661 
662   /// Similar to the previous function but it adds the metadata to a
663   /// vector of instructions.
664   void addMetadata(ArrayRef<Value *> To, Instruction *From);
665 
666   /// The original loop.
667   Loop *OrigLoop;
668 
669   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
670   /// dynamic knowledge to simplify SCEV expressions and converts them to a
671   /// more usable form.
672   PredicatedScalarEvolution &PSE;
673 
674   /// Loop Info.
675   LoopInfo *LI;
676 
677   /// Dominator Tree.
678   DominatorTree *DT;
679 
680   /// Alias Analysis.
681   AliasAnalysis *AA;
682 
683   /// Target Library Info.
684   const TargetLibraryInfo *TLI;
685 
686   /// Target Transform Info.
687   const TargetTransformInfo *TTI;
688 
689   /// Assumption Cache.
690   AssumptionCache *AC;
691 
692   /// Interface to emit optimization remarks.
693   OptimizationRemarkEmitter *ORE;
694 
695   /// LoopVersioning.  It's only set up (non-null) if memchecks were
696   /// used.
697   ///
698   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
700   std::unique_ptr<LoopVersioning> LVer;
701 
702   /// The vectorization SIMD factor to use. Each vector will have this many
703   /// vector elements.
704   unsigned VF;
705 
706   /// The vectorization unroll factor to use. Each scalar is vectorized to this
707   /// many different vector instructions.
708   unsigned UF;
709 
710   /// The builder that we use
711   IRBuilder<> Builder;
712 
713   // --- Vectorization state ---
714 
715   /// The vector-loop preheader.
716   BasicBlock *LoopVectorPreHeader;
717 
718   /// The scalar-loop preheader.
719   BasicBlock *LoopScalarPreHeader;
720 
721   /// Middle Block between the vector and the scalar.
722   BasicBlock *LoopMiddleBlock;
723 
724   /// The ExitBlock of the scalar loop.
725   BasicBlock *LoopExitBlock;
726 
727   /// The vector loop body.
728   BasicBlock *LoopVectorBody;
729 
730   /// The scalar loop body.
731   BasicBlock *LoopScalarBody;
732 
733   /// A list of all bypass blocks. The first block is the entry of the loop.
734   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
735 
736   /// The new Induction variable which was added to the new block.
737   PHINode *Induction = nullptr;
738 
739   /// The induction variable of the old basic block.
740   PHINode *OldInduction = nullptr;
741 
742   /// Maps values from the original loop to their corresponding values in the
743   /// vectorized loop. A key value can map to either vector values, scalar
744   /// values or both kinds of values, depending on whether the key was
745   /// vectorized and scalarized.
746   VectorizerValueMap VectorLoopValueMap;
747 
748   /// Store instructions that were predicated.
749   SmallVector<Instruction *, 4> PredicatedInstructions;
750 
751   /// Trip count of the original loop.
752   Value *TripCount = nullptr;
753 
754   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
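  /// For example, TripCount = 10 with VF = 4 and UF = 1 gives a vector trip
  /// count of 8.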
755   Value *VectorTripCount = nullptr;
756 
757   /// The legality analysis.
758   LoopVectorizationLegality *Legal;
759 
  /// The profitability analysis.
761   LoopVectorizationCostModel *Cost;
762 
763   // Record whether runtime checks are added.
764   bool AddedSafetyChecks = false;
765 
766   // Holds the end values for each induction variable. We save the end values
767   // so we can later fix-up the external users of the induction variables.
768   DenseMap<PHINode *, Value *> IVEndValues;
769 
770   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
771   // fixed up at the end of vector code generation.
772   SmallVector<PHINode *, 8> OrigPHIsToFix;
773 };
774 
775 class InnerLoopUnroller : public InnerLoopVectorizer {
776 public:
777   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
778                     LoopInfo *LI, DominatorTree *DT,
779                     const TargetLibraryInfo *TLI,
780                     const TargetTransformInfo *TTI, AssumptionCache *AC,
781                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
782                     LoopVectorizationLegality *LVL,
783                     LoopVectorizationCostModel *CM)
784       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
785                             UnrollFactor, LVL, CM) {}
786 
787 private:
788   Value *getBroadcastInstrs(Value *V) override;
789   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
790                        Instruction::BinaryOps Opcode =
791                        Instruction::BinaryOpsEnd) override;
792   Value *reverseVector(Value *Vec) override;
793 };
794 
795 } // end namespace llvm
796 
/// Look for a meaningful debug location on the instruction or its
798 /// operands.
799 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
800   if (!I)
801     return I;
802 
803   DebugLoc Empty;
804   if (I->getDebugLoc() != Empty)
805     return I;
806 
  for (Value *Op : I->operands())
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
812 
813   return I;
814 }
815 
816 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
817   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
818     const DILocation *DIL = Inst->getDebugLoc();
819     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
820         !isa<DbgInfoIntrinsic>(Inst)) {
821       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
822       if (NewDIL)
823         B.SetCurrentDebugLocation(NewDIL.getValue());
824       else
825         LLVM_DEBUG(dbgs()
826                    << "Failed to create new discriminator: "
827                    << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
831   } else
832     B.SetCurrentDebugLocation(DebugLoc());
833 }
834 
835 /// Write a record \p DebugMsg about vectorization failure to the debug
836 /// output stream. If \p I is passed, it is an instruction that prevents
837 /// vectorization.
838 #ifndef NDEBUG
839 static void debugVectorizationFailure(const StringRef DebugMsg,
840     Instruction *I) {
841   dbgs() << "LV: Not vectorizing: " << DebugMsg;
842   if (I != nullptr)
843     dbgs() << " " << *I;
844   else
845     dbgs() << '.';
846   dbgs() << '\n';
847 }
848 #endif
849 
850 /// Create an analysis remark that explains why vectorization failed
851 ///
852 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
853 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
854 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
855 /// the location of the remark.  \return the remark object that can be
856 /// streamed to.
857 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
858     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
859   Value *CodeRegion = TheLoop->getHeader();
860   DebugLoc DL = TheLoop->getStartLoc();
861 
862   if (I) {
863     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's location.
866     if (I->getDebugLoc())
867       DL = I->getDebugLoc();
868   }
869 
870   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
871   R << "loop not vectorized: ";
872   return R;
873 }
874 
875 namespace llvm {
876 
877 void reportVectorizationFailure(const StringRef DebugMsg,
878     const StringRef OREMsg, const StringRef ORETag,
879     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
880   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
881   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
882   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
883                 ORETag, TheLoop, I) << OREMsg);
884 }
885 
886 } // end namespace llvm
887 
888 #ifndef NDEBUG
889 /// \return string containing a file name and a line # for the given loop.
890 static std::string getDebugLocString(const Loop *L) {
891   std::string Result;
892   if (L) {
893     raw_string_ostream OS(Result);
894     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
895       LoopDbgLoc.print(OS);
896     else
897       // Just print the module name.
898       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
899     OS.flush();
900   }
901   return Result;
902 }
903 #endif
904 
905 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
906                                          const Instruction *Orig) {
907   // If the loop was versioned with memchecks, add the corresponding no-alias
908   // metadata.
909   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
910     LVer->annotateInstWithNoAlias(To, Orig);
911 }
912 
913 void InnerLoopVectorizer::addMetadata(Instruction *To,
914                                       Instruction *From) {
915   propagateMetadata(To, From);
916   addNewMetadata(To, From);
917 }
918 
919 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
920                                       Instruction *From) {
921   for (Value *V : To) {
922     if (Instruction *I = dyn_cast<Instruction>(V))
923       addMetadata(I, From);
924   }
925 }
926 
927 namespace llvm {
928 
// Hints to the loop vectorization cost model about how the scalar epilogue
// loop should be lowered.
931 enum ScalarEpilogueLowering {
932 
933   // The default: allowing scalar epilogues.
934   CM_ScalarEpilogueAllowed,
935 
936   // Vectorization with OptForSize: don't allow epilogues.
937   CM_ScalarEpilogueNotAllowedOptSize,
938 
  // A special case of vectorization with OptForSize: loops with a very small
940   // trip count are considered for vectorization under OptForSize, thereby
941   // making sure the cost of their loop body is dominant, free of runtime
942   // guards and scalar iteration overheads.
943   CM_ScalarEpilogueNotAllowedLowTripLoop,
944 
945   // Loop hint predicate indicating an epilogue is undesired.
946   CM_ScalarEpilogueNotNeededUsePredicate
947 };
948 
949 /// LoopVectorizationCostModel - estimates the expected speedups due to
950 /// vectorization.
951 /// In many cases vectorization is not profitable. This can happen because of
952 /// a number of reasons. In this class we mainly attempt to predict the
953 /// expected speedup/slowdowns due to the supported instruction set. We use the
954 /// TargetTransformInfo to query the different backends for the cost of
955 /// different operations.
956 class LoopVectorizationCostModel {
957 public:
958   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
959                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
960                              LoopVectorizationLegality *Legal,
961                              const TargetTransformInfo &TTI,
962                              const TargetLibraryInfo *TLI, DemandedBits *DB,
963                              AssumptionCache *AC,
964                              OptimizationRemarkEmitter *ORE, const Function *F,
965                              const LoopVectorizeHints *Hints,
966                              InterleavedAccessInfo &IAI)
967       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
968         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
969         Hints(Hints), InterleaveInfo(IAI) {}
970 
971   /// \return An upper bound for the vectorization factor, or None if
972   /// vectorization and interleaving should be avoided up front.
973   Optional<unsigned> computeMaxVF();
974 
975   /// \return True if runtime checks are required for vectorization, and false
976   /// otherwise.
977   bool runtimeChecksRequired();
978 
979   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
983   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
984 
985   /// Setup cost-based decisions for user vectorization factor.
986   void selectUserVectorizationFactor(unsigned UserVF) {
987     collectUniformsAndScalars(UserVF);
988     collectInstsToScalarize(UserVF);
989   }
990 
991   /// \return The size (in bits) of the smallest and widest types in the code
992   /// that needs to be vectorized. We ignore values that remain scalar such as
993   /// 64 bit loop indices.
994   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
995 
996   /// \return The desired interleave count.
997   /// If interleave count has been specified by metadata it will be returned.
998   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
999   /// are the selected vectorization factor and the cost of the selected VF.
1000   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1001 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1009   void setCostBasedWideningDecision(unsigned VF);
1010 
1011   /// A struct that represents some properties of the register usage
1012   /// of a loop.
1013   struct RegisterUsage {
1014     /// Holds the number of loop invariant values that are used in the loop.
1015     /// The key is ClassID of target-provided register class.
1016     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1017     /// Holds the maximum number of concurrent live intervals in the loop.
1018     /// The key is ClassID of target-provided register class.
1019     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1020   };
1021 
1022   /// \return Returns information about the register usages of the loop for the
1023   /// given vectorization factors.
1024   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1025 
1026   /// Collect values we want to ignore in the cost model.
1027   void collectValuesToIgnore();
1028 
1029   /// \returns The smallest bitwidth each instruction can be represented with.
1030   /// The vector equivalents of these instructions should be truncated to this
1031   /// type.
1032   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1033     return MinBWs;
1034   }
1035 
1036   /// \returns True if it is more profitable to scalarize instruction \p I for
1037   /// vectorization factor \p VF.
1038   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1039     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1040 
1041     // Cost model is not run in the VPlan-native path - return conservative
1042     // result until this changes.
1043     if (EnableVPlanNativePath)
1044       return false;
1045 
1046     auto Scalars = InstsToScalarize.find(VF);
1047     assert(Scalars != InstsToScalarize.end() &&
1048            "VF not yet analyzed for scalarization profitability");
1049     return Scalars->second.find(I) != Scalars->second.end();
1050   }
1051 
1052   /// Returns true if \p I is known to be uniform after vectorization.
1053   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1054     if (VF == 1)
1055       return true;
1056 
1057     // Cost model is not run in the VPlan-native path - return conservative
1058     // result until this changes.
1059     if (EnableVPlanNativePath)
1060       return false;
1061 
1062     auto UniformsPerVF = Uniforms.find(VF);
1063     assert(UniformsPerVF != Uniforms.end() &&
1064            "VF not yet analyzed for uniformity");
1065     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1066   }
1067 
1068   /// Returns true if \p I is known to be scalar after vectorization.
1069   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1070     if (VF == 1)
1071       return true;
1072 
1073     // Cost model is not run in the VPlan-native path - return conservative
1074     // result until this changes.
1075     if (EnableVPlanNativePath)
1076       return false;
1077 
1078     auto ScalarsPerVF = Scalars.find(VF);
1079     assert(ScalarsPerVF != Scalars.end() &&
1080            "Scalar values are not calculated for VF");
1081     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1082   }
1083 
1084   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1085   /// for vectorization factor \p VF.
1086   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1087     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1088            !isProfitableToScalarize(I, VF) &&
1089            !isScalarAfterVectorization(I, VF);
1090   }
1091 
1092   /// Decision that was taken during cost calculation for memory instruction.
1093   enum InstWidening {
1094     CM_Unknown,
1095     CM_Widen,         // For consecutive accesses with stride +1.
1096     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1097     CM_Interleave,
1098     CM_GatherScatter,
1099     CM_Scalarize
1100   };
1101 
1102   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1103   /// instruction \p I and vector width \p VF.
1104   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1105                            unsigned Cost) {
1106     assert(VF >= 2 && "Expected VF >=2");
1107     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1108   }
1109 
1110   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1111   /// interleaving group \p Grp and vector width \p VF.
1112   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1113                            InstWidening W, unsigned Cost) {
1114     assert(VF >= 2 && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1117     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1118       if (auto *I = Grp->getMember(i)) {
1119         if (Grp->getInsertPos() == I)
1120           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1121         else
1122           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1123       }
1124     }
1125   }
1126 
1127   /// Return the cost model decision for the given instruction \p I and vector
1128   /// width \p VF. Return CM_Unknown if this instruction did not pass
1129   /// through the cost modeling.
1130   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1131     assert(VF >= 2 && "Expected VF >=2");
1132 
1133     // Cost model is not run in the VPlan-native path - return conservative
1134     // result until this changes.
1135     if (EnableVPlanNativePath)
1136       return CM_GatherScatter;
1137 
1138     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1139     auto Itr = WideningDecisions.find(InstOnVF);
1140     if (Itr == WideningDecisions.end())
1141       return CM_Unknown;
1142     return Itr->second.first;
1143   }
1144 
1145   /// Return the vectorization cost for the given instruction \p I and vector
1146   /// width \p VF.
1147   unsigned getWideningCost(Instruction *I, unsigned VF) {
1148     assert(VF >= 2 && "Expected VF >=2");
1149     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1150     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1151            "The cost is not calculated");
1152     return WideningDecisions[InstOnVF].second;
1153   }
1154 
1155   /// Return True if instruction \p I is an optimizable truncate whose operand
1156   /// is an induction variable. Such a truncate will be removed by adding a new
1157   /// induction variable with the destination type.
1158   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1159     // If the instruction is not a truncate, return false.
1160     auto *Trunc = dyn_cast<TruncInst>(I);
1161     if (!Trunc)
1162       return false;
1163 
1164     // Get the source and destination types of the truncate.
1165     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1166     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1167 
1168     // If the truncate is free for the given types, return false. Replacing a
1169     // free truncate with an induction variable would add an induction variable
1170     // update instruction to each iteration of the loop. We exclude from this
1171     // check the primary induction variable since it will need an update
1172     // instruction regardless.
1173     Value *Op = Trunc->getOperand(0);
1174     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1175       return false;
1176 
1177     // If the truncated value is not an induction variable, return false.
1178     return Legal->isInductionPhi(Op);
1179   }
1180 
1181   /// Collects the instructions to scalarize for each predicated instruction in
1182   /// the loop.
1183   void collectInstsToScalarize(unsigned VF);
1184 
1185   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decisions for Load/Store instructions that may
  /// be vectorized as interleaved, gather-scatter or scalarized accesses.
1188   void collectUniformsAndScalars(unsigned VF) {
1189     // Do the analysis once.
1190     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1191       return;
1192     setCostBasedWideningDecision(VF);
1193     collectLoopUniforms(VF);
1194     collectLoopScalars(VF);
1195   }
1196 
1197   /// Returns true if the target machine supports masked store operation
1198   /// for the given \p DataType and kind of access to \p Ptr.
1199   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1200     return Legal->isConsecutivePtr(Ptr) &&
1201            TTI.isLegalMaskedStore(DataType, Alignment);
1202   }
1203 
1204   /// Returns true if the target machine supports masked load operation
1205   /// for the given \p DataType and kind of access to \p Ptr.
1206   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1207     return Legal->isConsecutivePtr(Ptr) &&
1208            TTI.isLegalMaskedLoad(DataType, Alignment);
1209   }
1210 
1211   /// Returns true if the target machine supports masked scatter operation
1212   /// for the given \p DataType.
1213   bool isLegalMaskedScatter(Type *DataType) {
1214     return TTI.isLegalMaskedScatter(DataType);
1215   }
1216 
1217   /// Returns true if the target machine supports masked gather operation
1218   /// for the given \p DataType.
1219   bool isLegalMaskedGather(Type *DataType) {
1220     return TTI.isLegalMaskedGather(DataType);
1221   }
1222 
1223   /// Returns true if the target machine can represent \p V as a masked gather
1224   /// or scatter operation.
1225   bool isLegalGatherOrScatter(Value *V) {
1226     bool LI = isa<LoadInst>(V);
1227     bool SI = isa<StoreInst>(V);
1228     if (!LI && !SI)
1229       return false;
1230     auto *Ty = getMemInstValueType(V);
1231     return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
1232   }
1233 
1234   /// Returns true if \p I is an instruction that will be scalarized with
1235   /// predication. Such instructions include conditional stores and
1236   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1239   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1240 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1244   bool isPredicatedInst(Instruction *I) {
1245     if (!blockNeedsPredication(I->getParent()))
1246       return false;
1247     // Loads and stores that need some form of masked operation are predicated
1248     // instructions.
1249     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1250       return Legal->isMaskRequired(I);
1251     return isScalarWithPredication(I);
1252   }
1253 
1254   /// Returns true if \p I is a memory instruction with consecutive memory
1255   /// access that can be widened.
1256   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1257 
1258   /// Returns true if \p I is a memory instruction in an interleaved-group
1259   /// of memory accesses that can be vectorized with wide vector loads/stores
1260   /// and shuffles.
1261   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1262 
1263   /// Check if \p Instr belongs to any interleaved access group.
1264   bool isAccessInterleaved(Instruction *Instr) {
1265     return InterleaveInfo.isInterleaved(Instr);
1266   }
1267 
1268   /// Get the interleaved access group that \p Instr belongs to.
1269   const InterleaveGroup<Instruction> *
1270   getInterleavedAccessGroup(Instruction *Instr) {
1271     return InterleaveInfo.getInterleaveGroup(Instr);
1272   }
1273 
1274   /// Returns true if an interleaved group requires a scalar iteration
1275   /// to handle accesses with gaps, and there is nothing preventing us from
1276   /// creating a scalar epilogue.
1277   bool requiresScalarEpilogue() const {
1278     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1279   }
1280 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1283   bool isScalarEpilogueAllowed() const {
1284     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1285   }
1286 
  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
1288   bool foldTailByMasking() const { return FoldTailByMasking; }
1289 
1290   bool blockNeedsPredication(BasicBlock *BB) {
1291     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1292   }
1293 
1294   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1295   /// with factor VF.  Return the cost of the instruction, including
1296   /// scalarization overhead if it's needed.
1297   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1298 
1299   /// Estimate cost of a call instruction CI if it were vectorized with factor
1300   /// VF. Return the cost of the instruction, including scalarization overhead
1301   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1304   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1305 
1306 private:
1307   unsigned NumPredStores = 0;
1308 
1309   /// \return An upper bound for the vectorization factor, larger than zero.
1310   /// One is returned if vectorization should best be avoided due to cost.
1311   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1312 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1320   using VectorizationCostTy = std::pair<unsigned, bool>;
1321 
1322   /// Returns the expected execution cost. The unit of the cost does
1323   /// not matter because we use the 'cost' units to compare different
1324   /// vector widths. The cost that is returned is *not* normalized by
1325   /// the factor width.
1326   VectorizationCostTy expectedCost(unsigned VF);
1327 
1328   /// Returns the execution time cost of an instruction for a given vector
1329   /// width. Vector width of one means scalar.
1330   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1331 
1332   /// The cost-computation logic from getInstructionCost which provides
1333   /// the vector type as an output parameter.
1334   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1335 
1336   /// Calculate vectorization cost of memory instruction \p I.
1337   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1338 
  /// The cost computation for a scalarized memory instruction.
1340   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1341 
  /// The cost computation for an interleaving group of memory instructions.
1343   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1344 
  /// The cost computation for a Gather/Scatter instruction.
1346   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1347 
1348   /// The cost computation for widening instruction \p I with consecutive
1349   /// memory access.
1350   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1351 
  /// The cost calculation for a Load/Store instruction \p I with a uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
1356   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1357 
1358   /// Estimate the overhead of scalarizing an instruction. This is a
1359   /// convenience wrapper for the type-based getScalarizationOverhead API.
1360   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1361 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1364   bool isConsecutiveLoadOrStore(Instruction *I);
1365 
1366   /// Returns true if an artificially high cost for emulated masked memrefs
1367   /// should be used.
1368   bool useEmulatedMaskMemRefHack(Instruction *I);
1369 
1370   /// Map of scalar integer values to the smallest bitwidth they can be legally
1371   /// represented as. The vector equivalents of these values should be truncated
1372   /// to this type.
1373   MapVector<Instruction *, uint64_t> MinBWs;
1374 
1375   /// A type representing the costs for instructions if they were to be
1376   /// scalarized rather than vectorized. The entries are Instruction-Cost
1377   /// pairs.
1378   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1379 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1382   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1383 
1384   /// Records whether it is allowed to have the original scalar loop execute at
1385   /// least once. This may be needed as a fallback loop in case runtime
1386   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not a multiple of the VF,
1388   /// or as a peel-loop to handle gaps in interleave-groups.
1389   /// Under optsize and when the trip count is very small we don't allow any
1390   /// iterations to execute in the scalar loop.
1391   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1392 
  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
1394   bool FoldTailByMasking = false;
1395 
1396   /// A map holding scalar costs for different vectorization factors. The
1397   /// presence of a cost for an instruction in the mapping indicates that the
1398   /// instruction will be scalarized when vectorizing with the associated
1399   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1400   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1401 
1402   /// Holds the instructions known to be uniform after vectorization.
1403   /// The data is collected per VF.
1404   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1405 
1406   /// Holds the instructions known to be scalar after vectorization.
1407   /// The data is collected per VF.
1408   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1409 
1410   /// Holds the instructions (address computations) that are forced to be
1411   /// scalarized.
1412   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1413 
1414   /// Returns the expected difference in cost from scalarizing the expression
1415   /// feeding a predicated instruction \p PredInst. The instructions to
1416   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1417   /// non-negative return value implies the expression will be scalarized.
1418   /// Currently, only single-use chains are considered for scalarization.
1419   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1420                               unsigned VF);
1421 
1422   /// Collect the instructions that are uniform after vectorization. An
1423   /// instruction is uniform if we represent it with a single scalar value in
1424   /// the vectorized loop corresponding to each vector iteration. Examples of
1425   /// uniform instructions include pointer operands of consecutive or
1426   /// interleaved memory accesses. Note that although uniformity implies an
1427   /// instruction will be scalar, the reverse is not true. In general, a
1428   /// scalarized instruction will be represented by VF scalar values in the
1429   /// vectorized loop, each corresponding to an iteration of the original
1430   /// scalar loop.
1431   void collectLoopUniforms(unsigned VF);
1432 
1433   /// Collect the instructions that are scalar after vectorization. An
1434   /// instruction is scalar if it is known to be uniform or will be scalarized
1435   /// during vectorization. Non-uniform scalarized instructions will be
1436   /// represented by VF values in the vectorized loop, each corresponding to an
1437   /// iteration of the original scalar loop.
1438   void collectLoopScalars(unsigned VF);
1439 
1440   /// Keeps cost model vectorization decision and cost for instructions.
1441   /// Right now it is used for memory instructions only.
1442   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1443                                 std::pair<InstWidening, unsigned>>;
1444 
1445   DecisionList WideningDecisions;
1446 
1447   /// Returns true if \p V is expected to be vectorized and it needs to be
1448   /// extracted.
1449   bool needsExtract(Value *V, unsigned VF) const {
1450     Instruction *I = dyn_cast<Instruction>(V);
1451     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1452       return false;
1453 
1454     // Assume we can vectorize V (and hence we need extraction) if the
1455     // scalars are not computed yet. This can happen, because it is called
1456     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1457     // the scalars are collected. That should be a safe assumption in most
1458     // cases, because we check if the operands have vectorizable types
1459     // beforehand in LoopVectorizationLegality.
1460     return Scalars.find(VF) == Scalars.end() ||
1461            !isScalarAfterVectorization(I, VF);
  }
1463 
1464   /// Returns a range containing only operands needing to be extracted.
1465   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1466                                                    unsigned VF) {
1467     return SmallVector<Value *, 4>(make_filter_range(
1468         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1469   }
1470 
1471 public:
1472   /// The loop that we evaluate.
1473   Loop *TheLoop;
1474 
1475   /// Predicated scalar evolution analysis.
1476   PredicatedScalarEvolution &PSE;
1477 
1478   /// Loop Info analysis.
1479   LoopInfo *LI;
1480 
1481   /// Vectorization legality.
1482   LoopVectorizationLegality *Legal;
1483 
1484   /// Vector target information.
1485   const TargetTransformInfo &TTI;
1486 
1487   /// Target Library Info.
1488   const TargetLibraryInfo *TLI;
1489 
1490   /// Demanded bits analysis.
1491   DemandedBits *DB;
1492 
1493   /// Assumption cache.
1494   AssumptionCache *AC;
1495 
1496   /// Interface to emit optimization remarks.
1497   OptimizationRemarkEmitter *ORE;
1498 
1499   const Function *TheFunction;
1500 
1501   /// Loop Vectorize Hint.
1502   const LoopVectorizeHints *Hints;
1503 
1504   /// The interleave access information contains groups of interleaved accesses
1505   /// with the same stride and close to each other.
1506   InterleavedAccessInfo &InterleaveInfo;
1507 
1508   /// Values to ignore in the cost model.
1509   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1510 
1511   /// Values to ignore in the cost model when VF > 1.
1512   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1513 };
1514 
1515 } // end namespace llvm
1516 
1517 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1518 // vectorization. The loop needs to be annotated with #pragma omp simd
1519 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1520 // vector length information is not provided, vectorization is not considered
1521 // explicit. Interleave hints are not allowed either. These limitations will be
1522 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1524 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1525 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1526 // provides *explicit vectorization hints* (LV can bypass legal checks and
1527 // assume that vectorization is legal). However, both hints are implemented
1528 // using the same metadata (llvm.loop.vectorize, processed by
1529 // LoopVectorizeHints). This will be fixed in the future when the native IR
1530 // representation for pragma 'omp simd' is introduced.
1531 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1532                                    OptimizationRemarkEmitter *ORE) {
1533   assert(!OuterLp->empty() && "This is not an outer loop");
1534   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1535 
1536   // Only outer loops with an explicit vectorization hint are supported.
1537   // Unannotated outer loops are ignored.
1538   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1539     return false;
1540 
1541   Function *Fn = OuterLp->getHeader()->getParent();
1542   if (!Hints.allowVectorization(Fn, OuterLp,
1543                                 true /*VectorizeOnlyWhenForced*/)) {
1544     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1545     return false;
1546   }
1547 
1548   if (Hints.getInterleave() > 1) {
1549     // TODO: Interleave support is future work.
1550     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1551                          "outer loops.\n");
1552     Hints.emitRemarkWithHints();
1553     return false;
1554   }
1555 
1556   return true;
1557 }
1558 
1559 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1560                                   OptimizationRemarkEmitter *ORE,
1561                                   SmallVectorImpl<Loop *> &V) {
1562   // Collect inner loops and outer loops without irreducible control flow. For
1563   // now, only collect outer loops that have explicit vectorization hints. If we
1564   // are stress testing the VPlan H-CFG construction, we collect the outermost
1565   // loop of every loop nest.
1566   if (L.empty() || VPlanBuildStressTest ||
1567       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1568     LoopBlocksRPO RPOT(&L);
1569     RPOT.perform(LI);
1570     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1571       V.push_back(&L);
1572       // TODO: Collect inner loops inside marked outer loops in case
1573       // vectorization fails for the outer loop. Do not invoke
1574       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1575       // already known to be reducible. We can use an inherited attribute for
1576       // that.
1577       return;
1578     }
1579   }
1580   for (Loop *InnerL : L)
1581     collectSupportedLoops(*InnerL, LI, ORE, V);
1582 }
1583 
1584 namespace {
1585 
1586 /// The LoopVectorize Pass.
1587 struct LoopVectorize : public FunctionPass {
1588   /// Pass identification, replacement for typeid
1589   static char ID;
1590 
1591   LoopVectorizePass Impl;
1592 
1593   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1594                          bool VectorizeOnlyWhenForced = false)
1595       : FunctionPass(ID) {
1596     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1597     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1598     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1599   }
1600 
1601   bool runOnFunction(Function &F) override {
1602     if (skipFunction(F))
1603       return false;
1604 
1605     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1606     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1607     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1608     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1609     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1610     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1611     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1612     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1613     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1614     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1615     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1616     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1617     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1618 
1619     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1620         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1621 
1622     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1623                         GetLAA, *ORE, PSI);
1624   }
1625 
1626   void getAnalysisUsage(AnalysisUsage &AU) const override {
1627     AU.addRequired<AssumptionCacheTracker>();
1628     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1629     AU.addRequired<DominatorTreeWrapperPass>();
1630     AU.addRequired<LoopInfoWrapperPass>();
1631     AU.addRequired<ScalarEvolutionWrapperPass>();
1632     AU.addRequired<TargetTransformInfoWrapperPass>();
1633     AU.addRequired<AAResultsWrapperPass>();
1634     AU.addRequired<LoopAccessLegacyAnalysis>();
1635     AU.addRequired<DemandedBitsWrapperPass>();
1636     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1637 
1638     // We currently do not preserve loopinfo/dominator analyses with outer loop
1639     // vectorization. Until this is addressed, mark these analyses as preserved
1640     // only for non-VPlan-native path.
1641     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1642     if (!EnableVPlanNativePath) {
1643       AU.addPreserved<LoopInfoWrapperPass>();
1644       AU.addPreserved<DominatorTreeWrapperPass>();
1645     }
1646 
1647     AU.addPreserved<BasicAAWrapperPass>();
1648     AU.addPreserved<GlobalsAAWrapperPass>();
1649     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1650   }
1651 };
1652 
1653 } // end anonymous namespace
1654 
1655 //===----------------------------------------------------------------------===//
1656 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1657 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1658 //===----------------------------------------------------------------------===//
1659 
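// Note (value names below are illustrative): CreateVectorSplat roughly expands
// to an insertelement of the scalar into lane 0 followed by a zero-mask
// shufflevector, e.g. for VF = 4:
//   %b.ins   = insertelement <4 x i32> undef, i32 %v, i32 0
//   %b.splat = shufflevector <4 x i32> %b.ins, <4 x i32> undef,
//                            <4 x i32> zeroinitializer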
1660 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1664   Instruction *Instr = dyn_cast<Instruction>(V);
1665   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1666                      (!Instr ||
1667                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1668   // Place the code for broadcasting invariant variables in the new preheader.
1669   IRBuilder<>::InsertPointGuard Guard(Builder);
1670   if (SafeToHoist)
1671     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1672 
1673   // Broadcast the scalar into all locations in the vector.
1674   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1675 
1676   return Shuf;
1677 }
1678 
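// A rough sketch (value names are illustrative) of the code created below for
// an integer IV with VF = 4 and UF = 2:
//   vector.body:
//     %vec.ind      = phi <4 x i32> [ %stepped.start, %vector.ph ],
//                                   [ %vec.ind.next, %vector.body ]
//     ...                                ; unroll part 0 uses %vec.ind
//     %step.add     = add <4 x i32> %vec.ind, %vf.step.splat
//     ...                                ; unroll part 1 uses %step.add
//     %vec.ind.next = add <4 x i32> %step.add, %vf.step.splat
// where %stepped.start is splat(Start) + <0, 1, 2, 3> * Step and
// %vf.step.splat is a splat of VF * Step, both created in the preheader.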
1679 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1680     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1681   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1682          "Expected either an induction phi-node or a truncate of it!");
1683   Value *Start = II.getStartValue();
1684 
1685   // Construct the initial value of the vector IV in the vector loop preheader
1686   auto CurrIP = Builder.saveIP();
1687   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1688   if (isa<TruncInst>(EntryVal)) {
1689     assert(Start->getType()->isIntegerTy() &&
1690            "Truncation requires an integer type");
1691     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1692     Step = Builder.CreateTrunc(Step, TruncType);
1693     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1694   }
1695   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1696   Value *SteppedStart =
1697       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1698 
1699   // We create vector phi nodes for both integer and floating-point induction
1700   // variables. Here, we determine the kind of arithmetic we will perform.
1701   Instruction::BinaryOps AddOp;
1702   Instruction::BinaryOps MulOp;
1703   if (Step->getType()->isIntegerTy()) {
1704     AddOp = Instruction::Add;
1705     MulOp = Instruction::Mul;
1706   } else {
1707     AddOp = II.getInductionOpcode();
1708     MulOp = Instruction::FMul;
1709   }
1710 
1711   // Multiply the vectorization factor by the step using integer or
1712   // floating-point arithmetic as appropriate.
1713   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1714   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1715 
1716   // Create a vector splat to use in the induction update.
1717   //
1718   // FIXME: If the step is non-constant, we create the vector splat with
1719   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1720   //        handle a constant vector splat.
1721   Value *SplatVF = isa<Constant>(Mul)
1722                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1723                        : Builder.CreateVectorSplat(VF, Mul);
1724   Builder.restoreIP(CurrIP);
1725 
1726   // We may need to add the step a number of times, depending on the unroll
1727   // factor. The last of those goes into the PHI.
1728   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1729                                     &*LoopVectorBody->getFirstInsertionPt());
1730   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1731   Instruction *LastInduction = VecInd;
1732   for (unsigned Part = 0; Part < UF; ++Part) {
1733     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1734 
1735     if (isa<TruncInst>(EntryVal))
1736       addMetadata(LastInduction, EntryVal);
1737     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1738 
1739     LastInduction = cast<Instruction>(addFastMathFlag(
1740         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1741     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1742   }
1743 
1744   // Move the last step to the end of the latch block. This ensures consistent
1745   // placement of all induction updates.
1746   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1747   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1748   auto *ICmp = cast<Instruction>(Br->getCondition());
1749   LastInduction->moveBefore(ICmp);
1750   LastInduction->setName("vec.ind.next");
1751 
1752   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1753   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1754 }
1755 
1756 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1757   return Cost->isScalarAfterVectorization(I, VF) ||
1758          Cost->isProfitableToScalarize(I, VF);
1759 }
1760 
1761 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1762   if (shouldScalarizeInstruction(IV))
1763     return true;
1764   auto isScalarInst = [&](User *U) -> bool {
1765     auto *I = cast<Instruction>(U);
1766     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1767   };
1768   return llvm::any_of(IV->users(), isScalarInst);
1769 }
1770 
1771 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1772     const InductionDescriptor &ID, const Instruction *EntryVal,
1773     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1774   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1775          "Expected either an induction phi-node or a truncate of it!");
1776 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // reuses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1783   if (isa<TruncInst>(EntryVal))
1784     return;
1785 
1786   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1787   if (Casts.empty())
1788     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any) have no uses outside the
  // induction update chain itself.
1792   Instruction *CastInst = *Casts.begin();
1793   if (Lane < UINT_MAX)
1794     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1795   else
1796     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1797 }
1798 
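// Depending on the cost-model decisions, widening an induction creates one or
// more of the following (see the cases below):
//  (1) a dedicated vector IV phi (createVectorIntOrFpInductionPHI),
//  (2) a broadcast of the scalar IV combined with a step vector
//      (getBroadcastInstrs + getStepVector), and/or
//  (3) per-lane scalar steps for scalarized users (buildScalarSteps).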
1799 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1800   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1801          "Primary induction variable must have an integer type");
1802 
1803   auto II = Legal->getInductionVars()->find(IV);
1804   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1805 
1806   auto ID = II->second;
1807   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1808 
1809   // The scalar value to broadcast. This will be derived from the canonical
1810   // induction variable.
1811   Value *ScalarIV = nullptr;
1812 
1813   // The value from the original loop to which we are mapping the new induction
1814   // variable.
1815   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1816 
1817   // True if we have vectorized the induction variable.
1818   auto VectorizedIV = false;
1819 
1820   // Determine if we want a scalar version of the induction variable. This is
1821   // true if the induction variable itself is not widened, or if it has at
1822   // least one user in the loop that is not widened.
1823   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1824 
1825   // Generate code for the induction step. Note that induction steps are
1826   // required to be loop-invariant
1827   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1828          "Induction step should be loop invariant");
1829   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1830   Value *Step = nullptr;
1831   if (PSE.getSE()->isSCEVable(IV->getType())) {
1832     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1833     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1834                              LoopVectorPreHeader->getTerminator());
1835   } else {
1836     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1837   }
1838 
1839   // Try to create a new independent vector induction variable. If we can't
1840   // create the phi node, we will splat the scalar induction variable in each
1841   // loop iteration.
1842   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1843     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1844     VectorizedIV = true;
1845   }
1846 
1847   // If we haven't yet vectorized the induction variable, or if we will create
1848   // a scalar one, we need to define the scalar induction variable and step
1849   // values. If we were given a truncation type, truncate the canonical
1850   // induction variable and step. Otherwise, derive these values from the
1851   // induction descriptor.
1852   if (!VectorizedIV || NeedsScalarIV) {
1853     ScalarIV = Induction;
1854     if (IV != OldInduction) {
1855       ScalarIV = IV->getType()->isIntegerTy()
1856                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1857                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1858                                           IV->getType());
1859       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1860       ScalarIV->setName("offset.idx");
1861     }
1862     if (Trunc) {
1863       auto *TruncType = cast<IntegerType>(Trunc->getType());
1864       assert(Step->getType()->isIntegerTy() &&
1865              "Truncation requires an integer step");
1866       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1867       Step = Builder.CreateTrunc(Step, TruncType);
1868     }
1869   }
1870 
1871   // If we haven't yet vectorized the induction variable, splat the scalar
1872   // induction variable, and build the necessary step vectors.
1873   // TODO: Don't do it unless the vectorized IV is really required.
1874   if (!VectorizedIV) {
1875     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1876     for (unsigned Part = 0; Part < UF; ++Part) {
1877       Value *EntryPart =
1878           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1879       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1880       if (Trunc)
1881         addMetadata(EntryPart, Trunc);
1882       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1883     }
1884   }
1885 
1886   // If an induction variable is only used for counting loop iterations or
1887   // calculating addresses, it doesn't need to be widened. Create scalar steps
1888   // that can be used by instructions we will later scalarize. Note that the
1889   // addition of the scalar steps will not increase the number of instructions
1890   // in the loop in the common case prior to InstCombine. We will be trading
1891   // one vector extract for each scalar step.
1892   if (NeedsScalarIV)
1893     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1894 }
1895 
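// For illustration (value names are made up): with VF = 4, StartIdx = 0 and an
// integer step %s, an input splat %val is expanded to roughly
//   %cv        = <i32 0, i32 1, i32 2, i32 3>
//   %step.vec  = splat of %s
//   %mul       = mul <4 x i32> %cv, %step.vec
//   %induction = add <4 x i32> %val, %mul
// The FP case is analogous, using fmul and the induction's FAdd/FSub opcode
// with fast-math flags set.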
1896 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1897                                           Instruction::BinaryOps BinOp) {
1898   // Create and check the types.
1899   assert(Val->getType()->isVectorTy() && "Must be a vector");
1900   int VLen = Val->getType()->getVectorNumElements();
1901 
1902   Type *STy = Val->getType()->getScalarType();
1903   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1904          "Induction Step must be an integer or FP");
1905   assert(Step->getType() == STy && "Step has wrong type");
1906 
1907   SmallVector<Constant *, 8> Indices;
1908 
1909   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
1911     for (int i = 0; i < VLen; ++i)
1912       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1913 
1914     // Add the consecutive indices to the vector value.
1915     Constant *Cv = ConstantVector::get(Indices);
1916     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1917     Step = Builder.CreateVectorSplat(VLen, Step);
1918     assert(Step->getType() == Val->getType() && "Invalid step vec");
1919     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1920     // which can be found from the original scalar operations.
1921     Step = Builder.CreateMul(Cv, Step);
1922     return Builder.CreateAdd(Val, Step, "induction");
1923   }
1924 
1925   // Floating point induction.
1926   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1927          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
1929   for (int i = 0; i < VLen; ++i)
1930     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1931 
1932   // Add the consecutive indices to the vector value.
1933   Constant *Cv = ConstantVector::get(Indices);
1934 
1935   Step = Builder.CreateVectorSplat(VLen, Step);
1936 
1937   // Floating point operations had to be 'fast' to enable the induction.
1938   FastMathFlags Flags;
1939   Flags.setFast();
1940 
1941   Value *MulOp = Builder.CreateFMul(Cv, Step);
1942   if (isa<Instruction>(MulOp))
    // Have to check: MulOp may be a constant.
1944     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1945 
1946   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1947   if (isa<Instruction>(BOp))
1948     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1949   return BOp;
1950 }
1951 
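// An illustrative example: with VF = 4, UF = 2 and a non-uniform EntryVal,
// this produces the scalars
//   ScalarIV + 0 * Step, ScalarIV + 1 * Step, ..., ScalarIV + 7 * Step
// whereas a uniform EntryVal only needs lane 0 of each unroll part, i.e.
//   ScalarIV + 0 * Step and ScalarIV + 4 * Step.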
1952 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1953                                            Instruction *EntryVal,
1954                                            const InductionDescriptor &ID) {
1955   // We shouldn't have to build scalar steps if we aren't vectorizing.
1956   assert(VF > 1 && "VF should be greater than one");
1957 
  // Get the value type and ensure it and the step have the same type.
1959   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1960   assert(ScalarIVTy == Step->getType() &&
1961          "Val and Step should have the same type");
1962 
1963   // We build scalar steps for both integer and floating-point induction
1964   // variables. Here, we determine the kind of arithmetic we will perform.
1965   Instruction::BinaryOps AddOp;
1966   Instruction::BinaryOps MulOp;
1967   if (ScalarIVTy->isIntegerTy()) {
1968     AddOp = Instruction::Add;
1969     MulOp = Instruction::Mul;
1970   } else {
1971     AddOp = ID.getInductionOpcode();
1972     MulOp = Instruction::FMul;
1973   }
1974 
1975   // Determine the number of scalars we need to generate for each unroll
1976   // iteration. If EntryVal is uniform, we only need to generate the first
1977   // lane. Otherwise, we generate all VF values.
1978   unsigned Lanes =
1979       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1980                                                                          : VF;
1981   // Compute the scalar steps and save the results in VectorLoopValueMap.
1982   for (unsigned Part = 0; Part < UF; ++Part) {
1983     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1984       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1985       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1986       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1987       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1988       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1989     }
1990   }
1991 }
1992 
1993 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1994   assert(V != Induction && "The new induction variable should not be used.");
1995   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
1996   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
1997 
1998   // If we have a stride that is replaced by one, do it here. Defer this for
1999   // the VPlan-native path until we start running Legal checks in that path.
2000   if (!EnableVPlanNativePath && Legal->hasStride(V))
2001     V = ConstantInt::get(V->getType(), 1);
2002 
2003   // If we have a vector mapped to this value, return it.
2004   if (VectorLoopValueMap.hasVectorValue(V, Part))
2005     return VectorLoopValueMap.getVectorValue(V, Part);
2006 
2007   // If the value has not been vectorized, check if it has been scalarized
2008   // instead. If it has been scalarized, and we actually need the value in
2009   // vector form, we will construct the vector values on demand.
2010   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2011     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2012 
2013     // If we've scalarized a value, that value should be an instruction.
2014     auto *I = cast<Instruction>(V);
2015 
2016     // If we aren't vectorizing, we can just copy the scalar map values over to
2017     // the vector map.
2018     if (VF == 1) {
2019       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2020       return ScalarValue;
2021     }
2022 
2023     // Get the last scalar instruction we generated for V and Part. If the value
2024     // is known to be uniform after vectorization, this corresponds to lane zero
2025     // of the Part unroll iteration. Otherwise, the last instruction is the one
2026     // we created for the last vector lane of the Part unroll iteration.
2027     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2028     auto *LastInst = cast<Instruction>(
2029         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2030 
2031     // Set the insert point after the last scalarized instruction. This ensures
2032     // the insertelement sequence will directly follow the scalar definitions.
2033     auto OldIP = Builder.saveIP();
2034     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2035     Builder.SetInsertPoint(&*NewIP);
2036 
2037     // However, if we are vectorizing, we need to construct the vector values.
2038     // If the value is known to be uniform after vectorization, we can just
2039     // broadcast the scalar value corresponding to lane zero for each unroll
2040     // iteration. Otherwise, we construct the vector values using insertelement
2041     // instructions. Since the resulting vectors are stored in
2042     // VectorLoopValueMap, we will only generate the insertelements once.
2043     Value *VectorValue = nullptr;
2044     if (Cost->isUniformAfterVectorization(I, VF)) {
2045       VectorValue = getBroadcastInstrs(ScalarValue);
2046       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2047     } else {
2048       // Initialize packing with insertelements to start from undef.
2049       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2050       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2051       for (unsigned Lane = 0; Lane < VF; ++Lane)
2052         packScalarIntoVectorValue(V, {Part, Lane});
2053       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2054     }
2055     Builder.restoreIP(OldIP);
2056     return VectorValue;
2057   }
2058 
2059   // If this scalar is unknown, assume that it is a constant or that it is
2060   // loop invariant. Broadcast V and save the value for future uses.
2061   Value *B = getBroadcastInstrs(V);
2062   VectorLoopValueMap.setVectorValue(V, Part, B);
2063   return B;
2064 }
2065 
2066 Value *
2067 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2068                                             const VPIteration &Instance) {
2069   // If the value is not an instruction contained in the loop, it should
2070   // already be scalar.
2071   if (OrigLoop->isLoopInvariant(V))
2072     return V;
2073 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2077 
2078   // If the value from the original loop has not been vectorized, it is
2079   // represented by UF x VF scalar values in the new loop. Return the requested
2080   // scalar value.
2081   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2082     return VectorLoopValueMap.getScalarValue(V, Instance);
2083 
2084   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2085   // for the given unroll part. If this entry is not a vector type (i.e., the
2086   // vectorization factor is one), there is no need to generate an
2087   // extractelement instruction.
2088   auto *U = getOrCreateVectorValue(V, Instance.Part);
2089   if (!U->getType()->isVectorTy()) {
2090     assert(VF == 1 && "Value not scalarized has non-vector type");
2091     return U;
2092   }
2093 
2094   // Otherwise, the value from the original loop has been vectorized and is
2095   // represented by UF vector values. Extract and return the requested scalar
2096   // value from the appropriate vector lane.
2097   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2098 }
2099 
2100 void InnerLoopVectorizer::packScalarIntoVectorValue(
2101     Value *V, const VPIteration &Instance) {
2102   assert(V != Induction && "The new induction variable should not be used.");
2103   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2104   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2105 
2106   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2107   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2108   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2109                                             Builder.getInt32(Instance.Lane));
2110   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2111 }
2112 
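// E.g. (illustrative) for VF = 4 this emits a single shuffle:
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>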
2113 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2114   assert(Vec->getType()->isVectorTy() && "Invalid type");
2115   SmallVector<Constant *, 8> ShuffleMask;
2116   for (unsigned i = 0; i < VF; ++i)
2117     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2118 
2119   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2120                                      ConstantVector::get(ShuffleMask),
2121                                      "reverse");
2122 }
2123 
2124 // Return whether we allow using masked interleave-groups (for dealing with
2125 // strided loads/stores that reside in predicated blocks, or for dealing
2126 // with gaps).
2127 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2128   // If an override option has been passed in for interleaved accesses, use it.
2129   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2130     return EnableMaskedInterleavedMemAccesses;
2131 
2132   return TTI.enableMaskedInterleavedAccessVectorization();
2133 }
2134 
2135 // Try to vectorize the interleave group that \p Instr belongs to.
2136 //
// E.g. Translate the following interleaved load group (factor = 3):
2138 //   for (i = 0; i < N; i+=3) {
2139 //     R = Pic[i];             // Member of index 0
2140 //     G = Pic[i+1];           // Member of index 1
2141 //     B = Pic[i+2];           // Member of index 2
2142 //     ... // do something to R, G, B
2143 //   }
2144 // To:
2145 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2146 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2147 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2148 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2149 //
// Or translate the following interleaved store group (factor = 3):
2151 //   for (i = 0; i < N; i+=3) {
2152 //     ... do something to R, G, B
2153 //     Pic[i]   = R;           // Member of index 0
2154 //     Pic[i+1] = G;           // Member of index 1
2155 //     Pic[i+2] = B;           // Member of index 2
2156 //   }
2157 // To:
2158 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2159 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2160 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2161 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2162 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2163 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2164                                                    VectorParts *BlockInMask) {
2165   const InterleaveGroup<Instruction> *Group =
2166       Cost->getInterleavedAccessGroup(Instr);
2167   assert(Group && "Fail to get an interleaved access group.");
2168 
2169   // Skip if current instruction is not the insert position.
2170   if (Instr != Group->getInsertPos())
2171     return;
2172 
2173   const DataLayout &DL = Instr->getModule()->getDataLayout();
2174   Value *Ptr = getLoadStorePointerOperand(Instr);
2175 
2176   // Prepare for the vector type of the interleaved load/store.
2177   Type *ScalarTy = getMemInstValueType(Instr);
2178   unsigned InterleaveFactor = Group->getFactor();
2179   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2180   Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr));
2181 
2182   // Prepare for the new pointers.
2183   setDebugLocFromInst(Builder, Ptr);
2184   SmallVector<Value *, 2> NewPtrs;
2185   unsigned Index = Group->getIndex(Instr);
2186 
2187   VectorParts Mask;
2188   bool IsMaskForCondRequired = BlockInMask;
2189   if (IsMaskForCondRequired) {
2190     Mask = *BlockInMask;
2191     // TODO: extend the masked interleaved-group support to reversed access.
2192     assert(!Group->isReverse() && "Reversed masked interleave-group "
2193                                   "not supported.");
2194   }
2195 
2196   // If the group is reverse, adjust the index to refer to the last vector lane
2197   // instead of the first. We adjust the index from the first vector lane,
2198   // rather than directly getting the pointer for lane VF - 1, because the
2199   // pointer operand of the interleaved access is supposed to be uniform. For
2200   // uniform instructions, we're only required to generate a value for the
2201   // first vector lane in each unroll iteration.
2202   if (Group->isReverse())
2203     Index += (VF - 1) * Group->getFactor();
2204 
2205   bool InBounds = false;
2206   if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2207     InBounds = gep->isInBounds();
2208 
2209   for (unsigned Part = 0; Part < UF; Part++) {
2210     Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
2211 
    // Note that the current instruction may be a member at any index, so we
    // need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2223     NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index));
2224     if (InBounds)
2225       cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);
2226 
2227     // Cast to the vector pointer type.
2228     NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
2229   }
2230 
2231   setDebugLocFromInst(Builder, Instr);
2232   Value *UndefVec = UndefValue::get(VecTy);
2233 
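  // Illustrative example: for an interleave group with factor 3 whose member
  // at index 2 is missing, and VF = 4, the gap mask below would roughly be
  //   <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0>
  // i.e. the per-member "present" pattern replicated VF times.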
2234   Value *MaskForGaps = nullptr;
2235   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2236     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2237     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2238   }
2239 
2240   // Vectorize the interleaved load group.
2241   if (isa<LoadInst>(Instr)) {
2242     // For each unroll part, create a wide load for the group.
2243     SmallVector<Value *, 2> NewLoads;
2244     for (unsigned Part = 0; Part < UF; Part++) {
2245       Instruction *NewLoad;
2246       if (IsMaskForCondRequired || MaskForGaps) {
2247         assert(useMaskedInterleavedAccesses(*TTI) &&
2248                "masked interleaved groups are not allowed.");
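        // Illustrative example: for VF = 4 and InterleaveFactor = 3, a block
        // mask <m0, m1, m2, m3> is replicated to
        //   <m0, m0, m0, m1, m1, m1, m2, m2, m2, m3, m3, m3>
        // so each lane's predicate covers its whole interleaved tuple, and is
        // then AND'ed with the gap mask (if any).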
2249         Value *GroupMask = MaskForGaps;
2250         if (IsMaskForCondRequired) {
2251           auto *Undefs = UndefValue::get(Mask[Part]->getType());
2252           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2253           Value *ShuffledMask = Builder.CreateShuffleVector(
2254               Mask[Part], Undefs, RepMask, "interleaved.mask");
2255           GroupMask = MaskForGaps
2256                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2257                                                 MaskForGaps)
2258                           : ShuffledMask;
2259         }
2260         NewLoad =
2261             Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
2262                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2265         NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part],
2266                                             Group->getAlignment(), "wide.vec");
2267       Group->addMetadata(NewLoad);
2268       NewLoads.push_back(NewLoad);
2269     }
2270 
2271     // For each member in the group, shuffle out the appropriate data from the
2272     // wide loads.
2273     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2274       Instruction *Member = Group->getMember(I);
2275 
2276       // Skip the gaps in the group.
2277       if (!Member)
2278         continue;
2279 
2280       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2281       for (unsigned Part = 0; Part < UF; Part++) {
2282         Value *StridedVec = Builder.CreateShuffleVector(
2283             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2284 
2285         // If this member has different type, cast the result type.
        // If this member has a different type, cast the result to that type.
2287           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2288           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2289         }
2290 
2291         if (Group->isReverse())
2292           StridedVec = reverseVector(StridedVec);
2293 
2294         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2295       }
2296     }
2297     return;
2298   }
2299 
  // The sub-vector type for the current instruction.
2301   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2302 
2303   // Vectorize the interleaved store group.
2304   for (unsigned Part = 0; Part < UF; Part++) {
2305     // Collect the stored vector from each member.
2306     SmallVector<Value *, 4> StoredVecs;
2307     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2309       Instruction *Member = Group->getMember(i);
2310       assert(Member && "Fail to get a member from an interleaved store group");
2311 
2312       Value *StoredVec = getOrCreateVectorValue(
2313           cast<StoreInst>(Member)->getValueOperand(), Part);
2314       if (Group->isReverse())
2315         StoredVec = reverseVector(StoredVec);
2316 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2320         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2321 
2322       StoredVecs.push_back(StoredVec);
2323     }
2324 
2325     // Concatenate all vectors into a wide vector.
2326     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2327 
2328     // Interleave the elements in the wide vector.
2329     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2330     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2331                                               "interleaved.vec");
2332 
2333     Instruction *NewStoreInstr;
2334     if (IsMaskForCondRequired) {
2335       auto *Undefs = UndefValue::get(Mask[Part]->getType());
2336       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2337       Value *ShuffledMask = Builder.CreateShuffleVector(
2338           Mask[Part], Undefs, RepMask, "interleaved.mask");
2339       NewStoreInstr = Builder.CreateMaskedStore(
2340           IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
    } else
2343       NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part],
2344         Group->getAlignment());
2345 
2346     Group->addMetadata(NewStoreInstr);
2347   }
2348 }
2349 
2350 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2351                                                      VectorParts *BlockInMask) {
2352   // Attempt to issue a wide load.
2353   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2354   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2355 
2356   assert((LI || SI) && "Invalid Load/Store instruction");
2357 
2358   LoopVectorizationCostModel::InstWidening Decision =
2359       Cost->getWideningDecision(Instr, VF);
2360   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2361          "CM decision should be taken at this point");
2362   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2363     return vectorizeInterleaveGroup(Instr);
2364 
2365   Type *ScalarDataTy = getMemInstValueType(Instr);
2366   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2367   Value *Ptr = getLoadStorePointerOperand(Instr);
  // An alignment of 0 means target ABI alignment. We need to use the scalar's
  // target ABI alignment in such a case.
2370   const DataLayout &DL = Instr->getModule()->getDataLayout();
2371   const Align Alignment =
2372       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2373   unsigned AddressSpace = getLoadStoreAddressSpace(Instr);
2374 
2375   // Determine if the pointer operand of the access is either consecutive or
2376   // reverse consecutive.
2377   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2378   bool ConsecutiveStride =
2379       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2380   bool CreateGatherScatter =
2381       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2382 
2383   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2384   // gather/scatter. Otherwise Decision should have been to Scalarize.
2385   assert((ConsecutiveStride || CreateGatherScatter) &&
2386          "The instruction should be scalarized");
2387 
2388   // Handle consecutive loads/stores.
2389   if (ConsecutiveStride)
2390     Ptr = getOrCreateScalarValue(Ptr, {0, 0});
2391 
2392   VectorParts Mask;
2393   bool isMaskRequired = BlockInMask;
2394   if (isMaskRequired)
2395     Mask = *BlockInMask;
2396 
2397   bool InBounds = false;
2398   if (auto *gep = dyn_cast<GetElementPtrInst>(
2399           getLoadStorePointerOperand(Instr)->stripPointerCasts()))
2400     InBounds = gep->isInBounds();
2401 
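  // Illustrative example: for VF = 4 and a consecutive access, part 0 uses the
  // pointer Ptr + 0 and part 1 uses Ptr + 4. For a reverse access, part 0
  // starts at Ptr - 3 (i.e. Ptr - (VF - 1)) and part 1 at Ptr - 7, so each
  // wide access still covers increasing addresses while the lanes themselves
  // are reversed separately.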
2402   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2403     // Calculate the pointer for the specific unroll-part.
2404     GetElementPtrInst *PartPtr = nullptr;
2405 
2406     if (Reverse) {
2407       // If the address is consecutive but reversed, then the
2408       // wide store needs to start at the last vector element.
2409       PartPtr = cast<GetElementPtrInst>(
2410           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2411       PartPtr->setIsInBounds(InBounds);
2412       PartPtr = cast<GetElementPtrInst>(
2413           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2414       PartPtr->setIsInBounds(InBounds);
2415       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2416         Mask[Part] = reverseVector(Mask[Part]);
2417     } else {
2418       PartPtr = cast<GetElementPtrInst>(
2419           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2420       PartPtr->setIsInBounds(InBounds);
2421     }
2422 
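    // The part pointer still has the scalar element type (e.g. i32*); cast it
    // to a pointer to the wide vector type (e.g. <4 x i32>*) in the original
    // address space so it can feed the wide load or store below.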
2423     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2424   };
2425 
2426   // Handle Stores:
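  // For example, with VF = 4 a consecutive unmasked store becomes a single
  // wide store of a <4 x Ty> value, a masked consecutive store becomes a
  // call to the llvm.masked.store intrinsic, and a scatter becomes a call to
  // llvm.masked.scatter on a vector of pointers.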
2427   if (SI) {
2428     setDebugLocFromInst(Builder, SI);
2429 
2430     for (unsigned Part = 0; Part < UF; ++Part) {
2431       Instruction *NewSI = nullptr;
2432       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2433       if (CreateGatherScatter) {
2434         Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2435         Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2436         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2437                                             Alignment.value(), MaskPart);
2438       } else {
2439         if (Reverse) {
2440           // If we store to reverse consecutive memory locations, then we need
2441           // to reverse the order of elements in the stored value.
2442           StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used
          // in another expression. So don't call resetVectorValue(StoredVal).
2445         }
2446         auto *VecPtr = CreateVecPtr(Part, Ptr);
2447         if (isMaskRequired)
2448           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr,
2449                                             Alignment.value(), Mask[Part]);
2450         else
2451           NewSI =
2452               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2453       }
2454       addMetadata(NewSI, SI);
2455     }
2456     return;
2457   }
2458 
2459   // Handle loads.
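  // Similarly, a consecutive load becomes a single wide load producing a
  // <VF x Ty> value (llvm.masked.load when a mask is required), and a gather
  // becomes a call to llvm.masked.gather fed by a vector of pointers.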
2460   assert(LI && "Must have a load instruction");
2461   setDebugLocFromInst(Builder, LI);
2462   for (unsigned Part = 0; Part < UF; ++Part) {
2463     Value *NewLI;
2464     if (CreateGatherScatter) {
2465       Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr;
2466       Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
2467       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2468                                          nullptr, "wide.masked.gather");
2469       addMetadata(NewLI, LI);
2470     } else {
2471       auto *VecPtr = CreateVecPtr(Part, Ptr);
2472       if (isMaskRequired)
2473         NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part],
2474                                          UndefValue::get(DataTy),
2475                                          "wide.masked.load");
2476       else
2477         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2478                                           "wide.load");
2479 
      // Add metadata to the load, but map the reversed shuffle (not the
      // load) as the vector value when the access is reverse consecutive.
2481       addMetadata(NewLI, LI);
2482       if (Reverse)
2483         NewLI = reverseVector(NewLI);
2484     }
2485     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2486   }
2487 }
2488 
2489 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2490                                                const VPIteration &Instance,
2491                                                bool IfPredicateInstr) {
2492   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2493 
2494   setDebugLocFromInst(Builder, Instr);
2495 
  // Does this instruction return a value?
2497   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2498 
2499   Instruction *Cloned = Instr->clone();
2500   if (!IsVoidRetTy)
2501     Cloned->setName(Instr->getName() + ".cloned");
2502 
2503   // Replace the operands of the cloned instructions with their scalar
2504   // equivalents in the new loop.
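  // For example, when scalarizing %c = add i32 %a, %b for a given
  // (Part, Lane) instance, the cloned add is fed the scalar values recorded
  // for %a and %b at that lane.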
2505   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2506     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2507     Cloned->setOperand(op, NewOp);
2508   }
2509   addNewMetadata(Cloned, Instr);
2510 
2511   // Place the cloned scalar in the new loop.
2512   Builder.Insert(Cloned);
2513 
2514   // Add the cloned scalar to the scalar map entry.
2515   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2516 
  // If we just cloned a new assumption, add it to the assumption cache.
2518   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2519     if (II->getIntrinsicID() == Intrinsic::assume)
2520       AC->registerAssumption(II);
2521 
2522   // End if-block.
2523   if (IfPredicateInstr)
2524     PredicatedInstructions.push_back(Cloned);
2525 }
2526 
2527 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2528                                                       Value *End, Value *Step,
2529                                                       Instruction *DL) {
2530   BasicBlock *Header = L->getHeader();
2531   BasicBlock *Latch = L->getLoopLatch();
2532   // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header, as this will be a single-block loop.
2534   if (!Latch)
2535     Latch = Header;
2536 
2537   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2538   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2539   setDebugLocFromInst(Builder, OldInst);
2540   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2541 
2542   Builder.SetInsertPoint(Latch->getTerminator());
2543   setDebugLocFromInst(Builder, OldInst);
2544 
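  // The generated loop control is roughly:
  //   %index = phi [ %Start, %preheader ], [ %index.next, %latch ]
  //   ...
  //   %index.next = add %index, %Step
  //   %cmp = icmp eq %index.next, %End
  //   br i1 %cmp, label %exit, label %header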
2545   // Create i+1 and fill the PHINode.
2546   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2547   Induction->addIncoming(Start, L->getLoopPreheader());
2548   Induction->addIncoming(Next, Latch);
2549   // Create the compare.
2550   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2551   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2552 
2553   // Now we have two terminators. Remove the old one from the block.
2554   Latch->getTerminator()->eraseFromParent();
2555 
2556   return Induction;
2557 }
2558 
2559 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2560   if (TripCount)
2561     return TripCount;
2562 
2563   assert(L && "Create Trip Count for null loop.");
2564   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2565   // Find the loop boundaries.
2566   ScalarEvolution *SE = PSE.getSE();
2567   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2568   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2569          "Invalid loop count");
2570 
2571   Type *IdxTy = Legal->getWidestInductionType();
2572   assert(IdxTy && "No type for induction");
2573 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count in that case is
  // if the induction variable was signed and as such will not overflow, so
  // the truncation is legal.
2579   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2580       IdxTy->getPrimitiveSizeInBits())
2581     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2582   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2583 
2584   // Get the total trip count from the count by adding 1.
2585   const SCEV *ExitCount = SE->getAddExpr(
2586       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
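  // For example, for a canonical loop `for (i = 0; i < n; ++i)` the
  // backedge-taken count is n - 1, so the expanded trip count is n (assuming
  // the loop is entered).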
2587 
2588   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2589 
2590   // Expand the trip count and place the new instructions in the preheader.
2591   // Notice that the pre-header does not change, only the loop body.
2592   SCEVExpander Exp(*SE, DL, "induction");
2593 
2594   // Count holds the overall loop count (N).
2595   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2596                                 L->getLoopPreheader()->getTerminator());
2597 
2598   if (TripCount->getType()->isPointerTy())
2599     TripCount =
2600         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2601                                     L->getLoopPreheader()->getTerminator());
2602 
2603   return TripCount;
2604 }
2605 
2606 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2607   if (VectorTripCount)
2608     return VectorTripCount;
2609 
2610   Value *TC = getOrCreateTripCount(L);
2611   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2612 
2613   Type *Ty = TC->getType();
2614   Constant *Step = ConstantInt::get(Ty, VF * UF);
2615 
2616   // If the tail is to be folded by masking, round the number of iterations N
2617   // up to a multiple of Step instead of rounding down. This is done by first
2618   // adding Step-1 and then rounding down. Note that it's ok if this addition
2619   // overflows: the vector induction variable will eventually wrap to zero given
2620   // that it starts at zero and its Step is a power of two; the loop will then
2621   // exit, with the last early-exit vector comparison also producing all-true.
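  // For example, with N = 10 and VF * UF = 8, N is rounded up to 17 here and
  // down to 16 below, so two masked vector iterations cover all 10 scalar
  // iterations.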
2622   if (Cost->foldTailByMasking()) {
2623     assert(isPowerOf2_32(VF * UF) &&
2624            "VF*UF must be a power of 2 when folding tail by masking");
2625     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2626   }
2627 
2628   // Now we need to generate the expression for the part of the loop that the
2629   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2630   // iterations are not required for correctness, or N - Step, otherwise. Step
2631   // is equal to the vectorization factor (number of SIMD elements) times the
2632   // unroll factor (number of SIMD instructions).
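  // For example, with N = 21, VF = 4 and UF = 2 (Step = 8): R = 21 % 8 = 5,
  // so the vector loop executes 21 - 5 = 16 iterations (two vector
  // iterations) and 5 iterations are left for the scalar remainder loop.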
2633   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2634 
2635   // If there is a non-reversed interleaved group that may speculatively access
2636   // memory out-of-bounds, we need to ensure that there will be at least one
2637   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2638   // the trip count, we set the remainder to be equal to the step. If the step
2639   // does not evenly divide the trip count, no adjustment is necessary since
2640   // there will already be scalar iterations. Note that the minimum iterations
2641   // check ensures that N >= Step.
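  // For example, with N = 16 and Step = 8 the remainder R = 0 is bumped up
  // to 8, so the vector loop runs 8 iterations and the scalar epilogue runs
  // the remaining 8.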
2642   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2643     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2644     R = Builder.CreateSelect(IsZero, Step, R);
2645   }
2646 
2647   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2648 
2649   return VectorTripCount;
2650 }
2651 
2652 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2653                                                    const DataLayout &DL) {
2654   // Verify that V is a vector type with same number of elements as DstVTy.
2655   unsigned VF = DstVTy->getNumElements();
2656   VectorType *SrcVecTy = cast<VectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2658   Type *SrcElemTy = SrcVecTy->getElementType();
2659   Type *DstElemTy = DstVTy->getElementType();
2660   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2661          "Vector elements must have same size");
2662 
2663   // Do a direct cast if element types are castable.
2664   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2665     return Builder.CreateBitOrPointerCast(V, DstVTy);
2666   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating point vector but DstVTy is a vector of pointers,
  // or vice-versa. Handle this with a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
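  // For example, on a target with 64-bit pointers a <4 x double> source is
  // first bitcast to <4 x i64> and then converted to the destination pointer
  // vector (or vice versa).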
2671   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2672          "Only one type should be a pointer type");
2673   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2674          "Only one type should be a floating point type");
2675   Type *IntTy =
2676       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2677   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2678   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2679   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2680 }
2681 
2682 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2683                                                          BasicBlock *Bypass) {
2684   Value *Count = getOrCreateTripCount(L);
2685   BasicBlock *BB = L->getLoopPreheader();
2686   IRBuilder<> Builder(BB->getTerminator());
2687 
2688   // Generate code to check if the loop's trip count is less than VF * UF, or
2689   // equal to it in case a scalar epilogue is required; this implies that the
2690   // vector trip count is zero. This check also covers the case where adding one
2691   // to the backedge-taken count overflowed leading to an incorrect trip count
2692   // of zero. In this case we will also jump to the scalar loop.
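  // For example, with VF = 4 and UF = 2 we branch to the scalar loop when
  // the trip count is less than 8 (or less than or equal to 8 when a scalar
  // epilogue is required).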
2693   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2694                                           : ICmpInst::ICMP_ULT;
2695 
2696   // If tail is to be folded, vector loop takes care of all iterations.
2697   Value *CheckMinIters = Builder.getFalse();
2698   if (!Cost->foldTailByMasking())
2699     CheckMinIters = Builder.CreateICmp(
2700         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2701         "min.iters.check");
2702 
2703   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2704   // Update dominator tree immediately if the generated block is a
2705   // LoopBypassBlock because SCEV expansions to generate loop bypass
2706   // checks may query it before the current function is finished.
2707   DT->addNewBlock(NewBB, BB);
2708   if (L->getParentLoop())
2709     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2710   ReplaceInstWithInst(BB->getTerminator(),
2711                       BranchInst::Create(Bypass, NewBB, CheckMinIters));
2712   LoopBypassBlocks.push_back(BB);
2713 }
2714 
2715 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2716   BasicBlock *BB = L->getLoopPreheader();
2717 
  // Generate the code to check the SCEV assumptions that we have made.
2719   // We want the new basic block to start at the first instruction in a
2720   // sequence of instructions that form a check.
2721   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2722                    "scev.check");
2723   Value *SCEVCheck =
2724       Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator());
2725 
2726   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2727     if (C->isZero())
2728       return;
2729 
2730   assert(!BB->getParent()->hasOptSize() &&
2731          "Cannot SCEV check stride or overflow when optimizing for size");
2732 
2733   // Create a new block containing the stride check.
2734   BB->setName("vector.scevcheck");
2735   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2736   // Update dominator tree immediately if the generated block is a
2737   // LoopBypassBlock because SCEV expansions to generate loop bypass
2738   // checks may query it before the current function is finished.
2739   DT->addNewBlock(NewBB, BB);
2740   if (L->getParentLoop())
2741     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2742   ReplaceInstWithInst(BB->getTerminator(),
2743                       BranchInst::Create(Bypass, NewBB, SCEVCheck));
2744   LoopBypassBlocks.push_back(BB);
2745   AddedSafetyChecks = true;
2746 }
2747 
2748 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2749   // VPlan-native path does not do any analysis for runtime checks currently.
2750   if (EnableVPlanNativePath)
2751     return;
2752 
2753   BasicBlock *BB = L->getLoopPreheader();
2754 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2758   Instruction *FirstCheckInst;
2759   Instruction *MemRuntimeCheck;
2760   std::tie(FirstCheckInst, MemRuntimeCheck) =
2761       Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
2762   if (!MemRuntimeCheck)
2763     return;
2764 
2765   if (BB->getParent()->hasOptSize()) {
2766     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2767            "Cannot emit memory checks when optimizing for size, unless forced "
2768            "to vectorize.");
2769     ORE->emit([&]() {
2770       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2771                                         L->getStartLoc(), L->getHeader())
2772              << "Code-size may be reduced by not forcing "
2773                 "vectorization, or by source-code modifications "
2774                 "eliminating the need for runtime checks "
2775                 "(e.g., adding 'restrict').";
2776     });
2777   }
2778 
2779   // Create a new block containing the memory check.
2780   BB->setName("vector.memcheck");
2781   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
2782   // Update dominator tree immediately if the generated block is a
2783   // LoopBypassBlock because SCEV expansions to generate loop bypass
2784   // checks may query it before the current function is finished.
2785   DT->addNewBlock(NewBB, BB);
2786   if (L->getParentLoop())
2787     L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
2788   ReplaceInstWithInst(BB->getTerminator(),
2789                       BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
2790   LoopBypassBlocks.push_back(BB);
2791   AddedSafetyChecks = true;
2792 
2793   // We currently don't use LoopVersioning for the actual loop cloning but we
2794   // still use it to add the noalias metadata.
2795   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2796                                            PSE.getSE());
2797   LVer->prepareNoAliasMetadata();
2798 }
2799 
2800 Value *InnerLoopVectorizer::emitTransformedIndex(
2801     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2802     const InductionDescriptor &ID) const {
2803 
2804   SCEVExpander Exp(*SE, DL, "induction");
2805   auto Step = ID.getStep();
2806   auto StartValue = ID.getStartValue();
2807   assert(Index->getType() == Step->getType() &&
2808          "Index type does not match StepValue type");
2809 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
2816   auto CreateAdd = [&B](Value *X, Value *Y) {
2817     assert(X->getType() == Y->getType() && "Types don't match!");
2818     if (auto *CX = dyn_cast<ConstantInt>(X))
2819       if (CX->isZero())
2820         return Y;
2821     if (auto *CY = dyn_cast<ConstantInt>(Y))
2822       if (CY->isZero())
2823         return X;
2824     return B.CreateAdd(X, Y);
2825   };
2826 
2827   auto CreateMul = [&B](Value *X, Value *Y) {
2828     assert(X->getType() == Y->getType() && "Types don't match!");
2829     if (auto *CX = dyn_cast<ConstantInt>(X))
2830       if (CX->isOne())
2831         return Y;
2832     if (auto *CY = dyn_cast<ConstantInt>(Y))
2833       if (CY->isOne())
2834         return X;
2835     return B.CreateMul(X, Y);
2836   };
2837 
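  // The transformed index is StartValue + Index * Step for integer
  // inductions, a GEP of StartValue by Index * Step elements for pointer
  // inductions, and StartValue fadd/fsub Index * Step for floating-point
  // inductions.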
2838   switch (ID.getKind()) {
2839   case InductionDescriptor::IK_IntInduction: {
2840     assert(Index->getType() == StartValue->getType() &&
2841            "Index type does not match StartValue type");
2842     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2843       return B.CreateSub(StartValue, Index);
2844     auto *Offset = CreateMul(
2845         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2846     return CreateAdd(StartValue, Offset);
2847   }
2848   case InductionDescriptor::IK_PtrInduction: {
2849     assert(isa<SCEVConstant>(Step) &&
2850            "Expected constant step for pointer induction");
2851     return B.CreateGEP(
2852         StartValue->getType()->getPointerElementType(), StartValue,
2853         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2854                                            &*B.GetInsertPoint())));
2855   }
2856   case InductionDescriptor::IK_FpInduction: {
2857     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2858     auto InductionBinOp = ID.getInductionBinOp();
2859     assert(InductionBinOp &&
2860            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2861             InductionBinOp->getOpcode() == Instruction::FSub) &&
2862            "Original bin op should be defined for FP induction");
2863 
2864     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2865 
2866     // Floating point operations had to be 'fast' to enable the induction.
2867     FastMathFlags Flags;
2868     Flags.setFast();
2869 
2870     Value *MulExp = B.CreateFMul(StepValue, Index);
2871     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be folded to a constant.
2873       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2874 
2875     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2876                                "induction");
2877     if (isa<Instruction>(BOp))
2878       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2879 
2880     return BOp;
2881   }
2882   case InductionDescriptor::IK_NoInduction:
2883     return nullptr;
2884   }
2885   llvm_unreachable("invalid enum");
2886 }
2887 
2888 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2889   /*
2890    In this function we generate a new loop. The new loop will contain
2891    the vectorized instructions while the old loop will continue to run the
2892    scalar remainder.
2893 
2894        [ ] <-- loop iteration number check.
2895     /   |
2896    /    v
2897   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2898   |  /  |
2899   | /   v
2900   ||   [ ]     <-- vector pre header.
2901   |/    |
2902   |     v
2903   |    [  ] \
2904   |    [  ]_|   <-- vector loop.
2905   |     |
2906   |     v
2907   |   -[ ]   <--- middle-block.
2908   |  /  |
2909   | /   v
2910   -|- >[ ]     <--- new preheader.
2911    |    |
2912    |    v
2913    |   [ ] \
2914    |   [ ]_|   <-- old scalar loop to handle remainder.
2915     \   |
2916      \  v
2917       >[ ]     <-- exit block.
2918    ...
2919    */
2920 
2921   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
2922   BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2923   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
2924   MDNode *OrigLoopID = OrigLoop->getLoopID();
2925   assert(VectorPH && "Invalid loop structure");
2926   assert(ExitBlock && "Must have an exit block");
2927 
2928   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
2930   // induction variables. In the code below we also support a case where we
2931   // don't have a single induction variable.
2932   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
2935   //   - is an integer
2936   //   - counts from zero, stepping by one
2937   //   - is the size of the widest induction variable type
2938   // then we create a new one.
2939   OldInduction = Legal->getPrimaryInduction();
2940   Type *IdxTy = Legal->getWidestInductionType();
2941 
2942   // Split the single block loop into the two loop structure described above.
2943   BasicBlock *VecBody =
2944       VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
2945   BasicBlock *MiddleBlock =
2946       VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
2947   BasicBlock *ScalarPH =
2948       MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
2949 
2950   // Create and register the new vector loop.
2951   Loop *Lp = LI->AllocateLoop();
2952   Loop *ParentLoop = OrigLoop->getParentLoop();
2953 
2954   // Insert the new loop into the loop nest and register the new basic blocks
2955   // before calling any utilities such as SCEV that require valid LoopInfo.
2956   if (ParentLoop) {
2957     ParentLoop->addChildLoop(Lp);
2958     ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
2959     ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
2960   } else {
2961     LI->addTopLevelLoop(Lp);
2962   }
2963   Lp->addBasicBlockToLoop(VecBody, *LI);
2964 
2965   // Find the loop boundaries.
2966   Value *Count = getOrCreateTripCount(Lp);
2967 
2968   Value *StartIdx = ConstantInt::get(IdxTy, 0);
2969 
2970   // Now, compare the new count to zero. If it is zero skip the vector loop and
2971   // jump to the scalar loop. This check also covers the case where the
2972   // backedge-taken count is uint##_max: adding one to it will overflow leading
2973   // to an incorrect trip count of zero. In this (rare) case we will also jump
2974   // to the scalar loop.
2975   emitMinimumIterationCountCheck(Lp, ScalarPH);
2976 
2977   // Generate the code to check any assumptions that we've made for SCEV
2978   // expressions.
2979   emitSCEVChecks(Lp, ScalarPH);
2980 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
2984   emitMemRuntimeChecks(Lp, ScalarPH);
2985 
2986   // Generate the induction variable.
2987   // The loop step is equal to the vectorization factor (num of SIMD elements)
2988   // times the unroll factor (num of SIMD instructions).
2989   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
2990   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
2991   Induction =
2992       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
2993                               getDebugLocFromInstOrOperands(OldInduction));
2994 
2995   // We are going to resume the execution of the scalar loop.
2996   // Go over all of the induction variables that we found and fix the
2997   // PHIs that are left in the scalar version of the loop.
2998   // The starting values of PHI nodes depend on the counter of the last
2999   // iteration in the vectorized loop.
3000   // If we come from a bypass edge then we need to start from the original
3001   // start value.
3002 
3003   // This variable saves the new starting index for the scalar loop. It is used
3004   // to test if there are any tail iterations left once the vector loop has
3005   // completed.
3006   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3007   for (auto &InductionEntry : *List) {
3008     PHINode *OrigPhi = InductionEntry.first;
3009     InductionDescriptor II = InductionEntry.second;
3010 
    // Create phi nodes to merge from the backedge-taken check block.
3012     PHINode *BCResumeVal = PHINode::Create(
3013         OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
3014     // Copy original phi DL over to the new one.
3015     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3016     Value *&EndValue = IVEndValues[OrigPhi];
3017     if (OrigPhi == OldInduction) {
3018       // We know what the end value is.
3019       EndValue = CountRoundDown;
3020     } else {
3021       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3022       Type *StepType = II.getStep()->getType();
3023       Instruction::CastOps CastOp =
3024         CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3025       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
3027       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3028       EndValue->setName("ind.end");
3029     }
3030 
3031     // The new PHI merges the original incoming value, in case of a bypass,
3032     // or the value at the end of the vectorized loop.
3033     BCResumeVal->addIncoming(EndValue, MiddleBlock);
3034 
3035     // Fix the scalar body counter (PHI node).
3036     // The old induction's phi node in the scalar body needs the truncated
3037     // value.
3038     for (BasicBlock *BB : LoopBypassBlocks)
3039       BCResumeVal->addIncoming(II.getStartValue(), BB);
3040     OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal);
3041   }
3042 
3043   // We need the OrigLoop (scalar loop part) latch terminator to help
3044   // produce correct debug info for the middle block BB instructions.
3045   // The legality check stage guarantees that the loop will have a single
3046   // latch.
3047   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3048          "Scalar loop latch terminator isn't a branch");
3049   BranchInst *ScalarLatchBr =
3050       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3051 
3052   // Add a check in the middle block to see if we have completed
3053   // all of the iterations in the first vector loop.
3054   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3055   // If tail is to be folded, we know we don't need to run the remainder.
3056   Value *CmpN = Builder.getTrue();
3057   if (!Cost->foldTailByMasking()) {
3058     CmpN =
3059         CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3060                         CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
3061 
3062     // Here we use the same DebugLoc as the scalar loop latch branch instead
3063     // of the corresponding compare because they may have ended up with
3064     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare got a line number inside the loop.
3066     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3067   }
3068 
3069   BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN);
3070   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3071   ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst);
3072 
3073   // Get ready to start creating new instructions into the vectorized body.
3074   Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
3075 
3076   // Save the state.
3077   LoopVectorPreHeader = Lp->getLoopPreheader();
3078   LoopScalarPreHeader = ScalarPH;
3079   LoopMiddleBlock = MiddleBlock;
3080   LoopExitBlock = ExitBlock;
3081   LoopVectorBody = VecBody;
3082   LoopScalarBody = OldBasicBlock;
3083 
3084   Optional<MDNode *> VectorizedLoopID =
3085       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3086                                       LLVMLoopVectorizeFollowupVectorized});
3087   if (VectorizedLoopID.hasValue()) {
3088     Lp->setLoopID(VectorizedLoopID.getValue());
3089 
3090     // Do not setAlreadyVectorized if loop attributes have been defined
3091     // explicitly.
3092     return LoopVectorPreHeader;
3093   }
3094 
3095   // Keep all loop hints from the original loop on the vector loop (we'll
3096   // replace the vectorizer-specific hints below).
3097   if (MDNode *LID = OrigLoop->getLoopID())
3098     Lp->setLoopID(LID);
3099 
3100   LoopVectorizeHints Hints(Lp, true, *ORE);
3101   Hints.setAlreadyVectorized();
3102 
3103   return LoopVectorPreHeader;
3104 }
3105 
3106 // Fix up external users of the induction variable. At this point, we are
3107 // in LCSSA form, with all external PHIs that use the IV having one input value,
3108 // coming from the remainder loop. We need those PHIs to also have a correct
3109 // value for the IV when arriving directly from the middle block.
3110 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3111                                        const InductionDescriptor &II,
3112                                        Value *CountRoundDown, Value *EndValue,
3113                                        BasicBlock *MiddleBlock) {
3114   // There are two kinds of external IV usages - those that use the value
3115   // computed in the last iteration (the PHI) and those that use the penultimate
3116   // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.
3118 
3119   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3120 
3121   DenseMap<Value *, Value *> MissingVals;
3122 
3123   // An external user of the last iteration's value should see the value that
3124   // the remainder loop uses to initialize its own IV.
3125   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3126   for (User *U : PostInc->users()) {
3127     Instruction *UI = cast<Instruction>(U);
3128     if (!OrigLoop->contains(UI)) {
3129       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3130       MissingVals[UI] = EndValue;
3131     }
3132   }
3133 
  // An external user of the penultimate value needs to see EndValue - Step.
3135   // The simplest way to get this is to recompute it from the constituent SCEVs,
3136   // that is Start + (Step * (CRD - 1)).
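  // For example, for an IV that starts at 0 with step 1 and a vector trip
  // count of 16, the escaping penultimate value is 0 + 1 * (16 - 1) = 15.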
3137   for (User *U : OrigPhi->users()) {
3138     auto *UI = cast<Instruction>(U);
3139     if (!OrigLoop->contains(UI)) {
3140       const DataLayout &DL =
3141           OrigLoop->getHeader()->getModule()->getDataLayout();
3142       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3143 
3144       IRBuilder<> B(MiddleBlock->getTerminator());
3145       Value *CountMinusOne = B.CreateSub(
3146           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3147       Value *CMO =
3148           !II.getStep()->getType()->isIntegerTy()
3149               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3150                              II.getStep()->getType())
3151               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3152       CMO->setName("cast.cmo");
3153       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3154       Escape->setName("ind.escape");
3155       MissingVals[UI] = Escape;
3156     }
3157   }
3158 
3159   for (auto &I : MissingVals) {
3160     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3162     // that is %IV2 = phi [...], [ %IV1, %latch ]
3163     // In this case, if IV1 has an external use, we need to avoid adding both
3164     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3165     // don't already have an incoming value for the middle block.
3166     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3167       PHI->addIncoming(I.second, MiddleBlock);
3168   }
3169 }
3170 
3171 namespace {
3172 
3173 struct CSEDenseMapInfo {
3174   static bool canHandle(const Instruction *I) {
3175     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3176            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3177   }
3178 
3179   static inline Instruction *getEmptyKey() {
3180     return DenseMapInfo<Instruction *>::getEmptyKey();
3181   }
3182 
3183   static inline Instruction *getTombstoneKey() {
3184     return DenseMapInfo<Instruction *>::getTombstoneKey();
3185   }
3186 
3187   static unsigned getHashValue(const Instruction *I) {
3188     assert(canHandle(I) && "Unknown instruction!");
3189     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3190                                                            I->value_op_end()));
3191   }
3192 
3193   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3194     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3195         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3196       return LHS == RHS;
3197     return LHS->isIdenticalTo(RHS);
3198   }
3199 };
3200 
3201 } // end anonymous namespace
3202 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3206   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3207   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3208     Instruction *In = &*I++;
3209 
3210     if (!CSEDenseMapInfo::canHandle(In))
3211       continue;
3212 
3213     // Check if we can replace this instruction with any of the
3214     // visited instructions.
3215     if (Instruction *V = CSEMap.lookup(In)) {
3216       In->replaceAllUsesWith(V);
3217       In->eraseFromParent();
3218       continue;
3219     }
3220 
3221     CSEMap[In] = In;
3222   }
3223 }
3224 
3225 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3226                                                        unsigned VF,
3227                                                        bool &NeedToScalarize) {
3228   Function *F = CI->getCalledFunction();
3229   StringRef FnName = CI->getCalledFunction()->getName();
3230   Type *ScalarRetTy = CI->getType();
3231   SmallVector<Type *, 4> Tys, ScalarTys;
3232   for (auto &ArgOp : CI->arg_operands())
3233     ScalarTys.push_back(ArgOp->getType());
3234 
3235   // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from them,
3237   // execute VF scalar calls, and then gather the result into the vector return
3238   // value.
3239   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3240   if (VF == 1)
3241     return ScalarCallCost;
3242 
3243   // Compute corresponding vector type for return value and arguments.
3244   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3245   for (Type *ScalarTy : ScalarTys)
3246     Tys.push_back(ToVectorTy(ScalarTy, VF));
3247 
3248   // Compute costs of unpacking argument values for the scalar calls and
3249   // packing the return values to a vector.
3250   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3251 
3252   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
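  // For example (hypothetical numbers): with VF = 4, ScalarCallCost = 10 and
  // ScalarizationCost = 6, the scalarized cost is 4 * 10 + 6 = 46, which is
  // returned unless a cheaper vector variant of the callee exists.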
3253 
3254   // If we can't emit a vector call for this function, then the currently found
3255   // cost is the cost we need to return.
3256   NeedToScalarize = true;
3257   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3258     return Cost;
3259 
3260   // If the corresponding vector cost is cheaper, return its cost.
3261   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3262   if (VectorCallCost < Cost) {
3263     NeedToScalarize = false;
3264     return VectorCallCost;
3265   }
3266   return Cost;
3267 }
3268 
3269 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3270                                                             unsigned VF) {
3271   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3272   assert(ID && "Expected intrinsic call!");
3273 
3274   FastMathFlags FMF;
3275   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3276     FMF = FPMO->getFastMathFlags();
3277 
3278   SmallVector<Value *, 4> Operands(CI->arg_operands());
3279   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3280 }
3281 
3282 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3283   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3284   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3285   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3286 }
3287 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3288   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3289   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3290   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3291 }
3292 
3293 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and re-extend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
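  // For example, if MinBWs records that a value only needs 8 bits and it was
  // widened to <4 x i32>, its operands are truncated to <4 x i8>, the
  // operation is redone on <4 x i8>, and the result is zero-extended back to
  // <4 x i32>.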
3297   SmallPtrSet<Value *, 4> Erased;
3298   for (const auto &KV : Cost->getMinimalBitwidths()) {
3299     // If the value wasn't vectorized, we must maintain the original scalar
3300     // type. The absence of the value from VectorLoopValueMap indicates that it
3301     // wasn't vectorized.
3302     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3303       continue;
3304     for (unsigned Part = 0; Part < UF; ++Part) {
3305       Value *I = getOrCreateVectorValue(KV.first, Part);
3306       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3307           !isa<Instruction>(I))
3308         continue;
3309       Type *OriginalTy = I->getType();
3310       Type *ScalarTruncatedTy =
3311           IntegerType::get(OriginalTy->getContext(), KV.second);
3312       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3313                                           OriginalTy->getVectorNumElements());
3314       if (TruncatedTy == OriginalTy)
3315         continue;
3316 
3317       IRBuilder<> B(cast<Instruction>(I));
3318       auto ShrinkOperand = [&](Value *V) -> Value * {
3319         if (auto *ZI = dyn_cast<ZExtInst>(V))
3320           if (ZI->getSrcTy() == TruncatedTy)
3321             return ZI->getOperand(0);
3322         return B.CreateZExtOrTrunc(V, TruncatedTy);
3323       };
3324 
3325       // The actual instruction modification depends on the instruction type,
3326       // unfortunately.
3327       Value *NewI = nullptr;
3328       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3329         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3330                              ShrinkOperand(BO->getOperand(1)));
3331 
3332         // Any wrapping introduced by shrinking this operation shouldn't be
3333         // considered undefined behavior. So, we can't unconditionally copy
3334         // arithmetic wrapping flags to NewI.
3335         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3336       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3337         NewI =
3338             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3339                          ShrinkOperand(CI->getOperand(1)));
3340       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3341         NewI = B.CreateSelect(SI->getCondition(),
3342                               ShrinkOperand(SI->getTrueValue()),
3343                               ShrinkOperand(SI->getFalseValue()));
3344       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3345         switch (CI->getOpcode()) {
3346         default:
3347           llvm_unreachable("Unhandled cast!");
3348         case Instruction::Trunc:
3349           NewI = ShrinkOperand(CI->getOperand(0));
3350           break;
3351         case Instruction::SExt:
3352           NewI = B.CreateSExtOrTrunc(
3353               CI->getOperand(0),
3354               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3355           break;
3356         case Instruction::ZExt:
3357           NewI = B.CreateZExtOrTrunc(
3358               CI->getOperand(0),
3359               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3360           break;
3361         }
3362       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3363         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3364         auto *O0 = B.CreateZExtOrTrunc(
3365             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3366         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3367         auto *O1 = B.CreateZExtOrTrunc(
3368             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3369 
3370         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3371       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3372         // Don't do anything with the operands, just extend the result.
3373         continue;
3374       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3375         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3376         auto *O0 = B.CreateZExtOrTrunc(
3377             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3378         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3379         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3380       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3381         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3382         auto *O0 = B.CreateZExtOrTrunc(
3383             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3384         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3385       } else {
3386         // If we don't know what to do, be conservative and don't do anything.
3387         continue;
3388       }
3389 
3390       // Lastly, extend the result.
3391       NewI->takeName(cast<Instruction>(I));
3392       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3393       I->replaceAllUsesWith(Res);
3394       cast<Instruction>(I)->eraseFromParent();
3395       Erased.insert(I);
3396       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3397     }
3398   }
3399 
3400   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3401   for (const auto &KV : Cost->getMinimalBitwidths()) {
3402     // If the value wasn't vectorized, we must maintain the original scalar
3403     // type. The absence of the value from VectorLoopValueMap indicates that it
3404     // wasn't vectorized.
3405     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3406       continue;
3407     for (unsigned Part = 0; Part < UF; ++Part) {
3408       Value *I = getOrCreateVectorValue(KV.first, Part);
3409       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3410       if (Inst && Inst->use_empty()) {
3411         Value *NewI = Inst->getOperand(0);
3412         Inst->eraseFromParent();
3413         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3414       }
3415     }
3416   }
3417 }
3418 
3419 void InnerLoopVectorizer::fixVectorizedLoop() {
3420   // Insert truncates and extends for any truncated instructions as hints to
3421   // InstCombine.
3422   if (VF > 1)
3423     truncateToMinimalBitwidths();
3424 
3425   // Fix widened non-induction PHIs by setting up the PHI operands.
3426   if (OrigPHIsToFix.size()) {
3427     assert(EnableVPlanNativePath &&
3428            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3429     fixNonInductionPHIs();
3430   }
3431 
3432   // At this point every instruction in the original loop is widened to a
3433   // vector form. Now we need to fix the recurrences in the loop. These PHI
3434   // nodes are currently empty because we did not want to introduce cycles.
3435   // This is the second stage of vectorizing recurrences.
3436   fixCrossIterationPHIs();
3437 
3438   // Update the dominator tree.
3439   //
3440   // FIXME: After creating the structure of the new loop, the dominator tree is
3441   //        no longer up-to-date, and it remains that way until we update it
3442   //        here. An out-of-date dominator tree is problematic for SCEV,
3443   //        because SCEVExpander uses it to guide code generation. The
  //        vectorizer uses SCEVExpander in several places. Instead, we should
3445   //        keep the dominator tree up-to-date as we go.
3446   updateAnalysis();
3447 
3448   // Fix-up external users of the induction variables.
3449   for (auto &Entry : *Legal->getInductionVars())
3450     fixupIVUsers(Entry.first, Entry.second,
3451                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3452                  IVEndValues[Entry.first], LoopMiddleBlock);
3453 
3454   fixLCSSAPHIs();
3455   for (Instruction *PI : PredicatedInstructions)
3456     sinkScalarOperands(&*PI);
3457 
3458   // Remove redundant induction instructions.
3459   cse(LoopVectorBody);
3460 }
3461 
3462 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3463   // In order to support recurrences we need to be able to vectorize Phi nodes.
3464   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3465   // stage #2: We now need to fix the recurrences by adding incoming edges to
3466   // the currently empty PHI nodes. At this point every instruction in the
3467   // original loop is widened to a vector form so we can use them to construct
3468   // the incoming edges.
3469   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3470     // Handle first-order recurrences and reductions that need to be fixed.
3471     if (Legal->isFirstOrderRecurrence(&Phi))
3472       fixFirstOrderRecurrence(&Phi);
3473     else if (Legal->isReductionVariable(&Phi))
3474       fixReduction(&Phi);
3475   }
3476 }
3477 
3478 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3479   // This is the second phase of vectorizing first-order recurrences. An
3480   // overview of the transformation is described below. Suppose we have the
3481   // following loop.
3482   //
3483   //   for (int i = 0; i < n; ++i)
3484   //     b[i] = a[i] - a[i - 1];
3485   //
3486   // There is a first-order recurrence on "a". For this loop, the shorthand
3487   // scalar IR looks like:
3488   //
3489   //   scalar.ph:
3490   //     s_init = a[-1]
3491   //     br scalar.body
3492   //
3493   //   scalar.body:
3494   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3495   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3496   //     s2 = a[i]
3497   //     b[i] = s2 - s1
3498   //     br cond, scalar.body, ...
3499   //
  // In this example, s1 is a recurrence because its value depends on the
3501   // previous iteration. In the first phase of vectorization, we created a
3502   // temporary value for s1. We now complete the vectorization and produce the
3503   // shorthand vector IR shown below (for VF = 4, UF = 1).
3504   //
3505   //   vector.ph:
3506   //     v_init = vector(..., ..., ..., a[-1])
3507   //     br vector.body
3508   //
3509   //   vector.body
3510   //     i = phi [0, vector.ph], [i+4, vector.body]
3511   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3512   //     v2 = a[i, i+1, i+2, i+3];
3513   //     v3 = vector(v1(3), v2(0, 1, 2))
3514   //     b[i, i+1, i+2, i+3] = v2 - v3
3515   //     br cond, vector.body, middle.block
3516   //
3517   //   middle.block:
3518   //     x = v2(3)
3519   //     br scalar.ph
3520   //
3521   //   scalar.ph:
3522   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3523   //     br scalar.body
3524   //
  // After the vector loop finishes executing, we extract the next value of
3526   // the recurrence (x) to use as the initial value in the scalar loop.
3527 
3528   // Get the original loop preheader and single loop latch.
3529   auto *Preheader = OrigLoop->getLoopPreheader();
3530   auto *Latch = OrigLoop->getLoopLatch();
3531 
3532   // Get the initial and previous values of the scalar recurrence.
3533   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3534   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3535 
3536   // Create a vector from the initial value.
3537   auto *VectorInit = ScalarInit;
3538   if (VF > 1) {
3539     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3540     VectorInit = Builder.CreateInsertElement(
3541         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3542         Builder.getInt32(VF - 1), "vector.recur.init");
3543   }
3544 
3545   // We constructed a temporary phi node in the first phase of vectorization.
3546   // This phi node will eventually be deleted.
3547   Builder.SetInsertPoint(
3548       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3549 
3550   // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or a loop-varying vector value.
3552   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3553   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3554 
3555   // Get the vectorized previous value of the last part UF - 1. It appears last
3556   // among all unrolled iterations, due to the order of their construction.
3557   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3558 
3559   // Find and set the insertion point after the previous value if it is an
3560   // instruction.
3561   BasicBlock::iterator InsertPt;
3562   // Note that the previous value may have been constant-folded so it is not
3563   // guaranteed to be an instruction in the vector loop.
3564   // FIXME: Loop invariant values do not form recurrences. We should deal with
3565   //        them earlier.
3566   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3567     InsertPt = LoopVectorBody->getFirstInsertionPt();
3568   else {
3569     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3570     if (isa<PHINode>(PreviousLastPart))
3571       // If the previous value is a phi node, we should insert after all the phi
3572       // nodes in the block containing the PHI to avoid breaking basic block
3573       // verification. Note that the basic block may be different to
3574       // LoopVectorBody, in case we predicate the loop.
3575       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3576     else
3577       InsertPt = ++PreviousInst->getIterator();
3578   }
3579   Builder.SetInsertPoint(&*InsertPt);
3580 
3581   // We will construct a vector for the recurrence by combining the values for
3582   // the current and previous iterations. This is the required shuffle mask.
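  // For example, for VF = 4 the mask is <3, 4, 5, 6>: the last element of
  // the incoming vector followed by the first three elements of the previous
  // value, matching v3 = vector(v1(3), v2(0, 1, 2)) in the sketch above.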
3583   SmallVector<Constant *, 8> ShuffleMask(VF);
3584   ShuffleMask[0] = Builder.getInt32(VF - 1);
3585   for (unsigned I = 1; I < VF; ++I)
3586     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3587 
3588   // The vector from which to take the initial value for the current iteration
3589   // (actual or unrolled). Initially, this is the vector phi node.
3590   Value *Incoming = VecPhi;
3591 
3592   // Shuffle the current and previous vector and update the vector parts.
3593   for (unsigned Part = 0; Part < UF; ++Part) {
3594     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3595     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3596     auto *Shuffle =
3597         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3598                                              ConstantVector::get(ShuffleMask))
3599                : Incoming;
3600     PhiPart->replaceAllUsesWith(Shuffle);
3601     cast<Instruction>(PhiPart)->eraseFromParent();
3602     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3603     Incoming = PreviousPart;
3604   }
3605 
3606   // Fix the latch value of the new recurrence in the vector loop.
3607   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3608 
3609   // Extract the last vector element in the middle block. This will be the
3610   // initial value for the recurrence when jumping to the scalar loop.
3611   auto *ExtractForScalar = Incoming;
3612   if (VF > 1) {
3613     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3614     ExtractForScalar = Builder.CreateExtractElement(
3615         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3616   }
  // Extract the second-to-last element in the middle block if the
3618   // Phi is used outside the loop. We need to extract the phi itself
3619   // and not the last element (the phi update in the current iteration). This
3620   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3621   // when the scalar loop is not run at all.
3622   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3623   if (VF > 1)
3624     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3625         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
3630   else if (UF > 1)
3631     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3632 
3633   // Fix the initial value of the original recurrence in the scalar loop.
3634   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3635   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3636   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3637     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3638     Start->addIncoming(Incoming, BB);
3639   }
3640 
3641   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3642   Phi->setName("scalar.recur");
3643 
3644   // Finally, fix users of the recurrence outside the loop. The users will need
3645   // either the last value of the scalar recurrence or the last value of the
3646   // vector recurrence we extracted in the middle block. Since the loop is in
3647   // LCSSA form, we just need to find all the phi nodes for the original scalar
3648   // recurrence in the exit block, and then add an edge for the middle block.
3649   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3650     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3651       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3652     }
3653   }
3654 }
3655 
3656 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3657   Constant *Zero = Builder.getInt32(0);
3658 
  // Get its reduction variable descriptor.
3660   assert(Legal->isReductionVariable(Phi) &&
3661          "Unable to find the reduction variable");
3662   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3663 
3664   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3665   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3666   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3667   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3668     RdxDesc.getMinMaxRecurrenceKind();
3669   setDebugLocFromInst(Builder, ReductionStartValue);
3670 
3671   // We need to generate a reduction vector from the incoming scalar.
3672   // To do so, we need to generate the 'identity' vector and override
3673   // one of the elements with the incoming scalar reduction. We need
3674   // to do it in the vector-loop preheader.
3675   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3676 
3677   // This is the vector-clone of the value that leaves the loop.
3678   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3679 
  // Find the reduction identity variable. Zero for addition, or and xor;
  // one for multiplication; -1 (all ones) for and.
3682   Value *Identity;
3683   Value *VectorStart;
3684   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3685       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3687     if (VF == 1) {
3688       VectorStart = Identity = ReductionStartValue;
3689     } else {
3690       VectorStart = Identity =
3691         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3692     }
3693   } else {
3694     // Handle other reduction kinds:
3695     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3696         RK, VecTy->getScalarType());
3697     if (VF == 1) {
3698       Identity = Iden;
3699       // This vector is the Identity vector where the first element is the
3700       // incoming scalar reduction.
3701       VectorStart = ReductionStartValue;
3702     } else {
3703       Identity = ConstantVector::getSplat(VF, Iden);
3704 
3705       // This vector is the Identity vector where the first element is the
3706       // incoming scalar reduction.
3707       VectorStart =
3708         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3709     }
3710   }
3711 
3712   // Fix the vector-loop phi.
3713 
3714   // Reductions do not have to start at zero. They can start with
3715   // any loop invariant values.
3716   BasicBlock *Latch = OrigLoop->getLoopLatch();
3717   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3718   for (unsigned Part = 0; Part < UF; ++Part) {
3719     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3720     Value *Val = getOrCreateVectorValue(LoopVal, Part);
    // Make sure to add the reduction start value only to the
    // first unroll part.
3723     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3724     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3725     cast<PHINode>(VecRdxPhi)
3726       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3727   }
3728 
3729   // Before each round, move the insertion point right between
3730   // the PHIs and the values we are going to write.
3731   // This allows us to write both PHINodes and the extractelement
3732   // instructions.
3733   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3734 
3735   setDebugLocFromInst(Builder, LoopExitInst);
3736 
3737   // If tail is folded by masking, the vector value to leave the loop should be
3738   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3739   // instead of the former.
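  // For example (illustrative names), a part's vectorized update
  // %vec.rdx.next that is used only by
  // 'select <VF x i1> %mask, %vec.rdx.next, %vec.rdx.phi' and the header phi
  // has that select recorded as the part's exit value instead.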
3740   if (Cost->foldTailByMasking()) {
3741     for (unsigned Part = 0; Part < UF; ++Part) {
3742       Value *VecLoopExitInst =
3743           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3744       Value *Sel = nullptr;
3745       for (User *U : VecLoopExitInst->users()) {
3746         if (isa<SelectInst>(U)) {
3747           assert(!Sel && "Reduction exit feeding two selects");
3748           Sel = U;
3749         } else
3750           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3751       }
3752       assert(Sel && "Reduction exit feeds no select");
3753       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3754     }
3755   }
3756 
3757   // If the vector reduction can be performed in a smaller type, we truncate
3758   // then extend the loop exit value to enable InstCombine to evaluate the
3759   // entire expression in the smaller type.
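  // For example, an i32 add reduction whose values fit in i8: each part's
  // exit value is truncated to <VF x i8> and sign/zero-extended back to
  // <VF x i32>, exposing the pattern InstCombine needs to evaluate the chain
  // in the narrower type.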
3760   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3761     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3762     Builder.SetInsertPoint(
3763         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3764     VectorParts RdxParts(UF);
3765     for (unsigned Part = 0; Part < UF; ++Part) {
3766       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3767       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3768       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3769                                         : Builder.CreateZExt(Trunc, VecTy);
3770       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3771            UI != RdxParts[Part]->user_end();)
3772         if (*UI != Trunc) {
3773           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3774           RdxParts[Part] = Extnd;
3775         } else {
3776           ++UI;
3777         }
3778     }
3779     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3780     for (unsigned Part = 0; Part < UF; ++Part) {
3781       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3782       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3783     }
3784   }
3785 
3786   // Reduce all of the unrolled parts into a single vector.
3787   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3788   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3789 
3790   // The middle block terminator has already been assigned a DebugLoc here (the
3791   // OrigLoop's single latch terminator). We want the whole middle block to
3792   // appear to execute on this line because: (a) it is all compiler generated,
3793   // (b) these instructions are always executed after evaluating the latch
3794   // conditional branch, and (c) other passes may add new predecessors which
3795   // terminate on this line. This is the easiest way to ensure we don't
3796   // accidentally cause an extra step back into the loop while debugging.
3797   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
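  // For example, with UF == 2 an add reduction combines the two parts with a
  // single 'add' (named "bin.rdx" below); min/max reductions combine them
  // with a compare-and-select instead.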
3798   for (unsigned Part = 1; Part < UF; ++Part) {
3799     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3800     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3801       // Floating point operations had to be 'fast' to enable the reduction.
3802       ReducedPartRdx = addFastMathFlag(
3803           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3804                               ReducedPartRdx, "bin.rdx"),
3805           RdxDesc.getFastMathFlags());
3806     else
3807       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3808                                       RdxPart);
3809   }
3810 
3811   if (VF > 1) {
3812     bool NoNaN = Legal->hasFunNoNaNAttr();
3813     ReducedPartRdx =
3814         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3815     // If the reduction can be performed in a smaller type, we need to extend
3816     // the reduction to the wider type before we branch to the original loop.
3817     if (Phi->getType() != RdxDesc.getRecurrenceType())
3818       ReducedPartRdx =
3819         RdxDesc.isSigned()
3820         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3821         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3822   }
3823 
3824   // Create a phi node that merges control-flow from the backedge-taken check
3825   // block and the middle block.
3826   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3827                                         LoopScalarPreHeader->getTerminator());
3828   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3829     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3830   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3831 
3832   // Now, we need to fix the users of the reduction variable
3833   // inside and outside of the scalar remainder loop.
3834   // We know that the loop is in LCSSA form. We need to update the
3835   // PHI nodes in the exit blocks.
3836   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3837     // All PHINodes need to have a single entry edge, or two if
3838     // we already fixed them.
3839     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3840 
3841     // We found a reduction value exit-PHI. Update it with the
3842     // incoming bypass edge.
3843     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3844       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3845   } // end of the LCSSA phi scan.
3846 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
3849   int IncomingEdgeBlockIdx =
3850     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3851   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3852   // Pick the other block.
3853   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3854   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3855   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3856 }
3857 
3858 void InnerLoopVectorizer::fixLCSSAPHIs() {
3859   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3860     if (LCSSAPhi.getNumIncomingValues() == 1) {
3861       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values have a single value; lane zero suffices.
3863       unsigned LastLane = 0;
3864       if (isa<Instruction>(IncomingValue))
3865           LastLane = Cost->isUniformAfterVectorization(
3866                          cast<Instruction>(IncomingValue), VF)
3867                          ? 0
3868                          : VF - 1;
3869       // Can be a loop invariant incoming value or the last scalar value to be
3870       // extracted from the vectorized loop.
3871       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3872       Value *lastIncomingValue =
3873           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3874       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3875     }
3876   }
3877 }
3878 
3879 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3880   // The basic block and loop containing the predicated instruction.
3881   auto *PredBB = PredInst->getParent();
3882   auto *VectorLoop = LI->getLoopFor(PredBB);
3883 
3884   // Initialize a worklist with the operands of the predicated instruction.
3885   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3886 
3887   // Holds instructions that we need to analyze again. An instruction may be
3888   // reanalyzed if we don't yet know if we can sink it or not.
3889   SmallVector<Instruction *, 8> InstsToReanalyze;
3890 
3891   // Returns true if a given use occurs in the predicated block. Phi nodes use
3892   // their operands in their corresponding predecessor blocks.
3893   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3894     auto *I = cast<Instruction>(U.getUser());
3895     BasicBlock *BB = I->getParent();
3896     if (auto *Phi = dyn_cast<PHINode>(I))
3897       BB = Phi->getIncomingBlock(
3898           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3899     return BB == PredBB;
3900   };
3901 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass over the worklist does not sink a single instruction.
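  // For example, sinking an add whose only user is the predicated instruction
  // may leave the instruction defining one of the add's operands used solely
  // inside the predicated block, so it becomes sinkable on the next pass.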
3906   bool Changed;
3907   do {
3908     // Add the instructions that need to be reanalyzed to the worklist, and
3909     // reset the changed indicator.
3910     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3911     InstsToReanalyze.clear();
3912     Changed = false;
3913 
3914     while (!Worklist.empty()) {
3915       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3916 
3917       // We can't sink an instruction if it is a phi node, is already in the
3918       // predicated block, is not in the loop, or may have side effects.
3919       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3920           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3921         continue;
3922 
3923       // It's legal to sink the instruction if all its uses occur in the
3924       // predicated block. Otherwise, there's nothing to do yet, and we may
3925       // need to reanalyze the instruction.
3926       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3927         InstsToReanalyze.push_back(I);
3928         continue;
3929       }
3930 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
3933       I->moveBefore(&*PredBB->getFirstInsertionPt());
3934       Worklist.insert(I->op_begin(), I->op_end());
3935 
3936       // The sinking may have enabled other instructions to be sunk, so we will
3937       // need to iterate.
3938       Changed = true;
3939     }
3940   } while (Changed);
3941 }
3942 
3943 void InnerLoopVectorizer::fixNonInductionPHIs() {
3944   for (PHINode *OrigPhi : OrigPHIsToFix) {
3945     PHINode *NewPhi =
3946         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3947     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3948 
3949     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3950         predecessors(OrigPhi->getParent()));
3951     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3952         predecessors(NewPhi->getParent()));
3953     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3954            "Scalar and Vector BB should have the same number of predecessors");
3955 
3956     // The insertion point in Builder may be invalidated by the time we get
3957     // here. Force the Builder insertion point to something valid so that we do
3958     // not run into issues during insertion point restore in
3959     // getOrCreateVectorValue calls below.
3960     Builder.SetInsertPoint(NewPhi);
3961 
3962     // The predecessor order is preserved and we can rely on mapping between
3963     // scalar and vector block predecessors.
3964     for (unsigned i = 0; i < NumIncomingValues; ++i) {
3965       BasicBlock *NewPredBB = VectorBBPredecessors[i];
3966 
3967       // When looking up the new scalar/vector values to fix up, use incoming
3968       // values from original phi.
3969       Value *ScIncV =
3970           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3971 
3972       // Scalar incoming value may need a broadcast
3973       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3974       NewPhi->addIncoming(NewIncV, NewPredBB);
3975     }
3976   }
3977 }
3978 
3979 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
3980                                    unsigned VF, bool IsPtrLoopInvariant,
3981                                    SmallBitVector &IsIndexLoopInvariant) {
3982   // Construct a vector GEP by widening the operands of the scalar GEP as
3983   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
3984   // results in a vector of pointers when at least one operand of the GEP
3985   // is vector-typed. Thus, to keep the representation compact, we only use
3986   // vector-typed operands for loop-varying values.
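  // For example (VF == 4, illustrative IR), the scalar
  //   getelementptr i32, i32* %base, i64 %iv
  // with a loop-invariant %base is widened to
  //   getelementptr i32, i32* %base, <4 x i64> %vec.iv
  // which produces a <4 x i32*> vector of pointers.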
3987 
3988   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
3989     // If we are vectorizing, but the GEP has only loop-invariant operands,
3990     // the GEP we build (by only using vector-typed operands for
3991     // loop-varying values) would be a scalar pointer. Thus, to ensure we
3992     // produce a vector of pointers, we need to either arbitrarily pick an
3993     // operand to broadcast, or broadcast a clone of the original GEP.
3994     // Here, we broadcast a clone of the original.
3995     //
3996     // TODO: If at some point we decide to scalarize instructions having
3997     //       loop-invariant operands, this special case will no longer be
3998     //       required. We would add the scalarization decision to
3999     //       collectLoopScalars() and teach getVectorValue() to broadcast
4000     //       the lane-zero scalar value.
4001     auto *Clone = Builder.Insert(GEP->clone());
4002     for (unsigned Part = 0; Part < UF; ++Part) {
4003       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4004       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4005       addMetadata(EntryPart, GEP);
4006     }
4007   } else {
4008     // If the GEP has at least one loop-varying operand, we are sure to
4009     // produce a vector of pointers. But if we are only unrolling, we want
4010     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4011     // produce with the code below will be scalar (if VF == 1) or vector
4012     // (otherwise). Note that for the unroll-only case, we still maintain
4013     // values in the vector mapping with initVector, as we do for other
4014     // instructions.
4015     for (unsigned Part = 0; Part < UF; ++Part) {
4016       // The pointer operand of the new GEP. If it's loop-invariant, we
4017       // won't broadcast it.
4018       auto *Ptr = IsPtrLoopInvariant
4019                       ? GEP->getPointerOperand()
4020                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4021 
4022       // Collect all the indices for the new GEP. If any index is
4023       // loop-invariant, we won't broadcast it.
4024       SmallVector<Value *, 4> Indices;
4025       for (auto Index : enumerate(GEP->indices())) {
4026         Value *User = Index.value().get();
4027         if (IsIndexLoopInvariant[Index.index()])
4028           Indices.push_back(User);
4029         else
4030           Indices.push_back(getOrCreateVectorValue(User, Part));
4031       }
4032 
4033       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4034       // but it should be a vector, otherwise.
4035       auto *NewGEP =
4036           GEP->isInBounds()
4037               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4038                                           Indices)
4039               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4040       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4041              "NewGEP is not a pointer vector");
4042       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4043       addMetadata(NewGEP, GEP);
4044     }
4045   }
4046 }
4047 
4048 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4049                                               unsigned VF) {
4050   PHINode *P = cast<PHINode>(PN);
4051   if (EnableVPlanNativePath) {
4052     // Currently we enter here in the VPlan-native path for non-induction
4053     // PHIs where all control flow is uniform. We simply widen these PHIs.
4054     // Create a vector phi with no operands - the vector phi operands will be
4055     // set at the end of vector code generation.
4056     Type *VecTy =
4057         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4058     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4059     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4060     OrigPHIsToFix.push_back(P);
4061 
4062     return;
4063   }
4064 
4065   assert(PN->getParent() == OrigLoop->getHeader() &&
4066          "Non-header phis should have been handled elsewhere");
4067 
4068   // In order to support recurrences we need to be able to vectorize Phi nodes.
4069   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4070   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4071   // this value when we vectorize all of the instructions that use the PHI.
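  // For example, a reduction phi is widened here into a "vec.phi" of type
  // <VF x Ty> per unroll part, with no incoming values yet; the incoming
  // values are added later, in fixReduction() or fixFirstOrderRecurrence().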
4072   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4073     for (unsigned Part = 0; Part < UF; ++Part) {
4074       // This is phase one of vectorizing PHIs.
4075       Type *VecTy =
4076           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4077       Value *EntryPart = PHINode::Create(
4078           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4079       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4080     }
4081     return;
4082   }
4083 
4084   setDebugLocFromInst(Builder, P);
4085 
4086   // This PHINode must be an induction variable.
4087   // Make sure that we know about it.
4088   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4089 
4090   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4091   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4092 
4093   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4094   // which can be found from the original scalar operations.
4095   switch (II.getKind()) {
4096   case InductionDescriptor::IK_NoInduction:
4097     llvm_unreachable("Unknown induction");
4098   case InductionDescriptor::IK_IntInduction:
4099   case InductionDescriptor::IK_FpInduction:
4100     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4101   case InductionDescriptor::IK_PtrInduction: {
4102     // Handle the pointer induction variable case.
4103     assert(P->getType()->isPointerTy() && "Unexpected type.");
4104     // This is the normalized GEP that starts counting at zero.
4105     Value *PtrInd = Induction;
4106     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4107     // Determine the number of scalars we need to generate for each unroll
4108     // iteration. If the instruction is uniform, we only need to generate the
4109     // first lane. Otherwise, we generate all VF values.
4110     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4111     // These are the scalar results. Notice that we don't generate vector GEPs
4112     // because scalar GEPs result in better code.
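    // For example, with UF == 2, VF == 4 and a non-uniform pointer induction,
    // we emit eight scalar pointers named "next.gep", one per (part, lane),
    // computed from the indices PtrInd + 0 .. PtrInd + 7.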
4113     for (unsigned Part = 0; Part < UF; ++Part) {
4114       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4115         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4116         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4117         Value *SclrGep =
4118             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4119         SclrGep->setName("next.gep");
4120         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4121       }
4122     }
4123     return;
4124   }
4125   }
4126 }
4127 
4128 /// A helper function for checking whether an integer division-related
4129 /// instruction may divide by zero (in which case it must be predicated if
4130 /// executed conditionally in the scalar code).
4131 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4133 /// converted into multiplication, so we will still end up scalarizing
4134 /// the division, but can do so w/o predication.
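/// For example, 'udiv i32 %x, %n' with an unknown %n may divide by zero and
/// must be predicated, whereas 'udiv i32 %x, 7' cannot and need not be.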
4135 static bool mayDivideByZero(Instruction &I) {
4136   assert((I.getOpcode() == Instruction::UDiv ||
4137           I.getOpcode() == Instruction::SDiv ||
4138           I.getOpcode() == Instruction::URem ||
4139           I.getOpcode() == Instruction::SRem) &&
4140          "Unexpected instruction");
4141   Value *Divisor = I.getOperand(1);
4142   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4143   return !CInt || CInt->isZero();
4144 }
4145 
4146 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4147   switch (I.getOpcode()) {
4148   case Instruction::Br:
4149   case Instruction::PHI:
4150   case Instruction::GetElementPtr:
4151     llvm_unreachable("This instruction is handled by a different recipe.");
4152   case Instruction::UDiv:
4153   case Instruction::SDiv:
4154   case Instruction::SRem:
4155   case Instruction::URem:
4156   case Instruction::Add:
4157   case Instruction::FAdd:
4158   case Instruction::Sub:
4159   case Instruction::FSub:
4160   case Instruction::FNeg:
4161   case Instruction::Mul:
4162   case Instruction::FMul:
4163   case Instruction::FDiv:
4164   case Instruction::FRem:
4165   case Instruction::Shl:
4166   case Instruction::LShr:
4167   case Instruction::AShr:
4168   case Instruction::And:
4169   case Instruction::Or:
4170   case Instruction::Xor: {
4171     // Just widen unops and binops.
4172     setDebugLocFromInst(Builder, &I);
4173 
4174     for (unsigned Part = 0; Part < UF; ++Part) {
4175       SmallVector<Value *, 2> Ops;
4176       for (Value *Op : I.operands())
4177         Ops.push_back(getOrCreateVectorValue(Op, Part));
4178 
4179       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4180 
4181       if (auto *VecOp = dyn_cast<Instruction>(V))
4182         VecOp->copyIRFlags(&I);
4183 
4184       // Use this vector value for all users of the original instruction.
4185       VectorLoopValueMap.setVectorValue(&I, Part, V);
4186       addMetadata(V, &I);
4187     }
4188 
4189     break;
4190   }
4191   case Instruction::Select: {
4192     // Widen selects.
4193     // If the selector is loop invariant we can create a select
4194     // instruction with a scalar condition. Otherwise, use vector-select.
4195     auto *SE = PSE.getSE();
4196     bool InvariantCond =
4197         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4198     setDebugLocFromInst(Builder, &I);
4199 
    // The condition can be loop invariant but still defined inside the
4201     // loop. This means that we can't just use the original 'cond' value.
4202     // We have to take the 'vectorized' value and pick the first lane.
4203     // Instcombine will make this a no-op.
4204 
4205     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4206 
4207     for (unsigned Part = 0; Part < UF; ++Part) {
4208       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4209       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4210       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4211       Value *Sel =
4212           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4213       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4214       addMetadata(Sel, &I);
4215     }
4216 
4217     break;
4218   }
4219 
4220   case Instruction::ICmp:
4221   case Instruction::FCmp: {
4222     // Widen compares. Generate vector compares.
4223     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4224     auto *Cmp = cast<CmpInst>(&I);
4225     setDebugLocFromInst(Builder, Cmp);
4226     for (unsigned Part = 0; Part < UF; ++Part) {
4227       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4228       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4229       Value *C = nullptr;
4230       if (FCmp) {
4231         // Propagate fast math flags.
4232         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4233         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4234         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4235       } else {
4236         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4237       }
4238       VectorLoopValueMap.setVectorValue(&I, Part, C);
4239       addMetadata(C, &I);
4240     }
4241 
4242     break;
4243   }
4244 
4245   case Instruction::ZExt:
4246   case Instruction::SExt:
4247   case Instruction::FPToUI:
4248   case Instruction::FPToSI:
4249   case Instruction::FPExt:
4250   case Instruction::PtrToInt:
4251   case Instruction::IntToPtr:
4252   case Instruction::SIToFP:
4253   case Instruction::UIToFP:
4254   case Instruction::Trunc:
4255   case Instruction::FPTrunc:
4256   case Instruction::BitCast: {
4257     auto *CI = cast<CastInst>(&I);
4258     setDebugLocFromInst(Builder, CI);
4259 
    // Vectorize casts.
4261     Type *DestTy =
4262         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4263 
4264     for (unsigned Part = 0; Part < UF; ++Part) {
4265       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4266       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4267       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4268       addMetadata(Cast, &I);
4269     }
4270     break;
4271   }
4272 
4273   case Instruction::Call: {
4274     // Ignore dbg intrinsics.
4275     if (isa<DbgInfoIntrinsic>(I))
4276       break;
4277     setDebugLocFromInst(Builder, &I);
4278 
4279     Module *M = I.getParent()->getParent()->getParent();
4280     auto *CI = cast<CallInst>(&I);
4281 
4282     StringRef FnName = CI->getCalledFunction()->getName();
4283     Function *F = CI->getCalledFunction();
4284     Type *RetTy = ToVectorTy(CI->getType(), VF);
4285     SmallVector<Type *, 4> Tys;
4286     for (Value *ArgOperand : CI->arg_operands())
4287       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4288 
4289     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4290 
    // The UseVectorIntrinsic flag below records whether we widen this call
    // with a vector intrinsic or with a vectorized (library) call, whichever
    // the cost model estimates to be cheaper.
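    // For example, a call to sinf may map either to an llvm.sin vector
    // intrinsic or to a vector math-library routine provided through TLI,
    // when such a routine exists.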
4294     bool NeedToScalarize;
4295     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4296     bool UseVectorIntrinsic =
4297         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4298     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4299            "Instruction should be scalarized elsewhere.");
4300 
4301     for (unsigned Part = 0; Part < UF; ++Part) {
4302       SmallVector<Value *, 4> Args;
4303       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4304         Value *Arg = CI->getArgOperand(i);
4305         // Some intrinsics have a scalar argument - don't replace it with a
4306         // vector.
4307         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4308           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4309         Args.push_back(Arg);
4310       }
4311 
4312       Function *VectorF;
4313       if (UseVectorIntrinsic) {
4314         // Use vector version of the intrinsic.
4315         Type *TysForDecl[] = {CI->getType()};
4316         if (VF > 1)
4317           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4318         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4319       } else {
4320         // Use vector version of the library call.
4321         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4322         assert(!VFnName.empty() && "Vector function name is empty.");
4323         VectorF = M->getFunction(VFnName);
4324         if (!VectorF) {
4325           // Generate a declaration
4326           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4327           VectorF =
4328               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4329           VectorF->copyAttributesFrom(F);
4330         }
4331       }
4332       assert(VectorF && "Can't create vector function.");
4333 
4334       SmallVector<OperandBundleDef, 1> OpBundles;
4335       CI->getOperandBundlesAsDefs(OpBundles);
4336       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4337 
4338       if (isa<FPMathOperator>(V))
4339         V->copyFastMathFlags(CI);
4340 
4341       VectorLoopValueMap.setVectorValue(&I, Part, V);
4342       addMetadata(V, &I);
4343     }
4344 
4345     break;
4346   }
4347 
4348   default:
4349     // This instruction is not vectorized by simple widening.
4350     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4351     llvm_unreachable("Unhandled instruction!");
4352   } // end of switch.
4353 }
4354 
4355 void InnerLoopVectorizer::updateAnalysis() {
4356   // Forget the original basic block.
4357   PSE.getSE()->forgetLoop(OrigLoop);
4358 
4359   // DT is not kept up-to-date for outer loop vectorization
4360   if (EnableVPlanNativePath)
4361     return;
4362 
4363   // Update the dominator tree information.
4364   assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
4365          "Entry does not dominate exit.");
4366 
4367   DT->addNewBlock(LoopMiddleBlock,
4368                   LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4369   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
4370   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
4371   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
4372   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
4373 }
4374 
4375 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4376   // We should not collect Scalars more than once per VF. Right now, this
4377   // function is called from collectUniformsAndScalars(), which already does
4378   // this check. Collecting Scalars for VF=1 does not make any sense.
4379   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4380          "This function should not be visited twice for the same VF");
4381 
4382   SmallSetVector<Instruction *, 8> Worklist;
4383 
4384   // These sets are used to seed the analysis with pointers used by memory
4385   // accesses that will remain scalar.
4386   SmallSetVector<Instruction *, 8> ScalarPtrs;
4387   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4388 
4389   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4390   // The pointer operands of loads and stores will be scalar as long as the
4391   // memory access is not a gather or scatter operation. The value operand of a
4392   // store will remain scalar if the store is scalarized.
4393   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4394     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4395     assert(WideningDecision != CM_Unknown &&
4396            "Widening decision should be ready at this moment");
4397     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4398       if (Ptr == Store->getValueOperand())
4399         return WideningDecision == CM_Scalarize;
4400     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4401            "Ptr is neither a value or pointer operand");
4402     return WideningDecision != CM_GatherScatter;
4403   };
4404 
4405   // A helper that returns true if the given value is a bitcast or
4406   // getelementptr instruction contained in the loop.
4407   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4408     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4409             isa<GetElementPtrInst>(V)) &&
4410            !TheLoop->isLoopInvariant(V);
4411   };
4412 
4413   // A helper that evaluates a memory access's use of a pointer. If the use
4414   // will be a scalar use, and the pointer is only used by memory accesses, we
4415   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4416   // PossibleNonScalarPtrs.
4417   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4418     // We only care about bitcast and getelementptr instructions contained in
4419     // the loop.
4420     if (!isLoopVaryingBitCastOrGEP(Ptr))
4421       return;
4422 
4423     // If the pointer has already been identified as scalar (e.g., if it was
4424     // also identified as uniform), there's nothing to do.
4425     auto *I = cast<Instruction>(Ptr);
4426     if (Worklist.count(I))
4427       return;
4428 
4429     // If the use of the pointer will be a scalar use, and all users of the
4430     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4431     // place the pointer in PossibleNonScalarPtrs.
4432     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4433           return isa<LoadInst>(U) || isa<StoreInst>(U);
4434         }))
4435       ScalarPtrs.insert(I);
4436     else
4437       PossibleNonScalarPtrs.insert(I);
4438   };
4439 
4440   // We seed the scalars analysis with three classes of instructions: (1)
4441   // instructions marked uniform-after-vectorization, (2) bitcast and
4442   // getelementptr instructions used by memory accesses requiring a scalar use,
4443   // and (3) pointer induction variables and their update instructions (we
4444   // currently only scalarize these).
4445   //
4446   // (1) Add to the worklist all instructions that have been identified as
4447   // uniform-after-vectorization.
4448   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4449 
4450   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4451   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4453   // scatter operation. The value operand of a store will remain scalar if the
4454   // store is scalarized.
4455   for (auto *BB : TheLoop->blocks())
4456     for (auto &I : *BB) {
4457       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4458         evaluatePtrUse(Load, Load->getPointerOperand());
4459       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4460         evaluatePtrUse(Store, Store->getPointerOperand());
4461         evaluatePtrUse(Store, Store->getValueOperand());
4462       }
4463     }
4464   for (auto *I : ScalarPtrs)
4465     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4466       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4467       Worklist.insert(I);
4468     }
4469 
4470   // (3) Add to the worklist all pointer induction variables and their update
4471   // instructions.
4472   //
4473   // TODO: Once we are able to vectorize pointer induction variables we should
4474   //       no longer insert them into the worklist here.
4475   auto *Latch = TheLoop->getLoopLatch();
4476   for (auto &Induction : *Legal->getInductionVars()) {
4477     auto *Ind = Induction.first;
4478     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4479     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4480       continue;
4481     Worklist.insert(Ind);
4482     Worklist.insert(IndUpdate);
4483     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4484     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4485                       << "\n");
4486   }
4487 
4488   // Insert the forced scalars.
4489   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4490   // induction variable when the PHI user is scalarized.
4491   auto ForcedScalar = ForcedScalars.find(VF);
4492   if (ForcedScalar != ForcedScalars.end())
4493     for (auto *I : ForcedScalar->second)
4494       Worklist.insert(I);
4495 
4496   // Expand the worklist by looking through any bitcasts and getelementptr
4497   // instructions we've already identified as scalar. This is similar to the
4498   // expansion step in collectLoopUniforms(); however, here we're only
4499   // expanding to include additional bitcasts and getelementptr instructions.
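  // For example, the loop-varying bitcast feeding the pointer operand of a
  // GEP already in the worklist is added as well, provided all of the
  // bitcast's users are scalar (in the worklist, outside the loop, or scalar
  // memory uses).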
4500   unsigned Idx = 0;
4501   while (Idx != Worklist.size()) {
4502     Instruction *Dst = Worklist[Idx++];
4503     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4504       continue;
4505     auto *Src = cast<Instruction>(Dst->getOperand(0));
4506     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4507           auto *J = cast<Instruction>(U);
4508           return !TheLoop->contains(J) || Worklist.count(J) ||
4509                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4510                   isScalarUse(J, Src));
4511         })) {
4512       Worklist.insert(Src);
4513       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4514     }
4515   }
4516 
4517   // An induction variable will remain scalar if all users of the induction
4518   // variable and induction variable update remain scalar.
4519   for (auto &Induction : *Legal->getInductionVars()) {
4520     auto *Ind = Induction.first;
4521     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4522 
4523     // We already considered pointer induction variables, so there's no reason
4524     // to look at their users again.
4525     //
4526     // TODO: Once we are able to vectorize pointer induction variables we
4527     //       should no longer skip over them here.
4528     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4529       continue;
4530 
4531     // Determine if all users of the induction variable are scalar after
4532     // vectorization.
4533     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4534       auto *I = cast<Instruction>(U);
4535       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4536     });
4537     if (!ScalarInd)
4538       continue;
4539 
4540     // Determine if all users of the induction variable update instruction are
4541     // scalar after vectorization.
4542     auto ScalarIndUpdate =
4543         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4544           auto *I = cast<Instruction>(U);
4545           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4546         });
4547     if (!ScalarIndUpdate)
4548       continue;
4549 
4550     // The induction variable and its update instruction will remain scalar.
4551     Worklist.insert(Ind);
4552     Worklist.insert(IndUpdate);
4553     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4554     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4555                       << "\n");
4556   }
4557 
4558   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4559 }
4560 
4561 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4562   if (!blockNeedsPredication(I->getParent()))
4563     return false;
4564   switch(I->getOpcode()) {
4565   default:
4566     break;
4567   case Instruction::Load:
4568   case Instruction::Store: {
4569     if (!Legal->isMaskRequired(I))
4570       return false;
4571     auto *Ptr = getLoadStorePointerOperand(I);
4572     auto *Ty = getMemInstValueType(I);
4573     // We have already decided how to vectorize this instruction, get that
4574     // result.
4575     if (VF > 1) {
4576       InstWidening WideningDecision = getWideningDecision(I, VF);
4577       assert(WideningDecision != CM_Unknown &&
4578              "Widening decision should be ready at this moment");
4579       return WideningDecision == CM_Scalarize;
4580     }
4581     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4582     return isa<LoadInst>(I) ?
4583         !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty))
4584       : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty));
4585   }
4586   case Instruction::UDiv:
4587   case Instruction::SDiv:
4588   case Instruction::SRem:
4589   case Instruction::URem:
4590     return mayDivideByZero(*I);
4591   }
4592   return false;
4593 }
4594 
4595 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4596                                                                unsigned VF) {
4597   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4598   assert(getWideningDecision(I, VF) == CM_Unknown &&
4599          "Decision should not be set yet.");
4600   auto *Group = getInterleavedAccessGroup(I);
4601   assert(Group && "Must have a group.");
4602 
  // If the instruction's allocated size doesn't equal its type size, it
4604   // requires padding and will be scalarized.
4605   auto &DL = I->getModule()->getDataLayout();
4606   auto *ScalarTy = getMemInstValueType(I);
4607   if (hasIrregularType(ScalarTy, DL, VF))
4608     return false;
4609 
4610   // Check if masking is required.
4611   // A Group may need masking for one of two reasons: it resides in a block that
4612   // needs predication, or it was decided to use masking to deal with gaps.
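  // For example, a load group with a gap (a member that is never accessed)
  // may read past the original accesses in its last vector iteration, so it
  // requires either a scalar epilogue or a masked wide load.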
4613   bool PredicatedAccessRequiresMasking =
4614       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4615   bool AccessWithGapsRequiresMasking =
4616       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4617   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4618     return true;
4619 
4620   // If masked interleaving is required, we expect that the user/target had
4621   // enabled it, because otherwise it either wouldn't have been created or
4622   // it should have been invalidated by the CostModel.
4623   assert(useMaskedInterleavedAccesses(TTI) &&
4624          "Masked interleave-groups for predicated accesses are not enabled.");
4625 
4626   auto *Ty = getMemInstValueType(I);
4627   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4628   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4629                           : TTI.isLegalMaskedStore(Ty, Alignment);
4630 }
4631 
4632 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4633                                                                unsigned VF) {
4634   // Get and ensure we have a valid memory instruction.
4635   LoadInst *LI = dyn_cast<LoadInst>(I);
4636   StoreInst *SI = dyn_cast<StoreInst>(I);
4637   assert((LI || SI) && "Invalid memory instruction");
4638 
4639   auto *Ptr = getLoadStorePointerOperand(I);
4640 
4641   // In order to be widened, the pointer should be consecutive, first of all.
4642   if (!Legal->isConsecutivePtr(Ptr))
4643     return false;
4644 
4645   // If the instruction is a store located in a predicated block, it will be
4646   // scalarized.
4647   if (isScalarWithPredication(I))
4648     return false;
4649 
  // If the instruction's allocated size doesn't equal its type size, it
4651   // requires padding and will be scalarized.
4652   auto &DL = I->getModule()->getDataLayout();
4653   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4654   if (hasIrregularType(ScalarTy, DL, VF))
4655     return false;
4656 
4657   return true;
4658 }
4659 
4660 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4661   // We should not collect Uniforms more than once per VF. Right now,
4662   // this function is called from collectUniformsAndScalars(), which
4663   // already does this check. Collecting Uniforms for VF=1 does not make any
4664   // sense.
4665 
4666   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4667          "This function should not be visited twice for the same VF");
4668 
  // Visit the list of Uniforms. If we do not find any uniform value, we will
  // not analyze it again; Uniforms.count(VF) will still return 1.
4671   Uniforms[VF].clear();
4672 
4673   // We now know that the loop is vectorizable!
4674   // Collect instructions inside the loop that will remain uniform after
4675   // vectorization.
4676 
4677   // Global values, params and instructions outside of current loop are out of
4678   // scope.
4679   auto isOutOfScope = [&](Value *V) -> bool {
4680     Instruction *I = dyn_cast<Instruction>(V);
4681     return (!I || !TheLoop->contains(I));
4682   };
4683 
4684   SetVector<Instruction *> Worklist;
4685   BasicBlock *Latch = TheLoop->getLoopLatch();
4686 
4687   // Instructions that are scalar with predication must not be considered
4688   // uniform after vectorization, because that would create an erroneous
4689   // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if they turn out to be important, see PR40816.
4691   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4692     if (isScalarWithPredication(I, VF)) {
4693       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4694                         << *I << "\n");
4695       return;
4696     }
4697     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4698     Worklist.insert(I);
4699   };
4700 
4701   // Start with the conditional branch. If the branch condition is an
4702   // instruction contained in the loop that is only used by the branch, it is
4703   // uniform.
4704   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4705   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4706     addToWorklistIfAllowed(Cmp);
4707 
4708   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4709   // are pointers that are treated like consecutive pointers during
4710   // vectorization. The pointer operands of interleaved accesses are an
4711   // example.
4712   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4713 
4714   // Holds pointer operands of instructions that are possibly non-uniform.
4715   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4716 
4717   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4718     InstWidening WideningDecision = getWideningDecision(I, VF);
4719     assert(WideningDecision != CM_Unknown &&
4720            "Widening decision should be ready at this moment");
4721 
4722     return (WideningDecision == CM_Widen ||
4723             WideningDecision == CM_Widen_Reverse ||
4724             WideningDecision == CM_Interleave);
4725   };
4726   // Iterate over the instructions in the loop, and collect all
4727   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4728   // that a consecutive-like pointer operand will be scalarized, we collect it
4729   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4730   // getelementptr instruction can be used by both vectorized and scalarized
4731   // memory instructions. For example, if a loop loads and stores from the same
4732   // location, but the store is conditional, the store will be scalarized, and
4733   // the getelementptr won't remain uniform.
4734   for (auto *BB : TheLoop->blocks())
4735     for (auto &I : *BB) {
4736       // If there's no pointer operand, there's nothing to do.
4737       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4738       if (!Ptr)
4739         continue;
4740 
4741       // True if all users of Ptr are memory accesses that have Ptr as their
4742       // pointer operand.
4743       auto UsersAreMemAccesses =
4744           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4745             return getLoadStorePointerOperand(U) == Ptr;
4746           });
4747 
4748       // Ensure the memory instruction will not be scalarized or used by
4749       // gather/scatter, making its pointer operand non-uniform. If the pointer
4750       // operand is used by any instruction other than a memory access, we
4751       // conservatively assume the pointer operand may be non-uniform.
4752       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4753         PossibleNonUniformPtrs.insert(Ptr);
4754 
4755       // If the memory instruction will be vectorized and its pointer operand
4756       // is consecutive-like, or interleaving - the pointer operand should
4757       // remain uniform.
4758       else
4759         ConsecutiveLikePtrs.insert(Ptr);
4760     }
4761 
4762   // Add to the Worklist all consecutive and consecutive-like pointers that
4763   // aren't also identified as possibly non-uniform.
4764   for (auto *V : ConsecutiveLikePtrs)
4765     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4766       addToWorklistIfAllowed(V);
4767 
4768   // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
4770   // a uniform instruction will only be used by uniform instructions.
4771   unsigned idx = 0;
4772   while (idx != Worklist.size()) {
4773     Instruction *I = Worklist[idx++];
4774 
4775     for (auto OV : I->operand_values()) {
4776       // isOutOfScope operands cannot be uniform instructions.
4777       if (isOutOfScope(OV))
4778         continue;
4779       // First order recurrence Phi's should typically be considered
4780       // non-uniform.
4781       auto *OP = dyn_cast<PHINode>(OV);
4782       if (OP && Legal->isFirstOrderRecurrence(OP))
4783         continue;
4784       // If all the users of the operand are uniform, then add the
4785       // operand into the uniform worklist.
4786       auto *OI = cast<Instruction>(OV);
4787       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4788             auto *J = cast<Instruction>(U);
4789             return Worklist.count(J) ||
4790                    (OI == getLoadStorePointerOperand(J) &&
4791                     isUniformDecision(J, VF));
4792           }))
4793         addToWorklistIfAllowed(OI);
4794     }
4795   }
4796 
4797   // Returns true if Ptr is the pointer operand of a memory access instruction
4798   // I, and I is known to not require scalarization.
4799   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4800     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4801   };
4802 
4803   // For an instruction to be added into Worklist above, all its users inside
4804   // the loop should also be in Worklist. However, this condition cannot be
4805   // true for phi nodes that form a cyclic dependence. We must process phi
4806   // nodes separately. An induction variable will remain uniform if all users
4807   // of the induction variable and induction variable update remain uniform.
4808   // The code below handles both pointer and non-pointer induction variables.
4809   for (auto &Induction : *Legal->getInductionVars()) {
4810     auto *Ind = Induction.first;
4811     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4812 
4813     // Determine if all users of the induction variable are uniform after
4814     // vectorization.
4815     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4816       auto *I = cast<Instruction>(U);
4817       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4818              isVectorizedMemAccessUse(I, Ind);
4819     });
4820     if (!UniformInd)
4821       continue;
4822 
4823     // Determine if all users of the induction variable update instruction are
4824     // uniform after vectorization.
4825     auto UniformIndUpdate =
4826         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4827           auto *I = cast<Instruction>(U);
4828           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4829                  isVectorizedMemAccessUse(I, IndUpdate);
4830         });
4831     if (!UniformIndUpdate)
4832       continue;
4833 
4834     // The induction variable and its update instruction will remain uniform.
4835     addToWorklistIfAllowed(Ind);
4836     addToWorklistIfAllowed(IndUpdate);
4837   }
4838 
4839   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4840 }
4841 
4842 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4843   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4844 
4845   if (Legal->getRuntimePointerChecking()->Need) {
4846     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4847         "runtime pointer checks needed. Enable vectorization of this "
4848         "loop with '#pragma clang loop vectorize(enable)' when "
4849         "compiling with -Os/-Oz",
4850         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4851     return true;
4852   }
4853 
4854   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4855     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4856         "runtime SCEV checks needed. Enable vectorization of this "
4857         "loop with '#pragma clang loop vectorize(enable)' when "
4858         "compiling with -Os/-Oz",
4859         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4860     return true;
4861   }
4862 
4863   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4864   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4865     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4866         "runtime stride == 1 checks needed. Enable vectorization of "
4867         "this loop with '#pragma clang loop vectorize(enable)' when "
4868         "compiling with -Os/-Oz",
4869         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4870     return true;
4871   }
4872 
4873   return false;
4874 }
4875 
4876 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4877   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since it's still likely to be
    // dynamically uniform if the target can skip.
4880     reportVectorizationFailure(
4881         "Not inserting runtime ptr check for divergent target",
4882         "runtime pointer checks needed. Not enabled for divergent target",
4883         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4884     return None;
4885   }
4886 
4887   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4888   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4889   if (TC == 1) {
4890     reportVectorizationFailure("Single iteration (non) loop",
4891         "loop trip count is one, irrelevant for vectorization",
4892         "SingleIterationLoop", ORE, TheLoop);
4893     return None;
4894   }
4895 
4896   switch (ScalarEpilogueStatus) {
4897   case CM_ScalarEpilogueAllowed:
4898     return computeFeasibleMaxVF(TC);
4899   case CM_ScalarEpilogueNotNeededUsePredicate:
4900     LLVM_DEBUG(
4901         dbgs() << "LV: vector predicate hint/switch found.\n"
4902                << "LV: Not allowing scalar epilogue, creating predicated "
4903                << "vector loop.\n");
4904     break;
4905   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4906     // fallthrough as a special case of OptForSize
4907   case CM_ScalarEpilogueNotAllowedOptSize:
4908     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4909       LLVM_DEBUG(
4910           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4911     else
4912       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4913                         << "count.\n");
4914 
4915     // Bail if runtime checks are required, which are not good when optimising
4916     // for size.
4917     if (runtimeChecksRequired())
4918       return None;
4919     break;
4920   }
4921 
  // Now try to fold the tail by masking.
4923 
4924   // Invalidate interleave groups that require an epilogue if we can't mask
4925   // the interleave-group.
4926   if (!useMaskedInterleavedAccesses(TTI))
4927     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4928 
4929   unsigned MaxVF = computeFeasibleMaxVF(TC);
4930   if (TC > 0 && TC % MaxVF == 0) {
4931     // Accept MaxVF if we do not have a tail.
4932     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4933     return MaxVF;
4934   }
4935 
4936   // If we don't know the precise trip count, or if the trip count that we
4937   // found modulo the vectorization factor is not zero, try to fold the tail
4938   // by masking.
4939   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
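  // For example, a trip count of 100 with MaxVF = 8 leaves a remainder of 4
  // iterations; without a scalar epilogue those iterations can only be
  // executed under a mask inside the vector loop.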
4940   if (Legal->prepareToFoldTailByMasking()) {
4941     FoldTailByMasking = true;
4942     return MaxVF;
4943   }
4944 
4945   if (TC == 0) {
4946     reportVectorizationFailure(
4947         "Unable to calculate the loop count due to complex control flow",
4948         "unable to calculate the loop count due to complex control flow",
4949         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4950     return None;
4951   }
4952 
4953   reportVectorizationFailure(
4954       "Cannot optimize for size and vectorize at the same time.",
4955       "cannot optimize for size and vectorize at the same time. "
4956       "Enable vectorization of this loop with '#pragma clang loop "
4957       "vectorize(enable)' when compiling with -Os/-Oz",
4958       "NoTailLoopWithOptForSize", ORE, TheLoop);
4959   return None;
4960 }
4961 
4962 unsigned
4963 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
4964   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4965   unsigned SmallestType, WidestType;
4966   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4967   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
4968 
4969   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (i.e., the one involved in
  // the smallest dependence distance).
4973   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
4974 
4975   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
4976 
4977   unsigned MaxVectorSize = WidestRegister / WidestType;
4978 
4979   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4980                     << " / " << WidestType << " bits.\n");
4981   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4982                     << WidestRegister << " bits.\n");
4983 
4984   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
4985                                  " into one vector!");
4986   if (MaxVectorSize == 0) {
4987     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
4988     MaxVectorSize = 1;
4989     return MaxVectorSize;
4990   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
4991              isPowerOf2_32(ConstTripCount)) {
4992     // We need to clamp the VF to be the ConstTripCount. There is no point in
4993     // choosing a higher viable VF as done in the loop below.
4994     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
4995                       << ConstTripCount << "\n");
4996     MaxVectorSize = ConstTripCount;
4997     return MaxVectorSize;
4998   }
4999 
5000   unsigned MaxVF = MaxVectorSize;
5001   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5002       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5003     // Collect all viable vectorization factors larger than the default MaxVF
5004     // (i.e. MaxVectorSize).
5005     SmallVector<unsigned, 8> VFs;
5006     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5007     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5008       VFs.push_back(VS);
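    // For example, with an i8 smallest type, an i32 widest type and 128-bit
    // registers, MaxVectorSize is 128/32 = 4 while NewMaxVectorSize is
    // 128/8 = 16, so VFs of 8 and 16 are also considered here.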
5009 
5010     // For each VF calculate its register usage.
5011     auto RUs = calculateRegisterUsage(VFs);
5012 
5013     // Select the largest VF which doesn't require more registers than existing
5014     // ones.
5015     for (int i = RUs.size() - 1; i >= 0; --i) {
5016       bool Selected = true;
5017       for (auto& pair : RUs[i].MaxLocalUsers) {
5018         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5019         if (pair.second > TargetNumRegisters)
5020           Selected = false;
5021       }
5022       if (Selected) {
5023         MaxVF = VFs[i];
5024         break;
5025       }
5026     }
5027     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5028       if (MaxVF < MinVF) {
5029         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5030                           << ") with target's minimum: " << MinVF << '\n');
5031         MaxVF = MinVF;
5032       }
5033     }
5034   }
5035   return MaxVF;
5036 }
5037 
5038 VectorizationFactor
5039 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5040   float Cost = expectedCost(1).first;
5041   const float ScalarCost = Cost;
5042   unsigned Width = 1;
5043   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5044 
5045   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5046   if (ForceVectorization && MaxVF > 1) {
5047     // Ignore scalar width, because the user explicitly wants vectorization.
5048     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5049     // evaluation.
5050     Cost = std::numeric_limits<float>::max();
5051   }
5052 
5053   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
5057     VectorizationCostTy C = expectedCost(i);
5058     float VectorCost = C.first / (float)i;
5059     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5060                       << " costs: " << (int)VectorCost << ".\n");
5061     if (!C.second && !ForceVectorization) {
5062       LLVM_DEBUG(
5063           dbgs() << "LV: Not considering vector loop of width " << i
5064                  << " because it will not generate any vector instructions.\n");
5065       continue;
5066     }
5067     if (VectorCost < Cost) {
5068       Cost = VectorCost;
5069       Width = i;
5070     }
5071   }
5072 
5073   if (!EnableCondStoresVectorization && NumPredStores) {
5074     reportVectorizationFailure("There are conditional stores.",
5075         "store that is conditionally executed prevents vectorization",
5076         "ConditionalStore", ORE, TheLoop);
5077     Width = 1;
5078     Cost = ScalarCost;
5079   }
5080 
5081   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5082              << "LV: Vectorization seems to be not beneficial, "
5083              << "but was forced by a user.\n");
5084   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5085   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5086   return Factor;
5087 }
5088 
5089 std::pair<unsigned, unsigned>
5090 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5091   unsigned MinWidth = -1U;
5092   unsigned MaxWidth = 8;
5093   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5094 
5095   // For each block.
5096   for (BasicBlock *BB : TheLoop->blocks()) {
5097     // For each instruction in the loop.
5098     for (Instruction &I : BB->instructionsWithoutDebug()) {
5099       Type *T = I.getType();
5100 
5101       // Skip ignored values.
5102       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5103         continue;
5104 
5105       // Only examine Loads, Stores and PHINodes.
5106       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5107         continue;
5108 
5109       // Examine PHI nodes that are reduction variables. Update the type to
5110       // account for the recurrence type.
5111       if (auto *PN = dyn_cast<PHINode>(&I)) {
5112         if (!Legal->isReductionVariable(PN))
5113           continue;
5114         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5115         T = RdxDesc.getRecurrenceType();
5116       }
5117 
5118       // Examine the stored values.
5119       if (auto *ST = dyn_cast<StoreInst>(&I))
5120         T = ST->getValueOperand()->getType();
5121 
5122       // Ignore loaded pointer types and stored pointer types that are not
5123       // vectorizable.
5124       //
5125       // FIXME: The check here attempts to predict whether a load or store will
5126       //        be vectorized. We only know this for certain after a VF has
5127       //        been selected. Here, we assume that if an access can be
5128       //        vectorized, it will be. We should also look at extending this
5129       //        optimization to non-pointer types.
5130       //
5131       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5132           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5133         continue;
5134 
5135       MinWidth = std::min(MinWidth,
5136                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5137       MaxWidth = std::max(MaxWidth,
5138                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5139     }
5140   }
5141 
5142   return {MinWidth, MaxWidth};
5143 }
5144 
5145 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5146                                                            unsigned LoopCost) {
5147   // -- The interleave heuristics --
5148   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5149   // There are many micro-architectural considerations that we can't predict
5150   // at this level. For example, frontend pressure (on decode or fetch) due to
5151   // code size, or the number and capabilities of the execution ports.
5152   //
5153   // We use the following heuristics to select the interleave count:
5154   // 1. If the code has reductions, then we interleave to break the cross
5155   // iteration dependency.
5156   // 2. If the loop is really small, then we interleave to reduce the loop
5157   // overhead.
5158   // 3. We don't interleave if we think that we will spill registers to memory
5159   // due to the increased register pressure.
5160 
5161   if (!isScalarEpilogueAllowed())
5162     return 1;
5163 
  // The maximum safe dependence distance was already used to limit the
  // vectorization factor; do not interleave in that case.
5165   if (Legal->getMaxSafeDepDistBytes() != -1U)
5166     return 1;
5167 
5168   // Do not interleave loops with a relatively small known or estimated trip
5169   // count.
5170   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5171   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5172     return 1;
5173 
5174   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so clamp each of them to at least one
  // register user to keep the divisions well-defined.
5177   for (auto& pair : R.MaxLocalUsers) {
5178     pair.second = std::max(pair.second, 1U);
5179   }
5180 
5181   // We calculate the interleave count using the following formula.
5182   // Subtract the number of loop invariants from the number of available
5183   // registers. These registers are used by all of the interleaved instances.
5184   // Next, divide the remaining registers by the number of registers that is
5185   // required by the loop, in order to estimate how many parallel instances
5186   // fit without causing spills. All of this is rounded down if necessary to be
5187   // a power of two. We want power of two interleave count to simplify any
5188   // addressing operations or alignment considerations.
5189   // We also want power of two interleave counts to ensure that the induction
5190   // variable of the vector loop wraps to zero, when tail is folded by masking;
5191   // this currently happens when OptForSize, in which case IC is set to 1 above.
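  // As an illustration of the formula below: with 16 registers in a class,
  // 2 of them holding loop-invariant values and a maximum of 7 local users,
  // the estimate is PowerOf2Floor((16 - 2) / 7) = 2 interleaved copies.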
5192   unsigned IC = UINT_MAX;
5193 
5194   for (auto& pair : R.MaxLocalUsers) {
5195     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5196     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5199     if (VF == 1) {
5200       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5201         TargetNumRegisters = ForceTargetNumScalarRegs;
5202     } else {
5203       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5204         TargetNumRegisters = ForceTargetNumVectorRegs;
5205     }
5206     unsigned MaxLocalUsers = pair.second;
5207     unsigned LoopInvariantRegs = 0;
5208     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5209       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5210 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5212     // Don't count the induction variable as interleaved.
5213     if (EnableIndVarRegisterHeur) {
5214       TmpIC =
5215           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5216                         std::max(1U, (MaxLocalUsers - 1)));
5217     }
5218 
5219     IC = std::min(IC, TmpIC);
5220   }
5221 
5222   // Clamp the interleave ranges to reasonable counts.
5223   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5224 
5225   // Check if the user has overridden the max.
5226   if (VF == 1) {
5227     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5228       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5229   } else {
5230     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5231       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5232   }
5233 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to at most the trip count divided by VF.
5236   if (BestKnownTC) {
5237     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5238   }
5239 
5240   // If we did not calculate the cost for VF (because the user selected the VF)
5241   // then we calculate the cost of VF here.
5242   if (LoopCost == 0)
5243     LoopCost = expectedCost(VF).first;
5244 
5245   assert(LoopCost && "Non-zero loop cost expected");
5246 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5249   if (IC > MaxInterleaveCount)
5250     IC = MaxInterleaveCount;
5251   else if (IC < 1)
5252     IC = 1;
5253 
5254   // Interleave if we vectorized this loop and there is a reduction that could
5255   // benefit from interleaving.
5256   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5257     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5258     return IC;
5259   }
5260 
5261   // Note that if we've already vectorized the loop we will have done the
5262   // runtime check and so interleaving won't require further checks.
5263   bool InterleavingRequiresRuntimePointerCheck =
5264       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5265 
5266   // We want to interleave small loops in order to reduce the loop overhead and
5267   // potentially expose ILP opportunities.
5268   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5269   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop-overhead cost is 1, and we use the cost model
    // to estimate the loop cost; we then interleave until the loop overhead
    // amounts to about 5% of the loop cost.
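    // For instance, with a small-loop threshold of 20 and a loop cost of 4,
    // the cap below is PowerOf2Floor(20 / 4) = 4 interleaved copies.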
5273     unsigned SmallIC =
5274         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5275 
5276     // Interleave until store/load ports (estimated by max interleave count) are
5277     // saturated.
5278     unsigned NumStores = Legal->getNumStores();
5279     unsigned NumLoads = Legal->getNumLoads();
5280     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5281     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
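    // For example, with IC = 8 and two stores in the loop, StoresIC is 4,
    // i.e. roughly four interleaved copies already keep the store ports busy.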
5282 
5283     // If we have a scalar reduction (vector reductions are already dealt with
5284     // by this point), we can increase the critical path length if the loop
5285     // we're interleaving is inside another loop. Limit, by default to 2, so the
5286     // critical path only gets increased by one reduction operation.
5287     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5288       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5289       SmallIC = std::min(SmallIC, F);
5290       StoresIC = std::min(StoresIC, F);
5291       LoadsIC = std::min(LoadsIC, F);
5292     }
5293 
5294     if (EnableLoadStoreRuntimeInterleave &&
5295         std::max(StoresIC, LoadsIC) > SmallIC) {
5296       LLVM_DEBUG(
5297           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5298       return std::max(StoresIC, LoadsIC);
5299     }
5300 
5301     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5302     return SmallIC;
5303   }
5304 
5305   // Interleave if this is a large loop (small loops are already dealt with by
5306   // this point) that could benefit from interleaving.
5307   bool HasReductions = !Legal->getReductionVars()->empty();
5308   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5309     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5310     return IC;
5311   }
5312 
5313   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5314   return 1;
5315 }
5316 
5317 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5318 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and
5322   // assign a number to each instruction. We use RPO to ensure that defs are
5323   // met before their users. We assume that each instruction that has in-loop
5324   // users starts an interval. We record every time that an in-loop value is
5325   // used, so we have a list of the first and last occurrences of each
5326   // instruction. Next, we transpose this data structure into a multi map that
5327   // holds the list of intervals that *end* at a specific location. This multi
5328   // map allows us to perform a linear search. We scan the instructions linearly
5329   // and record each time that a new interval starts, by placing it in a set.
5330   // If we find this value in the multi-map then we remove it from the set.
5331   // The max register usage is the maximum size of the set.
5332   // We also search for instructions that are defined outside the loop, but are
5333   // used inside the loop. We need this number separately from the max-interval
5334   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
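  // As a small example: if the instruction given index 2 is last used by the
  // instruction at index 5, its interval spans indices 2 through 5 and it is
  // counted as a live (open) value for every index in between.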
5336   LoopBlocksDFS DFS(TheLoop);
5337   DFS.perform(LI);
5338 
5339   RegisterUsage RU;
5340 
5341   // Each 'key' in the map opens a new interval. The values
5342   // of the map are the index of the 'last seen' usage of the
5343   // instruction that is the key.
5344   using IntervalMap = DenseMap<Instruction *, unsigned>;
5345 
5346   // Maps instruction to its index.
5347   SmallVector<Instruction *, 64> IdxToInstr;
5348   // Marks the end of each interval.
5349   IntervalMap EndPoint;
  // Saves the set of instructions that are used inside the loop.
5351   SmallPtrSet<Instruction *, 8> Ends;
5352   // Saves the list of values that are used in the loop but are
5353   // defined outside the loop, such as arguments and constants.
5354   SmallPtrSet<Value *, 8> LoopInvariants;
5355 
5356   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5357     for (Instruction &I : BB->instructionsWithoutDebug()) {
5358       IdxToInstr.push_back(&I);
5359 
5360       // Save the end location of each USE.
5361       for (Value *U : I.operands()) {
5362         auto *Instr = dyn_cast<Instruction>(U);
5363 
5364         // Ignore non-instruction values such as arguments, constants, etc.
5365         if (!Instr)
5366           continue;
5367 
5368         // If this instruction is outside the loop then record it and continue.
5369         if (!TheLoop->contains(Instr)) {
5370           LoopInvariants.insert(Instr);
5371           continue;
5372         }
5373 
5374         // Overwrite previous end points.
5375         EndPoint[Instr] = IdxToInstr.size();
5376         Ends.insert(Instr);
5377       }
5378     }
5379   }
5380 
5381   // Saves the list of intervals that end with the index in 'key'.
5382   using InstrList = SmallVector<Instruction *, 2>;
5383   DenseMap<unsigned, InstrList> TransposeEnds;
5384 
5385   // Transpose the EndPoints to a list of values that end at each index.
5386   for (auto &Interval : EndPoint)
5387     TransposeEnds[Interval.second].push_back(Interval.first);
5388 
5389   SmallPtrSet<Instruction *, 8> OpenIntervals;
5390 
5391   // Get the size of the widest register.
5392   unsigned MaxSafeDepDist = -1U;
5393   if (Legal->getMaxSafeDepDistBytes() != -1U)
5394     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5395   unsigned WidestRegister =
5396       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5397   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5398 
5399   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5400   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5401 
5402   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5403 
5404   // A lambda that gets the register usage for the given type and VF.
5405   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5406     if (Ty->isTokenTy())
5407       return 0U;
5408     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5409     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5410   };
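  // For example, with a 128-bit WidestRegister an i32 value at VF = 4 needs
  // max(1, 4 * 32 / 128) = 1 register, while at VF = 8 it needs 2.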
5411 
5412   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5413     Instruction *I = IdxToInstr[i];
5414 
5415     // Remove all of the instructions that end at this location.
5416     InstrList &List = TransposeEnds[i];
5417     for (Instruction *ToRemove : List)
5418       OpenIntervals.erase(ToRemove);
5419 
5420     // Ignore instructions that are never used within the loop.
5421     if (Ends.find(I) == Ends.end())
5422       continue;
5423 
5424     // Skip ignored values.
5425     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5426       continue;
5427 
5428     // For each VF find the maximum usage of registers.
5429     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5430       // Count the number of live intervals.
5431       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5432 
5433       if (VFs[j] == 1) {
5434         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
5436           if (RegUsage.find(ClassID) == RegUsage.end())
5437             RegUsage[ClassID] = 1;
5438           else
5439             RegUsage[ClassID] += 1;
5440         }
5441       } else {
5442         collectUniformsAndScalars(VFs[j]);
5443         for (auto Inst : OpenIntervals) {
5444           // Skip ignored values for VF > 1.
5445           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5446             continue;
5447           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
5449             if (RegUsage.find(ClassID) == RegUsage.end())
5450               RegUsage[ClassID] = 1;
5451             else
5452               RegUsage[ClassID] += 1;
5453           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
5455             if (RegUsage.find(ClassID) == RegUsage.end())
5456               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5457             else
5458               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5459           }
5460         }
5461       }
5462 
5463       for (auto& pair : RegUsage) {
5464         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
5466         else
5467           MaxUsages[j][pair.first] = pair.second;
5468       }
5469     }
5470 
5471     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5472                       << OpenIntervals.size() << '\n');
5473 
5474     // Add the current instruction to the list of open intervals.
5475     OpenIntervals.insert(I);
5476   }
5477 
5478   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5479     SmallMapVector<unsigned, unsigned, 4> Invariant;
5480 
5481     for (auto Inst : LoopInvariants) {
5482       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5484       if (Invariant.find(ClassID) == Invariant.end())
5485         Invariant[ClassID] = Usage;
5486       else
5487         Invariant[ClassID] += Usage;
5488     }
5489 
5490     LLVM_DEBUG({
5491       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5492       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5493              << " item\n";
5494       for (const auto &pair : MaxUsages[i]) {
5495         dbgs() << "LV(REG): RegisterClass: "
5496                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5497                << " registers\n";
5498       }
5499       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5500              << " item\n";
5501       for (const auto &pair : Invariant) {
5502         dbgs() << "LV(REG): RegisterClass: "
5503                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5504                << " registers\n";
5505       }
5506     });
5507 
5508     RU.LoopInvariantRegs = Invariant;
5509     RU.MaxLocalUsers = MaxUsages[i];
5510     RUs[i] = RU;
5511   }
5512 
5513   return RUs;
5514 }
5515 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5517   // TODO: Cost model for emulated masked load/store is completely
5518   // broken. This hack guides the cost model to use an artificially
5519   // high enough value to practically disable vectorization with such
5520   // operations, except where previously deployed legality hack allowed
5521   // using very low cost values. This is to avoid regressions coming simply
5522   // from moving "masked load/store" check from legality to cost model.
5523   // Masked Load/Gather emulation was previously never allowed.
  // A limited number of Masked Store/Scatter emulations were allowed.
5525   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5526   return isa<LoadInst>(I) ||
5527          (isa<StoreInst>(I) &&
5528           NumPredStores > NumberOfStoresToPredicate);
5529 }
5530 
5531 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5532   // If we aren't vectorizing the loop, or if we've already collected the
5533   // instructions to scalarize, there's nothing to do. Collection may already
5534   // have occurred if we have a user-selected VF and are now computing the
5535   // expected cost for interleaving.
5536   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5537     return;
5538 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5540   // not profitable to scalarize any instructions, the presence of VF in the
5541   // map will indicate that we've analyzed it already.
5542   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5543 
5544   // Find all the instructions that are scalar with predication in the loop and
5545   // determine if it would be better to not if-convert the blocks they are in.
5546   // If so, we also record the instructions to scalarize.
5547   for (BasicBlock *BB : TheLoop->blocks()) {
5548     if (!blockNeedsPredication(BB))
5549       continue;
5550     for (Instruction &I : *BB)
5551       if (isScalarWithPredication(&I)) {
5552         ScalarCostsTy ScalarCosts;
5553         // Do not apply discount logic if hacked cost is needed
5554         // for emulated masked memrefs.
5555         if (!useEmulatedMaskMemRefHack(&I) &&
5556             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5557           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5558         // Remember that BB will remain after vectorization.
5559         PredicatedBBsAfterVectorization.insert(BB);
5560       }
5561   }
5562 }
5563 
5564 int LoopVectorizationCostModel::computePredInstDiscount(
5565     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5566     unsigned VF) {
5567   assert(!isUniformAfterVectorization(PredInst, VF) &&
5568          "Instruction marked uniform-after-vectorization will be predicated");
5569 
5570   // Initialize the discount to zero, meaning that the scalar version and the
5571   // vector version cost the same.
5572   int Discount = 0;
5573 
5574   // Holds instructions to analyze. The instructions we visit are mapped in
5575   // ScalarCosts. Those instructions are the ones that would be scalarized if
5576   // we find that the scalar version costs less.
5577   SmallVector<Instruction *, 8> Worklist;
5578 
5579   // Returns true if the given instruction can be scalarized.
5580   auto canBeScalarized = [&](Instruction *I) -> bool {
5581     // We only attempt to scalarize instructions forming a single-use chain
5582     // from the original predicated block that would otherwise be vectorized.
5583     // Although not strictly necessary, we give up on instructions we know will
5584     // already be scalar to avoid traversing chains that are unlikely to be
5585     // beneficial.
5586     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5587         isScalarAfterVectorization(I, VF))
5588       return false;
5589 
5590     // If the instruction is scalar with predication, it will be analyzed
5591     // separately. We ignore it within the context of PredInst.
5592     if (isScalarWithPredication(I))
5593       return false;
5594 
5595     // If any of the instruction's operands are uniform after vectorization,
5596     // the instruction cannot be scalarized. This prevents, for example, a
5597     // masked load from being scalarized.
5598     //
5599     // We assume we will only emit a value for lane zero of an instruction
5600     // marked uniform after vectorization, rather than VF identical values.
5601     // Thus, if we scalarize an instruction that uses a uniform, we would
5602     // create uses of values corresponding to the lanes we aren't emitting code
5603     // for. This behavior can be changed by allowing getScalarValue to clone
5604     // the lane zero values for uniforms rather than asserting.
5605     for (Use &U : I->operands())
5606       if (auto *J = dyn_cast<Instruction>(U.get()))
5607         if (isUniformAfterVectorization(J, VF))
5608           return false;
5609 
5610     // Otherwise, we can scalarize the instruction.
5611     return true;
5612   };
5613 
5614   // Compute the expected cost discount from scalarizing the entire expression
5615   // feeding the predicated instruction. We currently only consider expressions
5616   // that are single-use instruction chains.
5617   Worklist.push_back(PredInst);
5618   while (!Worklist.empty()) {
5619     Instruction *I = Worklist.pop_back_val();
5620 
5621     // If we've already analyzed the instruction, there's nothing to do.
5622     if (ScalarCosts.find(I) != ScalarCosts.end())
5623       continue;
5624 
5625     // Compute the cost of the vector instruction. Note that this cost already
5626     // includes the scalarization overhead of the predicated instruction.
5627     unsigned VectorCost = getInstructionCost(I, VF).first;
5628 
5629     // Compute the cost of the scalarized instruction. This cost is the cost of
5630     // the instruction as if it wasn't if-converted and instead remained in the
5631     // predicated block. We will scale this cost by block probability after
5632     // computing the scalarization overhead.
5633     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5634 
5635     // Compute the scalarization overhead of needed insertelement instructions
5636     // and phi nodes.
5637     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5638       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5639                                                  true, false);
5640       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5641     }
5642 
5643     // Compute the scalarization overhead of needed extractelement
5644     // instructions. For each of the instruction's operands, if the operand can
5645     // be scalarized, add it to the worklist; otherwise, account for the
5646     // overhead.
5647     for (Use &U : I->operands())
5648       if (auto *J = dyn_cast<Instruction>(U.get())) {
5649         assert(VectorType::isValidElementType(J->getType()) &&
5650                "Instruction has non-scalar type");
5651         if (canBeScalarized(J))
5652           Worklist.push_back(J);
5653         else if (needsExtract(J, VF))
5654           ScalarCost += TTI.getScalarizationOverhead(
5655                               ToVectorTy(J->getType(),VF), false, true);
5656       }
5657 
5658     // Scale the total scalar cost by block probability.
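    // For example, if the block is assumed to execute every other iteration
    // (a reciprocal probability of 2), a raw scalar cost of 8 becomes 4.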
5659     ScalarCost /= getReciprocalPredBlockProb();
5660 
5661     // Compute the discount. A non-negative discount means the vector version
5662     // of the instruction costs more, and scalarizing would be beneficial.
5663     Discount += VectorCost - ScalarCost;
5664     ScalarCosts[I] = ScalarCost;
5665   }
5666 
5667   return Discount;
5668 }
5669 
5670 LoopVectorizationCostModel::VectorizationCostTy
5671 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5672   VectorizationCostTy Cost;
5673 
5674   // For each block.
5675   for (BasicBlock *BB : TheLoop->blocks()) {
5676     VectorizationCostTy BlockCost;
5677 
5678     // For each instruction in the old loop.
5679     for (Instruction &I : BB->instructionsWithoutDebug()) {
5680       // Skip ignored values.
5681       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5682           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5683         continue;
5684 
5685       VectorizationCostTy C = getInstructionCost(&I, VF);
5686 
5687       // Check if we should override the cost.
5688       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5689         C.first = ForceTargetInstructionCost;
5690 
5691       BlockCost.first += C.first;
5692       BlockCost.second |= C.second;
5693       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5694                         << " for VF " << VF << " For instruction: " << I
5695                         << '\n');
5696     }
5697 
5698     // If we are vectorizing a predicated block, it will have been
5699     // if-converted. This means that the block's instructions (aside from
5700     // stores and instructions that may divide by zero) will now be
5701     // unconditionally executed. For the scalar case, we may not always execute
5702     // the predicated block. Thus, scale the block's cost by the probability of
5703     // executing it.
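    // For example, a predicated block whose instructions sum to a scalar cost
    // of 10 contributes roughly 5 when it is assumed to run half the time.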
5704     if (VF == 1 && blockNeedsPredication(BB))
5705       BlockCost.first /= getReciprocalPredBlockProb();
5706 
5707     Cost.first += BlockCost.first;
5708     Cost.second |= BlockCost.second;
5709   }
5710 
5711   return Cost;
5712 }
5713 
5714 /// Gets Address Access SCEV after verifying that the access pattern
5715 /// is loop invariant except the induction variable dependence.
5716 ///
5717 /// This SCEV can be sent to the Target in order to estimate the address
5718 /// calculation cost.
5719 static const SCEV *getAddressAccessSCEV(
5720               Value *Ptr,
5721               LoopVectorizationLegality *Legal,
5722               PredicatedScalarEvolution &PSE,
5723               const Loop *TheLoop) {
5724 
5725   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5726   if (!Gep)
5727     return nullptr;
5728 
5729   // We are looking for a gep with all loop invariant indices except for one
5730   // which should be an induction variable.
5731   auto SE = PSE.getSE();
5732   unsigned NumOperands = Gep->getNumOperands();
5733   for (unsigned i = 1; i < NumOperands; ++i) {
5734     Value *Opd = Gep->getOperand(i);
5735     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5736         !Legal->isInductionVariable(Opd))
5737       return nullptr;
5738   }
5739 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5741   return PSE.getSCEV(Ptr);
5742 }
5743 
5744 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5745   return Legal->hasStride(I->getOperand(0)) ||
5746          Legal->hasStride(I->getOperand(1));
5747 }
5748 
5749 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5750                                                                  unsigned VF) {
5751   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5752   Type *ValTy = getMemInstValueType(I);
5753   auto SE = PSE.getSE();
5754 
5755   unsigned AS = getLoadStoreAddressSpace(I);
5756   Value *Ptr = getLoadStorePointerOperand(I);
5757   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5758 
5759   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5761   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5762 
5763   // Get the cost of the scalar memory instruction and address computation.
5764   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5765 
5766   // Don't pass *I here, since it is scalar but will actually be part of a
5767   // vectorized loop where the user of it is a vectorized instruction.
5768   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5769   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5770                                    Alignment, AS);
5771 
5772   // Get the overhead of the extractelement and insertelement instructions
5773   // we might create due to scalarization.
5774   Cost += getScalarizationOverhead(I, VF);
5775 
5776   // If we have a predicated store, it may not be executed for each vector
5777   // lane. Scale the cost by the probability of executing the predicated
5778   // block.
5779   if (isPredicatedInst(I)) {
5780     Cost /= getReciprocalPredBlockProb();
5781 
5782     if (useEmulatedMaskMemRefHack(I))
5783       // Artificially setting to a high enough value to practically disable
5784       // vectorization with such operations.
5785       Cost = 3000000;
5786   }
5787 
5788   return Cost;
5789 }
5790 
5791 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5792                                                              unsigned VF) {
5793   Type *ValTy = getMemInstValueType(I);
5794   Type *VectorTy = ToVectorTy(ValTy, VF);
5795   Value *Ptr = getLoadStorePointerOperand(I);
5796   unsigned AS = getLoadStoreAddressSpace(I);
5797   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5798 
5799   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5800          "Stride should be 1 or -1 for consecutive memory access");
5801   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5802   unsigned Cost = 0;
5803   if (Legal->isMaskRequired(I))
5804     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5805                                       Alignment ? Alignment->value() : 0, AS);
5806   else
5807     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5808 
5809   bool Reverse = ConsecutiveStride < 0;
5810   if (Reverse)
5811     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5812   return Cost;
5813 }
5814 
5815 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5816                                                          unsigned VF) {
5817   Type *ValTy = getMemInstValueType(I);
5818   Type *VectorTy = ToVectorTy(ValTy, VF);
5819   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5820   unsigned AS = getLoadStoreAddressSpace(I);
5821   if (isa<LoadInst>(I)) {
5822     return TTI.getAddressComputationCost(ValTy) +
5823            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5824            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5825   }
5826   StoreInst *SI = cast<StoreInst>(I);
5827 
5828   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5829   return TTI.getAddressComputationCost(ValTy) +
5830          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5831          (isLoopInvariantStoreValue
5832               ? 0
5833               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5834                                        VF - 1));
5835 }
5836 
5837 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5838                                                           unsigned VF) {
5839   Type *ValTy = getMemInstValueType(I);
5840   Type *VectorTy = ToVectorTy(ValTy, VF);
5841   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5842   Value *Ptr = getLoadStorePointerOperand(I);
5843 
5844   return TTI.getAddressComputationCost(VectorTy) +
5845          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5846                                     Legal->isMaskRequired(I),
5847                                     Alignment ? Alignment->value() : 0);
5848 }
5849 
5850 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5851                                                             unsigned VF) {
5852   Type *ValTy = getMemInstValueType(I);
5853   Type *VectorTy = ToVectorTy(ValTy, VF);
5854   unsigned AS = getLoadStoreAddressSpace(I);
5855 
5856   auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");
5858 
5859   unsigned InterleaveFactor = Group->getFactor();
5860   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5861 
5862   // Holds the indices of existing members in an interleaved load group.
5863   // An interleaved store group doesn't need this as it doesn't allow gaps.
5864   SmallVector<unsigned, 4> Indices;
5865   if (isa<LoadInst>(I)) {
5866     for (unsigned i = 0; i < InterleaveFactor; i++)
5867       if (Group->getMember(i))
5868         Indices.push_back(i);
5869   }
5870 
5871   // Calculate the cost of the whole interleaved group.
5872   bool UseMaskForGaps =
5873       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5874   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5875       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5876       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5877 
5878   if (Group->isReverse()) {
5879     // TODO: Add support for reversed masked interleaved access.
5880     assert(!Legal->isMaskRequired(I) &&
5881            "Reverse masked interleaved access not supported.");
5882     Cost += Group->getNumMembers() *
5883             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5884   }
5885   return Cost;
5886 }
5887 
5888 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5889                                                               unsigned VF) {
5890   // Calculate scalar cost only. Vectorization cost should be ready at this
5891   // moment.
5892   if (VF == 1) {
5893     Type *ValTy = getMemInstValueType(I);
5894     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5895     unsigned AS = getLoadStoreAddressSpace(I);
5896 
5897     return TTI.getAddressComputationCost(ValTy) +
5898            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5899   }
5900   return getWideningCost(I, VF);
5901 }
5902 
5903 LoopVectorizationCostModel::VectorizationCostTy
5904 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5905   // If we know that this instruction will remain uniform, check the cost of
5906   // the scalar version.
5907   if (isUniformAfterVectorization(I, VF))
5908     VF = 1;
5909 
5910   if (VF > 1 && isProfitableToScalarize(I, VF))
5911     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5912 
5913   // Forced scalars do not have any scalarization overhead.
5914   auto ForcedScalar = ForcedScalars.find(VF);
5915   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5916     auto InstSet = ForcedScalar->second;
5917     if (InstSet.find(I) != InstSet.end())
5918       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5919   }
5920 
5921   Type *VectorTy;
5922   unsigned C = getInstructionCost(I, VF, VectorTy);
5923 
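  // For example, an i32 operation at VF = 4 on a target with 128-bit vectors
  // is legalized into a single <4 x i32> part, so getNumberOfParts returns
  // 1 < 4 and the type counts as genuinely vectorized rather than scalarized.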
5924   bool TypeNotScalarized =
5925       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5926   return VectorizationCostTy(C, TypeNotScalarized);
5927 }
5928 
5929 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5930                                                               unsigned VF) {
5931 
5932   if (VF == 1)
5933     return 0;
5934 
5935   unsigned Cost = 0;
5936   Type *RetTy = ToVectorTy(I->getType(), VF);
5937   if (!RetTy->isVoidTy() &&
5938       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5939     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5940 
5941   // Some targets keep addresses scalar.
5942   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5943     return Cost;
5944 
5945   // Some targets support efficient element stores.
5946   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5947     return Cost;
5948 
5949   // Collect operands to consider.
5950   CallInst *CI = dyn_cast<CallInst>(I);
5951   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5952 
  // Skip operands that do not require extraction/scalarization and do not
  // incur any overhead.
5955   return Cost + TTI.getOperandsScalarizationOverhead(
5956                     filterExtractingOperands(Ops, VF), VF);
5957 }
5958 
5959 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
5960   if (VF == 1)
5961     return;
5962   NumPredStores = 0;
5963   for (BasicBlock *BB : TheLoop->blocks()) {
5964     // For each instruction in the old loop.
5965     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
5967       if (!Ptr)
5968         continue;
5969 
5970       // TODO: We should generate better code and update the cost model for
5971       // predicated uniform stores. Today they are treated as any other
5972       // predicated store (see added test cases in
5973       // invariant-store-vectorization.ll).
5974       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
5975         NumPredStores++;
5976 
5977       if (Legal->isUniform(Ptr) &&
5978           // Conditional loads and stores should be scalarized and predicated.
5979           // isScalarWithPredication cannot be used here since masked
5980           // gather/scatters are not considered scalar with predication.
5981           !Legal->blockNeedsPredication(I.getParent())) {
5982         // TODO: Avoid replicating loads and stores instead of
5983         // relying on instcombine to remove them.
5984         // Load: Scalar load + broadcast
5985         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5986         unsigned Cost = getUniformMemOpCost(&I, VF);
5987         setWideningDecision(&I, VF, CM_Scalarize, Cost);
5988         continue;
5989       }
5990 
5991       // We assume that widening is the best solution when possible.
5992       if (memoryInstructionCanBeWidened(&I, VF)) {
5993         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
5994         int ConsecutiveStride =
5995                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
5996         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5997                "Expected consecutive stride.");
5998         InstWidening Decision =
5999             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6000         setWideningDecision(&I, VF, Decision, Cost);
6001         continue;
6002       }
6003 
6004       // Choose between Interleaving, Gather/Scatter or Scalarization.
6005       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6006       unsigned NumAccesses = 1;
6007       if (isAccessInterleaved(&I)) {
6008         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
6010 
6011         // Make one decision for the whole group.
6012         if (getWideningDecision(&I, VF) != CM_Unknown)
6013           continue;
6014 
6015         NumAccesses = Group->getNumMembers();
6016         if (interleavedAccessCanBeWidened(&I, VF))
6017           InterleaveCost = getInterleaveGroupCost(&I, VF);
6018       }
6019 
6020       unsigned GatherScatterCost =
6021           isLegalGatherOrScatter(&I)
6022               ? getGatherScatterCost(&I, VF) * NumAccesses
6023               : std::numeric_limits<unsigned>::max();
6024 
6025       unsigned ScalarizationCost =
6026           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6027 
6028       // Choose better solution for the current VF,
6029       // write down this decision and use it during vectorization.
6030       unsigned Cost;
6031       InstWidening Decision;
6032       if (InterleaveCost <= GatherScatterCost &&
6033           InterleaveCost < ScalarizationCost) {
6034         Decision = CM_Interleave;
6035         Cost = InterleaveCost;
6036       } else if (GatherScatterCost < ScalarizationCost) {
6037         Decision = CM_GatherScatter;
6038         Cost = GatherScatterCost;
6039       } else {
6040         Decision = CM_Scalarize;
6041         Cost = ScalarizationCost;
6042       }
      // If the instruction belongs to an interleave group, the whole group
6044       // receives the same decision. The whole group receives the cost, but
6045       // the cost will actually be assigned to one instruction.
6046       if (auto Group = getInterleavedAccessGroup(&I))
6047         setWideningDecision(Group, VF, Decision, Cost);
6048       else
6049         setWideningDecision(&I, VF, Decision, Cost);
6050     }
6051   }
6052 
6053   // Make sure that any load of address and any other address computation
6054   // remains scalar unless there is gather/scatter support. This avoids
6055   // inevitable extracts into address registers, and also has the benefit of
6056   // activating LSR more, since that pass can't optimize vectorized
6057   // addresses.
6058   if (TTI.prefersVectorizedAddressing())
6059     return;
6060 
6061   // Start with all scalar pointer uses.
6062   SmallPtrSet<Instruction *, 8> AddrDefs;
6063   for (BasicBlock *BB : TheLoop->blocks())
6064     for (Instruction &I : *BB) {
6065       Instruction *PtrDef =
6066         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6067       if (PtrDef && TheLoop->contains(PtrDef) &&
6068           getWideningDecision(&I, VF) != CM_GatherScatter)
6069         AddrDefs.insert(PtrDef);
6070     }
6071 
6072   // Add all instructions used to generate the addresses.
6073   SmallVector<Instruction *, 4> Worklist;
6074   for (auto *I : AddrDefs)
6075     Worklist.push_back(I);
6076   while (!Worklist.empty()) {
6077     Instruction *I = Worklist.pop_back_val();
6078     for (auto &Op : I->operands())
6079       if (auto *InstOp = dyn_cast<Instruction>(Op))
6080         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6081             AddrDefs.insert(InstOp).second)
6082           Worklist.push_back(InstOp);
6083   }
6084 
6085   for (auto *I : AddrDefs) {
6086     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
6089       // if the loaded register is involved in an address computation, it is
6090       // instead changed here when we know this is the case.
6091       InstWidening Decision = getWideningDecision(I, VF);
6092       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6093         // Scalarize a widened load of address.
6094         setWideningDecision(I, VF, CM_Scalarize,
6095                             (VF * getMemoryInstructionCost(I, 1)));
6096       else if (auto Group = getInterleavedAccessGroup(I)) {
6097         // Scalarize an interleave group of address loads.
6098         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6099           if (Instruction *Member = Group->getMember(I))
6100             setWideningDecision(Member, VF, CM_Scalarize,
6101                                 (VF * getMemoryInstructionCost(Member, 1)));
6102         }
6103       }
6104     } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
6107       ForcedScalars[VF].insert(I);
6108   }
6109 }
6110 
6111 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6112                                                         unsigned VF,
6113                                                         Type *&VectorTy) {
6114   Type *RetTy = I->getType();
6115   if (canTruncateToMinimalBitwidth(I, VF))
6116     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6117   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6118   auto SE = PSE.getSE();
6119 
6120   // TODO: We need to estimate the cost of intrinsic calls.
6121   switch (I->getOpcode()) {
6122   case Instruction::GetElementPtr:
6123     // We mark this instruction as zero-cost because the cost of GEPs in
6124     // vectorized code depends on whether the corresponding memory instruction
6125     // is scalarized or not. Therefore, we handle GEPs with the memory
6126     // instruction cost.
6127     return 0;
6128   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6132     bool ScalarPredicatedBB = false;
6133     BranchInst *BI = cast<BranchInst>(I);
6134     if (VF > 1 && BI->isConditional() &&
6135         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6136              PredicatedBBsAfterVectorization.end() ||
6137          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6138              PredicatedBBsAfterVectorization.end()))
6139       ScalarPredicatedBB = true;
6140 
6141     if (ScalarPredicatedBB) {
6142       // Return cost for branches around scalarized and predicated blocks.
6143       Type *Vec_i1Ty =
6144           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6145       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6146               (TTI.getCFInstrCost(Instruction::Br) * VF));
6147     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6148       // The back-edge branch will remain, as will all scalar branches.
6149       return TTI.getCFInstrCost(Instruction::Br);
6150     else
6151       // This branch will be eliminated by if-conversion.
6152       return 0;
6153     // Note: We currently assume zero cost for an unconditional branch inside
6154     // a predicated block since it will become a fall-through, although we
6155     // may decide in the future to call TTI for all branches.
6156   }
6157   case Instruction::PHI: {
6158     auto *Phi = cast<PHINode>(I);
6159 
6160     // First-order recurrences are replaced by vector shuffles inside the loop.
6161     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6162     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6163       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6164                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6165 
6166     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6167     // converted into select instructions. We require N - 1 selects per phi
6168     // node, where N is the number of incoming values.
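    // For example, a phi with three incoming values becomes a chain of two
    // vector selects.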
6169     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6170       return (Phi->getNumIncomingValues() - 1) *
6171              TTI.getCmpSelInstrCost(
6172                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6173                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6174 
6175     return TTI.getCFInstrCost(Instruction::PHI);
6176   }
6177   case Instruction::UDiv:
6178   case Instruction::SDiv:
6179   case Instruction::URem:
6180   case Instruction::SRem:
6181     // If we have a predicated instruction, it may not be executed for each
6182     // vector lane. Get the scalarization cost and scale this amount by the
6183     // probability of executing the predicated block. If the instruction is not
6184     // predicated, we fall through to the next case.
6185     if (VF > 1 && isScalarWithPredication(I)) {
6186       unsigned Cost = 0;
6187 
6188       // These instructions have a non-void type, so account for the phi nodes
6189       // that we will create. This cost is likely to be zero. The phi node
6190       // cost, if any, should be scaled by the block probability because it
6191       // models a copy at the end of each predicated block.
6192       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6193 
6194       // The cost of the non-predicated instruction.
6195       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6196 
6197       // The cost of insertelement and extractelement instructions needed for
6198       // scalarization.
6199       Cost += getScalarizationOverhead(I, VF);
6200 
6201       // Scale the cost by the probability of executing the predicated blocks.
6202       // This assumes the predicated block for each vector lane is equally
6203       // likely.
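      // As a sketch of the arithmetic (illustrative numbers only): with VF = 4,
      // a per-lane arithmetic cost of 1, and a reciprocal block probability of
      // 2 (i.e. the predicated block is assumed to execute half the time), the
      // result is roughly (4 * PHI + 4 * 1 + overhead) / 2.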
6204       return Cost / getReciprocalPredBlockProb();
6205     }
6206     LLVM_FALLTHROUGH;
6207   case Instruction::Add:
6208   case Instruction::FAdd:
6209   case Instruction::Sub:
6210   case Instruction::FSub:
6211   case Instruction::Mul:
6212   case Instruction::FMul:
6213   case Instruction::FDiv:
6214   case Instruction::FRem:
6215   case Instruction::Shl:
6216   case Instruction::LShr:
6217   case Instruction::AShr:
6218   case Instruction::And:
6219   case Instruction::Or:
6220   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
6222     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6223       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6226     Value *Op2 = I->getOperand(1);
6227     TargetTransformInfo::OperandValueProperties Op2VP;
6228     TargetTransformInfo::OperandValueKind Op2VK =
6229         TTI.getOperandInfo(Op2, Op2VP);
6230     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6231       Op2VK = TargetTransformInfo::OK_UniformValue;
6232 
6233     SmallVector<const Value *, 4> Operands(I->operand_values());
6234     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6235     return N * TTI.getArithmeticInstrCost(
6236                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6237                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6238   }
6239   case Instruction::FNeg: {
6240     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6241     return N * TTI.getArithmeticInstrCost(
6242                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6243                    TargetTransformInfo::OK_AnyValue,
6244                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6245                    I->getOperand(0), I);
6246   }
6247   case Instruction::Select: {
6248     SelectInst *SI = cast<SelectInst>(I);
6249     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6250     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6251     Type *CondTy = SI->getCondition()->getType();
6252     if (!ScalarCond)
6253       CondTy = VectorType::get(CondTy, VF);
6254 
6255     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6256   }
6257   case Instruction::ICmp:
6258   case Instruction::FCmp: {
6259     Type *ValTy = I->getOperand(0)->getType();
6260     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6261     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6262       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6263     VectorTy = ToVectorTy(ValTy, VF);
6264     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6265   }
6266   case Instruction::Store:
6267   case Instruction::Load: {
6268     unsigned Width = VF;
6269     if (Width > 1) {
6270       InstWidening Decision = getWideningDecision(I, Width);
6271       assert(Decision != CM_Unknown &&
6272              "CM decision should be taken at this point");
6273       if (Decision == CM_Scalarize)
6274         Width = 1;
6275     }
6276     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6277     return getMemoryInstructionCost(I, VF);
6278   }
6279   case Instruction::ZExt:
6280   case Instruction::SExt:
6281   case Instruction::FPToUI:
6282   case Instruction::FPToSI:
6283   case Instruction::FPExt:
6284   case Instruction::PtrToInt:
6285   case Instruction::IntToPtr:
6286   case Instruction::SIToFP:
6287   case Instruction::UIToFP:
6288   case Instruction::Trunc:
6289   case Instruction::FPTrunc:
6290   case Instruction::BitCast: {
6291     // We optimize the truncation of induction variables having constant
6292     // integer steps. The cost of these truncations is the same as the scalar
6293     // operation.
6294     if (isOptimizableIVTruncate(I, VF)) {
6295       auto *Trunc = cast<TruncInst>(I);
6296       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6297                                   Trunc->getSrcTy(), Trunc);
6298     }
6299 
6300     Type *SrcScalarTy = I->getOperand(0)->getType();
6301     Type *SrcVecTy =
6302         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6303     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
6306       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6307       //
6308       // Calculate the modified src and dest types.
6309       Type *MinVecTy = VectorTy;
6310       if (I->getOpcode() == Instruction::Trunc) {
6311         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6312         VectorTy =
6313             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6314       } else if (I->getOpcode() == Instruction::ZExt ||
6315                  I->getOpcode() == Instruction::SExt) {
6316         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6317         VectorTy =
6318             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6319       }
6320     }
6321 
6322     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6323     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6324   }
6325   case Instruction::Call: {
6326     bool NeedToScalarize;
6327     CallInst *CI = cast<CallInst>(I);
6328     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6329     if (getVectorIntrinsicIDForCall(CI, TLI))
6330       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6331     return CallCost;
6332   }
6333   default:
6334     // The cost of executing VF copies of the scalar instruction. This opcode
6335     // is unknown. Assume that it is the same as 'mul'.
6336     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6337            getScalarizationOverhead(I, VF);
6338   } // end of switch.
6339 }
6340 
6341 char LoopVectorize::ID = 0;
6342 
6343 static const char lv_name[] = "Loop Vectorization";
6344 
6345 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6346 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6347 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6348 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6349 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6350 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6351 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6352 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6353 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6354 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6355 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6356 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6357 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6358 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6359 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6360 
6361 namespace llvm {
6362 
6363 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6364 
6365 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6366                               bool VectorizeOnlyWhenForced) {
6367   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6368 }
6369 
6370 } // end namespace llvm
6371 
6372 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6373   // Check if the pointer operand of a load or store instruction is
6374   // consecutive.
6375   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6376     return Legal->isConsecutivePtr(Ptr);
6377   return false;
6378 }
6379 
6380 void LoopVectorizationCostModel::collectValuesToIgnore() {
6381   // Ignore ephemeral values.
6382   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6383 
6384   // Ignore type-promoting instructions we identified during reduction
6385   // detection.
6386   for (auto &Reduction : *Legal->getReductionVars()) {
6387     RecurrenceDescriptor &RedDes = Reduction.second;
6388     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6389     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6390   }
6391   // Ignore type-casting instructions we identified during induction
6392   // detection.
6393   for (auto &Induction : *Legal->getInductionVars()) {
6394     InductionDescriptor &IndDes = Induction.second;
6395     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6396     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6397   }
6398 }
6399 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do that yet because VPlan at the moment
// doesn't have a cost model that can choose which plan to execute if
// more than one is generated.
6405 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6406                                  LoopVectorizationCostModel &CM) {
6407   unsigned WidestType;
6408   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
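  // For example, 256-bit wide vector registers and a widest scalar type of
  // 32 bits yield a VF of 256 / 32 = 8.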
6409   return WidestVectorRegBits / WidestType;
6410 }
6411 
6412 VectorizationFactor
6413 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6414   unsigned VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
6419   if (!OrigLoop->empty()) {
6420     // If the user doesn't provide a vectorization factor, determine a
6421     // reasonable one.
6422     if (!UserVF) {
6423       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6424       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6425 
6426       // Make sure we have a VF > 1 for stress testing.
6427       if (VPlanBuildStressTest && VF < 2) {
6428         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6429                           << "overriding computed VF.\n");
6430         VF = 4;
6431       }
6432     }
6433     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6434     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6435     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6436                       << " to build VPlans.\n");
6437     buildVPlans(VF, VF);
6438 
6439     // For VPlan build stress testing, we bail out after VPlan construction.
6440     if (VPlanBuildStressTest)
6441       return VectorizationFactor::Disabled();
6442 
6443     return {VF, 0};
6444   }
6445 
6446   LLVM_DEBUG(
6447       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6448                 "VPlan-native path.\n");
6449   return VectorizationFactor::Disabled();
6450 }
6451 
6452 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6453   assert(OrigLoop->empty() && "Inner loop expected.");
6454   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6456     return None;
6457 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
6459   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6460       !useMaskedInterleavedAccesses(*TTI)) {
6461     LLVM_DEBUG(
6462         dbgs()
6463         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6464            "which requires masked-interleaved support.\n");
6465     CM.InterleaveInfo.reset();
6466   }
6467 
6468   if (UserVF) {
6469     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6470     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6471     // Collect the instructions (and their associated costs) that will be more
6472     // profitable to scalarize.
6473     CM.selectUserVectorizationFactor(UserVF);
6474     buildVPlansWithVPRecipes(UserVF, UserVF);
6475     LLVM_DEBUG(printPlans(dbgs()));
6476     return {{UserVF, 0}};
6477   }
6478 
6479   unsigned MaxVF = MaybeMaxVF.getValue();
6480   assert(MaxVF != 0 && "MaxVF is zero.");
6481 
6482   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6483     // Collect Uniform and Scalar instructions after vectorization with VF.
6484     CM.collectUniformsAndScalars(VF);
6485 
6486     // Collect the instructions (and their associated costs) that will be more
6487     // profitable to scalarize.
6488     if (VF > 1)
6489       CM.collectInstsToScalarize(VF);
6490   }
6491 
6492   buildVPlansWithVPRecipes(1, MaxVF);
6493   LLVM_DEBUG(printPlans(dbgs()));
6494   if (MaxVF == 1)
6495     return VectorizationFactor::Disabled();
6496 
6497   // Select the optimal vectorization factor.
6498   return CM.selectVectorizationFactor(MaxVF);
6499 }
6500 
6501 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6502   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6503                     << '\n');
6504   BestVF = VF;
6505   BestUF = UF;
6506 
6507   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6508     return !Plan->hasVF(VF);
6509   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
6511 }
6512 
6513 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6514                                            DominatorTree *DT) {
6515   // Perform the actual loop transformation.
6516 
6517   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6518   VPCallbackILV CallbackILV(ILV);
6519 
6520   VPTransformState State{BestVF, BestUF,      LI,
6521                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6522                          &ILV,   CallbackILV};
6523   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6524   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6525 
6526   //===------------------------------------------------===//
6527   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
6531   //
6532   //===------------------------------------------------===//
6533 
6534   // 2. Copy and widen instructions from the old loop into the new loop.
6535   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6536   VPlans.front()->execute(&State);
6537 
6538   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6539   //    predication, updating analyses.
6540   ILV.fixVectorizedLoop();
6541 }
6542 
6543 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6544     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6545   BasicBlock *Latch = OrigLoop->getLoopLatch();
6546 
6547   // We create new control-flow for the vectorized loop, so the original
6548   // condition will be dead after vectorization if it's only used by the
6549   // branch.
6550   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6551   if (Cmp && Cmp->hasOneUse())
6552     DeadInstructions.insert(Cmp);
6553 
6554   // We create new "steps" for induction variable updates to which the original
6555   // induction variables map. An original update instruction will be dead if
6556   // all its users except the induction variable are dead.
6557   for (auto &Induction : *Legal->getInductionVars()) {
6558     PHINode *Ind = Induction.first;
6559     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6560     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6561           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6562                                  DeadInstructions.end();
6563         }))
6564       DeadInstructions.insert(IndUpdate);
6565 
    // We also record as "Dead" the type-casting instructions we had identified
6567     // during induction analysis. We don't need any handling for them in the
6568     // vectorized loop because we have proven that, under a proper runtime
6569     // test guarding the vectorized loop, the value of the phi, and the casted
6570     // value of the phi, are the same. The last instruction in this casting chain
6571     // will get its scalar/vector/widened def from the scalar/vector/widened def
6572     // of the respective phi node. Any other casts in the induction def-use chain
6573     // have no other uses outside the phi update chain, and will be ignored.
6574     InductionDescriptor &IndDes = Induction.second;
6575     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6576     DeadInstructions.insert(Casts.begin(), Casts.end());
6577   }
6578 }
6579 
6580 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6581 
6582 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6583 
6584 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6585                                         Instruction::BinaryOps BinOp) {
6586   // When unrolling and the VF is 1, we only need to add a simple scalar.
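  // For example, for unroll part StartIdx = 2 and Step = 1 this simply yields
  // Val + 2 (or the corresponding FP computation below).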
6587   Type *Ty = Val->getType();
6588   assert(!Ty->isVectorTy() && "Val must be a scalar");
6589 
6590   if (Ty->isFloatingPointTy()) {
6591     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6592 
6593     // Floating point operations had to be 'fast' to enable the unrolling.
6594     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6595     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6596   }
6597   Constant *C = ConstantInt::get(Ty, StartIdx);
6598   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6599 }
6600 
6601 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6602   SmallVector<Metadata *, 4> MDs;
6603   // Reserve first location for self reference to the LoopID metadata node.
6604   MDs.push_back(nullptr);
6605   bool IsUnrollMetadata = false;
6606   MDNode *LoopID = L->getLoopID();
6607   if (LoopID) {
6608     // First find existing loop unrolling disable metadata.
6609     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6610       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6611       if (MD) {
6612         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6613         IsUnrollMetadata =
6614             S && S->getString().startswith("llvm.loop.unroll.disable");
6615       }
6616       MDs.push_back(LoopID->getOperand(i));
6617     }
6618   }
6619 
6620   if (!IsUnrollMetadata) {
6621     // Add runtime unroll disable metadata.
6622     LLVMContext &Context = L->getHeader()->getContext();
6623     SmallVector<Metadata *, 1> DisableOperands;
6624     DisableOperands.push_back(
6625         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6626     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6627     MDs.push_back(DisableNode);
6628     MDNode *NewLoopID = MDNode::get(Context, MDs);
6629     // Set operand 0 to refer to the loop id itself.
6630     NewLoopID->replaceOperandWith(0, NewLoopID);
6631     L->setLoopID(NewLoopID);
6632   }
6633 }
6634 
6635 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6636     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6637   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
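  // For example, for Range = [2, 16) and a predicate that holds for VF 2 and 4
  // but not for VF 8, the range is clamped to [2, 8) and the predicate's value
  // at the range start (true) is returned.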
6638   bool PredicateAtRangeStart = Predicate(Range.Start);
6639 
6640   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6641     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6642       Range.End = TmpVF;
6643       break;
6644     }
6645 
6646   return PredicateAtRangeStart;
6647 }
6648 
/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VFs starting at a given VF and extending it as much as possible. Each
6652 /// vectorization decision can potentially shorten this sub-range during
6653 /// buildVPlan().
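/// For example, with MinVF = 1 and MaxVF = 8 this might build plans covering
/// the sub-ranges {1}, {2, 4} and {8}; the actual partition depends on how
/// each buildVPlan call clamps its sub-range.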
6654 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6655   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6656     VFRange SubRange = {VF, MaxVF + 1};
6657     VPlans.push_back(buildVPlan(SubRange));
6658     VF = SubRange.End;
6659   }
6660 }
6661 
6662 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6663                                          VPlanPtr &Plan) {
6664   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6665 
6666   // Look for cached value.
6667   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6668   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6669   if (ECEntryIt != EdgeMaskCache.end())
6670     return ECEntryIt->second;
6671 
6672   VPValue *SrcMask = createBlockInMask(Src, Plan);
6673 
6674   // The terminator has to be a branch inst!
6675   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6676   assert(BI && "Unexpected terminator found");
6677 
6678   if (!BI->isConditional())
6679     return EdgeMaskCache[Edge] = SrcMask;
6680 
6681   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6682   assert(EdgeMask && "No Edge Mask found for condition");
6683 
6684   if (BI->getSuccessor(0) != Dst)
6685     EdgeMask = Builder.createNot(EdgeMask);
6686 
6687   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6688     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6689 
6690   return EdgeMaskCache[Edge] = EdgeMask;
6691 }
6692 
6693 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6694   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6695 
6696   // Look for cached value.
6697   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6698   if (BCEntryIt != BlockMaskCache.end())
6699     return BCEntryIt->second;
6700 
6701   // All-one mask is modelled as no-mask following the convention for masked
6702   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6703   VPValue *BlockMask = nullptr;
6704 
6705   if (OrigLoop->getHeader() == BB) {
6706     if (!CM.blockNeedsPredication(BB))
6707       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6708 
6709     // Introduce the early-exit compare IV <= BTC to form header block mask.
6710     // This is used instead of IV < TC because TC may wrap, unlike BTC.
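    // For example, for a trip count of 10 and VF = 4, the last vector
    // iteration compares IV values {8, 9, 10, 11} against BTC = 9, masking off
    // the two out-of-bounds lanes.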
6711     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6712     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6713     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6714     return BlockMaskCache[BB] = BlockMask;
6715   }
6716 
6717   // This is the block mask. We OR all incoming edges.
6718   for (auto *Predecessor : predecessors(BB)) {
6719     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6720     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6721       return BlockMaskCache[BB] = EdgeMask;
6722 
    if (!BlockMask) { // BlockMask still has its initial nullptr value.
6724       BlockMask = EdgeMask;
6725       continue;
6726     }
6727 
6728     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6729   }
6730 
6731   return BlockMaskCache[BB] = BlockMask;
6732 }
6733 
6734 VPWidenMemoryInstructionRecipe *
6735 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6736                                   VPlanPtr &Plan) {
6737   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6738     return nullptr;
6739 
6740   auto willWiden = [&](unsigned VF) -> bool {
6741     if (VF == 1)
6742       return false;
6743     LoopVectorizationCostModel::InstWidening Decision =
6744         CM.getWideningDecision(I, VF);
6745     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6746            "CM decision should be taken at this point.");
6747     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6748       return true;
6749     if (CM.isScalarAfterVectorization(I, VF) ||
6750         CM.isProfitableToScalarize(I, VF))
6751       return false;
6752     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6753   };
6754 
6755   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6756     return nullptr;
6757 
6758   VPValue *Mask = nullptr;
6759   if (Legal->isMaskRequired(I))
6760     Mask = createBlockInMask(I->getParent(), Plan);
6761 
6762   return new VPWidenMemoryInstructionRecipe(*I, Mask);
6763 }
6764 
6765 VPWidenIntOrFpInductionRecipe *
6766 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6767   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6768     // Check if this is an integer or fp induction. If so, build the recipe that
6769     // produces its scalar and vector values.
6770     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6771     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6772         II.getKind() == InductionDescriptor::IK_FpInduction)
6773       return new VPWidenIntOrFpInductionRecipe(Phi);
6774 
6775     return nullptr;
6776   }
6777 
6778   // Optimize the special case where the source is a constant integer
6779   // induction variable. Notice that we can only optimize the 'trunc' case
6780   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6781   // (c) other casts depend on pointer size.
6782 
6783   // Determine whether \p K is a truncation based on an induction variable that
6784   // can be optimized.
6785   auto isOptimizableIVTruncate =
6786       [&](Instruction *K) -> std::function<bool(unsigned)> {
6787     return
6788         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6789   };
6790 
6791   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6792                                isOptimizableIVTruncate(I), Range))
6793     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6794                                              cast<TruncInst>(I));
6795   return nullptr;
6796 }
6797 
6798 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6799   PHINode *Phi = dyn_cast<PHINode>(I);
6800   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6801     return nullptr;
6802 
6803   // We know that all PHIs in non-header blocks are converted into selects, so
6804   // we don't have to worry about the insertion order and we can just use the
6805   // builder. At this point we generate the predication tree. There may be
6806   // duplications since this is a simple recursive scan, but future
6807   // optimizations will clean it up.
6808 
6809   SmallVector<VPValue *, 2> Masks;
6810   unsigned NumIncoming = Phi->getNumIncomingValues();
6811   for (unsigned In = 0; In < NumIncoming; In++) {
6812     VPValue *EdgeMask =
6813       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6814     assert((EdgeMask || NumIncoming == 1) &&
6815            "Multiple predecessors with one having a full mask");
6816     if (EdgeMask)
6817       Masks.push_back(EdgeMask);
6818   }
6819   return new VPBlendRecipe(Phi, Masks);
6820 }
6821 
6822 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6823                                  VFRange &Range) {
6824 
6825   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6826       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6827 
6828   if (IsPredicated)
6829     return false;
6830 
6831   auto IsVectorizableOpcode = [](unsigned Opcode) {
6832     switch (Opcode) {
6833     case Instruction::Add:
6834     case Instruction::And:
6835     case Instruction::AShr:
6836     case Instruction::BitCast:
6837     case Instruction::Br:
6838     case Instruction::Call:
6839     case Instruction::FAdd:
6840     case Instruction::FCmp:
6841     case Instruction::FDiv:
6842     case Instruction::FMul:
6843     case Instruction::FNeg:
6844     case Instruction::FPExt:
6845     case Instruction::FPToSI:
6846     case Instruction::FPToUI:
6847     case Instruction::FPTrunc:
6848     case Instruction::FRem:
6849     case Instruction::FSub:
6850     case Instruction::ICmp:
6851     case Instruction::IntToPtr:
6852     case Instruction::Load:
6853     case Instruction::LShr:
6854     case Instruction::Mul:
6855     case Instruction::Or:
6856     case Instruction::PHI:
6857     case Instruction::PtrToInt:
6858     case Instruction::SDiv:
6859     case Instruction::Select:
6860     case Instruction::SExt:
6861     case Instruction::Shl:
6862     case Instruction::SIToFP:
6863     case Instruction::SRem:
6864     case Instruction::Store:
6865     case Instruction::Sub:
6866     case Instruction::Trunc:
6867     case Instruction::UDiv:
6868     case Instruction::UIToFP:
6869     case Instruction::URem:
6870     case Instruction::Xor:
6871     case Instruction::ZExt:
6872       return true;
6873     }
6874     return false;
6875   };
6876 
6877   if (!IsVectorizableOpcode(I->getOpcode()))
6878     return false;
6879 
6880   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6881     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6882     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6883                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6884       return false;
6885   }
6886 
6887   auto willWiden = [&](unsigned VF) -> bool {
6888     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6889                              CM.isProfitableToScalarize(I, VF)))
6890       return false;
6891     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6892       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      // The following case may be scalarized depending on the VF.
      // The flag shows whether we use an intrinsic or a plain call for the
      // vectorized version of the instruction.
      // Is it beneficial to perform the intrinsic call rather than a lib call?
6897       bool NeedToScalarize;
6898       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6899       bool UseVectorIntrinsic =
6900           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6901       return UseVectorIntrinsic || !NeedToScalarize;
6902     }
6903     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6904       assert(CM.getWideningDecision(I, VF) ==
6905                  LoopVectorizationCostModel::CM_Scalarize &&
6906              "Memory widening decisions should have been taken care by now");
6907       return false;
6908     }
6909     return true;
6910   };
6911 
6912   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6913     return false;
6914   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6915   // to avoid having to split recipes later.
6916   bool IsSingleton = Ingredient2Recipe.count(I);
6917 
6918   // Success: widen this instruction.
6919 
6920   // Use the default widening recipe. We optimize the common case where
6921   // consecutive instructions can be represented by a single recipe.
6922   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6923       LastExtensibleRecipe->appendInstruction(I))
6924     return true;
6925 
6926   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6927   if (!IsSingleton)
6928     LastExtensibleRecipe = WidenRecipe;
6929   setRecipe(I, WidenRecipe);
6930   VPBB->appendRecipe(WidenRecipe);
6931   return true;
6932 }
6933 
6934 VPBasicBlock *VPRecipeBuilder::handleReplication(
6935     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6936     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6937     VPlanPtr &Plan) {
6938   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6939       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6940       Range);
6941 
6942   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6943       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6944 
6945   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6946   setRecipe(I, Recipe);
6947 
6948   // Find if I uses a predicated instruction. If so, it will use its scalar
6949   // value. Avoid hoisting the insert-element which packs the scalar value into
6950   // a vector value, as that happens iff all users use the vector value.
6951   for (auto &Op : I->operands())
6952     if (auto *PredInst = dyn_cast<Instruction>(Op))
6953       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6954         PredInst2Recipe[PredInst]->setAlsoPack(false);
6955 
6956   // Finalize the recipe for Instr, first if it is not predicated.
6957   if (!IsPredicated) {
6958     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6959     VPBB->appendRecipe(Recipe);
6960     return VPBB;
6961   }
6962   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6963   assert(VPBB->getSuccessors().empty() &&
6964          "VPBB has successors when handling predicated replication.");
6965   // Record predicated instructions for above packing optimizations.
6966   PredInst2Recipe[I] = Recipe;
6967   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
6968   VPBlockUtils::insertBlockAfter(Region, VPBB);
6969   auto *RegSucc = new VPBasicBlock();
6970   VPBlockUtils::insertBlockAfter(RegSucc, Region);
6971   return RegSucc;
6972 }
6973 
6974 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
6975                                                       VPRecipeBase *PredRecipe,
6976                                                       VPlanPtr &Plan) {
6977   // Instructions marked for predication are replicated and placed under an
6978   // if-then construct to prevent side-effects.
6979 
6980   // Generate recipes to compute the block mask for this region.
6981   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
6982 
6983   // Build the triangular if-then region.
6984   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
6985   assert(Instr->getParent() && "Predicated instruction not in any basic block");
6986   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
6987   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
6988   auto *PHIRecipe =
6989       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
6990   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
6991   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
6992   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
6993 
6994   // Note: first set Entry as region entry and then connect successors starting
6995   // from it in order, to propagate the "parent" of each VPBasicBlock.
6996   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
6997   VPBlockUtils::connectBlocks(Pred, Exit);
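  // The resulting single-entry single-exit region has a triangular shape:
  // Entry branches on BlockInMask either into Pred (the predicated block
  // holding PredRecipe) or directly to Exit, and Pred falls through to Exit.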
6998 
6999   return Region;
7000 }
7001 
7002 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7003                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7004   VPRecipeBase *Recipe = nullptr;
7005 
7006   // First, check for specific widening recipes that deal with memory
7007   // operations, inductions and Phi nodes.
7008   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7009       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7010       (Recipe = tryToBlend(Instr, Plan)) ||
7011       (isa<PHINode>(Instr) &&
7012        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7013     setRecipe(Instr, Recipe);
7014     VPBB->appendRecipe(Recipe);
7015     return true;
7016   }
7017 
7018   // Handle GEP widening.
7019   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7020     auto Scalarize = [&](unsigned VF) {
7021       return CM.isScalarWithPredication(Instr, VF) ||
7022              CM.isScalarAfterVectorization(Instr, VF) ||
7023              CM.isProfitableToScalarize(Instr, VF);
7024     };
7025     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7026       return false;
7027     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7028     setRecipe(Instr, Recipe);
7029     VPBB->appendRecipe(Recipe);
7030     return true;
7031   }
7032 
7033   // Check if Instr is to be widened by a general VPWidenRecipe, after
7034   // having first checked for specific widening recipes.
7035   if (tryToWiden(Instr, VPBB, Range))
7036     return true;
7037 
7038   return false;
7039 }
7040 
7041 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7042                                                         unsigned MaxVF) {
7043   assert(OrigLoop->empty() && "Inner loop expected.");
7044 
7045   // Collect conditions feeding internal conditional branches; they need to be
7046   // represented in VPlan for it to model masking.
7047   SmallPtrSet<Value *, 1> NeedDef;
7048 
7049   auto *Latch = OrigLoop->getLoopLatch();
7050   for (BasicBlock *BB : OrigLoop->blocks()) {
7051     if (BB == Latch)
7052       continue;
7053     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7054     if (Branch && Branch->isConditional())
7055       NeedDef.insert(Branch->getCondition());
7056   }
7057 
7058   // If the tail is to be folded by masking, the primary induction variable
7059   // needs to be represented in VPlan for it to model early-exit masking.
7060   // Also, both the Phi and the live-out instruction of each reduction are
7061   // required in order to introduce a select between them in VPlan.
7062   if (CM.foldTailByMasking()) {
7063     NeedDef.insert(Legal->getPrimaryInduction());
7064     for (auto &Reduction : *Legal->getReductionVars()) {
7065       NeedDef.insert(Reduction.first);
7066       NeedDef.insert(Reduction.second.getLoopExitInstr());
7067     }
7068   }
7069 
7070   // Collect instructions from the original loop that will become trivially dead
7071   // in the vectorized loop. We don't need to vectorize these instructions. For
7072   // example, original induction update instructions can become dead because we
7073   // separately emit induction "steps" when generating code for the new loop.
7074   // Similarly, we create a new latch condition when setting up the structure
7075   // of the new loop, so the old one can become dead.
7076   SmallPtrSet<Instruction *, 4> DeadInstructions;
7077   collectTriviallyDeadInstructions(DeadInstructions);
7078 
7079   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7080     VFRange SubRange = {VF, MaxVF + 1};
7081     VPlans.push_back(
7082         buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
7083     VF = SubRange.End;
7084   }
7085 }
7086 
7087 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7088     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7089     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7090 
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7094   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7095 
7096   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7097 
7098   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7099 
7100   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7101 
7102   // ---------------------------------------------------------------------------
7103   // Pre-construction: record ingredients whose recipes we'll need to further
7104   // process after constructing the initial VPlan.
7105   // ---------------------------------------------------------------------------
7106 
7107   // Mark instructions we'll need to sink later and their targets as
7108   // ingredients whose recipe we'll need to record.
7109   for (auto &Entry : SinkAfter) {
7110     RecipeBuilder.recordRecipeOf(Entry.first);
7111     RecipeBuilder.recordRecipeOf(Entry.second);
7112   }
7113 
7114   // For each interleave group which is relevant for this (possibly trimmed)
7115   // Range, add it to the set of groups to be later applied to the VPlan and add
7116   // placeholders for its members' Recipes which we'll be replacing with a
7117   // single VPInterleaveRecipe.
7118   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7119     auto applyIG = [IG, this](unsigned VF) -> bool {
7120       return (VF >= 2 && // Query is illegal for VF == 1
7121               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7122                   LoopVectorizationCostModel::CM_Interleave);
7123     };
7124     if (!getDecisionAndClampRange(applyIG, Range))
7125       continue;
7126     InterleaveGroups.insert(IG);
7127     for (unsigned i = 0; i < IG->getFactor(); i++)
7128       if (Instruction *Member = IG->getMember(i))
7129         RecipeBuilder.recordRecipeOf(Member);
  }
7131 
7132   // ---------------------------------------------------------------------------
7133   // Build initial VPlan: Scan the body of the loop in a topological order to
7134   // visit each basic block after having visited its predecessor basic blocks.
7135   // ---------------------------------------------------------------------------
7136 
7137   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7138   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7139   auto Plan = std::make_unique<VPlan>(VPBB);
7140 
7141   // Represent values that will have defs inside VPlan.
7142   for (Value *V : NeedDef)
7143     Plan->addVPValue(V);
7144 
7145   // Scan the body of the loop in a topological order to visit each basic block
7146   // after having visited its predecessor basic blocks.
7147   LoopBlocksDFS DFS(OrigLoop);
7148   DFS.perform(LI);
7149 
7150   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7151     // Relevant instructions from basic block BB will be grouped into VPRecipe
7152     // ingredients and fill a new VPBasicBlock.
7153     unsigned VPBBsForBB = 0;
7154     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7155     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7156     VPBB = FirstVPBBForBB;
7157     Builder.setInsertPoint(VPBB);
7158 
7159     // Introduce each ingredient into VPlan.
7160     for (Instruction &I : BB->instructionsWithoutDebug()) {
7161       Instruction *Instr = &I;
7162 
7163       // First filter out irrelevant instructions, to ensure no recipes are
7164       // built for them.
7165       if (isa<BranchInst>(Instr) ||
7166           DeadInstructions.find(Instr) != DeadInstructions.end())
7167         continue;
7168 
7169       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7170         continue;
7171 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7174       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7175           Instr, Range, VPBB, PredInst2Recipe, Plan);
7176       if (NextVPBB != VPBB) {
7177         VPBB = NextVPBB;
7178         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7179                                     : "");
7180       }
7181     }
7182   }
7183 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
7187   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7188   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7189   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7190   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7191   delete PreEntry;
7192 
7193   // ---------------------------------------------------------------------------
7194   // Transform initial VPlan: Apply previously taken decisions, in order, to
7195   // bring the VPlan to its final state.
7196   // ---------------------------------------------------------------------------
7197 
7198   // Apply Sink-After legal constraints.
7199   for (auto &Entry : SinkAfter) {
7200     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7201     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7202     Sink->moveAfter(Target);
7203   }
7204 
7205   // Interleave memory: for each Interleave Group we marked earlier as relevant
7206   // for this VPlan, replace the Recipes widening its memory instructions with a
7207   // single VPInterleaveRecipe at its insertion point.
7208   for (auto IG : InterleaveGroups) {
7209     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7210         RecipeBuilder.getRecipe(IG->getInsertPos()));
7211     (new VPInterleaveRecipe(IG, Recipe->getMask()))->insertBefore(Recipe);
7212 
7213     for (unsigned i = 0; i < IG->getFactor(); ++i)
7214       if (Instruction *Member = IG->getMember(i)) {
7215         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7216       }
7217   }
7218 
7219   // Finally, if tail is folded by masking, introduce selects between the phi
7220   // and the live-out instruction of each reduction, at the end of the latch.
7221   if (CM.foldTailByMasking()) {
7222     Builder.setInsertPoint(VPBB);
7223     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7224     for (auto &Reduction : *Legal->getReductionVars()) {
7225       VPValue *Phi = Plan->getVPValue(Reduction.first);
7226       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7227       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7228     }
7229   }
7230 
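  // Name the plan after the VFs it covers, e.g.
  // "Initial VPlan for VF={1,2,4},UF>=1" for Range = [1, 8).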
7231   std::string PlanName;
7232   raw_string_ostream RSO(PlanName);
7233   unsigned VF = Range.Start;
7234   Plan->addVF(VF);
7235   RSO << "Initial VPlan for VF={" << VF;
7236   for (VF *= 2; VF < Range.End; VF *= 2) {
7237     Plan->addVF(VF);
7238     RSO << "," << VF;
7239   }
7240   RSO << "},UF>=1";
7241   RSO.flush();
7242   Plan->setName(PlanName);
7243 
7244   return Plan;
7245 }
7246 
7247 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
7252   assert(!OrigLoop->empty());
7253   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7254 
7255   // Create new empty VPlan
7256   auto Plan = std::make_unique<VPlan>();
7257 
7258   // Build hierarchical CFG
7259   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7260   HCFGBuilder.buildHierarchicalCFG();
7261 
7262   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7263     Plan->addVF(VF);
7264 
7265   if (EnableVPlanPredication) {
7266     VPlanPredicator VPP(*Plan);
7267     VPP.predicate();
7268 
7269     // Avoid running transformation to recipes until masked code generation in
7270     // VPlan-native path is in place.
7271     return Plan;
7272   }
7273 
7274   SmallPtrSet<Instruction *, 1> DeadInstructions;
7275   VPlanTransforms::VPInstructionsToVPRecipes(
7276       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7277   return Plan;
7278 }
7279 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7284 
7285 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7286   O << " +\n"
7287     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7288   IG->getInsertPos()->printAsOperand(O, false);
7289   if (User) {
7290     O << ", ";
7291     User->getOperand(0)->printAsOperand(O);
7292   }
7293   O << "\\l\"";
7294   for (unsigned i = 0; i < IG->getFactor(); ++i)
7295     if (Instruction *I = IG->getMember(i))
7296       O << " +\n"
7297         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7298 }
7299 
7300 void VPWidenRecipe::execute(VPTransformState &State) {
7301   for (auto &Instr : make_range(Begin, End))
7302     State.ILV->widenInstruction(Instr);
7303 }
7304 
7305 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7306   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7307                       IsIndexLoopInvariant);
7308 }
7309 
7310 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7311   assert(!State.Instance && "Int or FP induction being replicated.");
7312   State.ILV->widenIntOrFpInduction(IV, Trunc);
7313 }
7314 
7315 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7316   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7317 }
7318 
7319 void VPBlendRecipe::execute(VPTransformState &State) {
7320   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7321   // We know that all PHIs in non-header blocks are converted into
7322   // selects, so we don't have to worry about the insertion order and we
7323   // can just use the builder.
7324   // At this point we generate the predication tree. There may be
7325   // duplications since this is a simple recursive scan, but future
7326   // optimizations will clean it up.
7327 
7328   unsigned NumIncoming = Phi->getNumIncomingValues();
7329 
  assert((User || NumIncoming == 1) &&
         "Multiple predecessors with one having a full mask");
7332   // Generate a sequence of selects of the form:
7333   // SELECT(Mask3, In3,
7334   //      SELECT(Mask2, In2,
7335   //                   ( ...)))
7336   InnerLoopVectorizer::VectorParts Entry(State.UF);
7337   for (unsigned In = 0; In < NumIncoming; ++In) {
7338     for (unsigned Part = 0; Part < State.UF; ++Part) {
7339       // We might have single edge PHIs (blocks) - use an identity
7340       // 'select' for the first PHI operand.
7341       Value *In0 =
7342           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7343       if (In == 0)
7344         Entry[Part] = In0; // Initialize with the first incoming value.
7345       else {
7346         // Select between the current value and the previous incoming edge
7347         // based on the incoming mask.
7348         Value *Cond = State.get(User->getOperand(In), Part);
7349         Entry[Part] =
7350             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7351       }
7352     }
7353   }
7354   for (unsigned Part = 0; Part < State.UF; ++Part)
7355     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7356 }
7357 
7358 void VPInterleaveRecipe::execute(VPTransformState &State) {
7359   assert(!State.Instance && "Interleave group being replicated.");
7360   if (!User)
7361     return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
7362 
7363   // Last (and currently only) operand is a mask.
7364   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7365   VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
7366   for (unsigned Part = 0; Part < State.UF; ++Part)
7367     MaskValues[Part] = State.get(Mask, Part);
7368   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
7369 }
7370 
7371 void VPReplicateRecipe::execute(VPTransformState &State) {
7372   if (State.Instance) { // Generate a single instance.
7373     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7374     // Insert scalar instance packing it into a vector.
7375     if (AlsoPack && State.VF > 1) {
7376       // If we're constructing lane 0, initialize to start from undef.
7377       if (State.Instance->Lane == 0) {
7378         Value *Undef =
7379             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7380         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7381       }
7382       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7383     }
7384     return;
7385   }
7386 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
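  // For example (illustrative numbers only), with UF = 2 and VF = 4 a
  // non-uniform ingredient is scalarized into 8 copies, one per (Part, Lane)
  // pair, while a uniform one gets just 2 copies, one per part at lane 0.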
7390   unsigned EndLane = IsUniform ? 1 : State.VF;
7391   for (unsigned Part = 0; Part < State.UF; ++Part)
7392     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7393       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7394 }
7395 
7396 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7397   assert(State.Instance && "Branch on Mask works only on single instance.");
7398 
7399   unsigned Part = State.Instance->Part;
7400   unsigned Lane = State.Instance->Lane;
7401 
7402   Value *ConditionBit = nullptr;
7403   if (!User) // Block in mask is all-one.
7404     ConditionBit = State.Builder.getTrue();
7405   else {
7406     VPValue *BlockInMask = User->getOperand(0);
7407     ConditionBit = State.get(BlockInMask, Part);
7408     if (ConditionBit->getType()->isVectorTy())
7409       ConditionBit = State.Builder.CreateExtractElement(
7410           ConditionBit, State.Builder.getInt32(Lane));
7411   }
7412 
7413   // Replace the temporary unreachable terminator with a new conditional branch,
7414   // whose two destinations will be set later when they are created.
7415   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7416   assert(isa<UnreachableInst>(CurrentTerminator) &&
7417          "Expected to replace unreachable terminator with conditional branch.");
7418   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7419   CondBr->setSuccessor(0, nullptr);
7420   ReplaceInstWithInst(CurrentTerminator, CondBr);
7421 }
7422 
7423 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7424   assert(State.Instance && "Predicated instruction PHI works per instance.");
7425   Instruction *ScalarPredInst = cast<Instruction>(
7426       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7427   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7428   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7429   assert(PredicatingBB && "Predicated block has no single predecessor.");
7430 
7431   // By current pack/unpack logic we need to generate only a single phi node: if
7432   // a vector value for the predicated instruction exists at this point it means
7433   // the instruction has vector users only, and a phi for the vector value is
7434   // needed. In this case the recipe of the predicated instruction is marked to
7435   // also do that packing, thereby "hoisting" the insert-element sequence.
7436   // Otherwise, a phi node for the scalar value is needed.
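  // A sketch of the two resulting PHIs (block and value names below are
  // illustrative only):
  //   vector:  %vphi = phi <VF x Ty> [ %wide.before, %predicating.bb ],
  //                                  [ %wide.inserted, %predicated.bb ]
  //   scalar:  %sphi = phi Ty [ undef, %predicating.bb ],
  //                           [ %scalar.val, %predicated.bb ]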
7437   unsigned Part = State.Instance->Part;
7438   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7439     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7440     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7441     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7442     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7443     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7444     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7445   } else {
7446     Type *PredInstType = PredInst->getType();
7447     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7448     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7449     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7450     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7451   }
7452 }
7453 
7454 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7455   VPValue *Mask = getMask();
7456   if (!Mask)
7457     return State.ILV->vectorizeMemoryInstruction(&Instr);
7458 
7459   InnerLoopVectorizer::VectorParts MaskValues(State.UF);
7460   for (unsigned Part = 0; Part < State.UF; ++Part)
7461     MaskValues[Part] = State.get(Mask, Part);
7462   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
7463 }
7464 
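// Decide how the scalar epilogue of a vectorized loop should be lowered: allow
// it (the default), forbid it because the loop or function is optimized for
// size, or make it unnecessary by folding the tail of the loop into the vector
// body under predication, when the target or the user prefers predication over
// an epilogue.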
7465 static ScalarEpilogueLowering
7466 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7467                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
7468                           TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7469                           AssumptionCache *AC, LoopInfo *LI,
7470                           ScalarEvolution *SE, DominatorTree *DT,
7471                           const LoopAccessInfo *LAI) {
7472   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7473   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7474                               !PreferPredicateOverEpilog;
7475 
7476   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
7477       (F->hasOptSize() ||
7478        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7479                                    PGSOQueryType::IRPass)))
7480     SEL = CM_ScalarEpilogueNotAllowedOptSize;
7481   else if (PreferPredicateOverEpilog ||
7482            Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7483            (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) &&
7484             Hints.getPredicate() != LoopVectorizeHints::FK_Disabled &&
7485             !PredicateOptDisabled))
7486     SEL = CM_ScalarEpilogueNotNeededUsePredicate;
7487 
7488   return SEL;
7489 }
7490 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
// transformations to be applied from the very beginning without modifying the
// input LLVM IR.
7495 static bool processLoopInVPlanNativePath(
7496     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7497     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7498     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7499     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7500     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7501 
7502   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7503   Function *F = L->getHeader()->getParent();
7504   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7505 
7506   ScalarEpilogueLowering SEL =
7507     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
7508                               PSE.getSE(), DT, LVL->getLAI());
7509 
7510   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7511                                 &Hints, IAI);
7512   // Use the planner for outer loop vectorization.
7513   // TODO: CM is not used at this point inside the planner. Turn CM into an
7514   // optional argument if we don't need it in the future.
7515   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7516 
7517   // Get user vectorization factor.
7518   const unsigned UserVF = Hints.getWidth();
7519 
7520   // Plan how to best vectorize, return the best VF and its cost.
7521   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7522 
7523   // If we are stress testing VPlan builds, do not attempt to generate vector
7524   // code. Masked vector code generation support will follow soon.
7525   // Also, do not attempt to vectorize if no vector code will be produced.
7526   if (VPlanBuildStressTest || EnableVPlanPredication ||
7527       VectorizationFactor::Disabled() == VF)
7528     return false;
7529 
7530   LVP.setBestPlan(VF.Width, 1);
7531 
7532   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7533                          &CM);
7534   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7535                     << L->getHeader()->getParent()->getName() << "\"\n");
7536   LVP.executePlan(LB, DT);
7537 
7538   // Mark the loop as already vectorized to avoid vectorizing again.
7539   Hints.setAlreadyVectorized();
7540 
7541   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7542   return true;
7543 }
7544 
7545 bool LoopVectorizePass::processLoop(Loop *L) {
7546   assert((EnableVPlanNativePath || L->empty()) &&
7547          "VPlan-native path is not enabled. Only process inner loops.");
7548 
7549 #ifndef NDEBUG
7550   const std::string DebugLocStr = getDebugLocString(L);
7551 #endif /* NDEBUG */
7552 
7553   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7554                     << L->getHeader()->getParent()->getName() << "\" from "
7555                     << DebugLocStr << "\n");
7556 
7557   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7558 
7559   LLVM_DEBUG(
7560       dbgs() << "LV: Loop hints:"
7561              << " force="
7562              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7563                      ? "disabled"
7564                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7565                             ? "enabled"
7566                             : "?"))
7567              << " width=" << Hints.getWidth()
7568              << " unroll=" << Hints.getInterleave() << "\n");
7569 
7570   // Function containing loop
7571   Function *F = L->getHeader()->getParent();
7572 
7573   // Looking at the diagnostic output is the only way to determine if a loop
7574   // was vectorized (other than looking at the IR or machine code), so it
7575   // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed report, less
  // verbosely, vectorized loops and unvectorized loops that may benefit from
  // vectorization, respectively.
7580 
7581   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7582     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7583     return false;
7584   }
7585 
7586   PredicatedScalarEvolution PSE(*SE, *L);
7587 
7588   // Check if it is legal to vectorize the loop.
7589   LoopVectorizationRequirements Requirements(*ORE);
7590   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7591                                 &Requirements, &Hints, DB, AC);
7592   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7593     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7594     Hints.emitRemarkWithHints();
7595     return false;
7596   }
7597 
7598   // Check the function attributes and profiles to find out if this function
7599   // should be optimized for size.
7600   ScalarEpilogueLowering SEL =
7601     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
7602                               PSE.getSE(), DT, LVL.getLAI());
7603 
7604   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7605   // here. They may require CFG and instruction level transformations before
7606   // even evaluating whether vectorization is profitable. Since we cannot modify
7607   // the incoming IR, we need to build VPlan upfront in the vectorization
7608   // pipeline.
7609   if (!L->empty())
7610     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7611                                         ORE, BFI, PSI, Hints);
7612 
7613   assert(L->empty() && "Inner loop expected.");
7614 
7615   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7616   // count by optimizing for size, to minimize overheads.
7617   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7618   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7619     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7620                       << "This loop is worth vectorizing only if no scalar "
7621                       << "iteration overheads are incurred.");
7622     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7623       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7624     else {
7625       LLVM_DEBUG(dbgs() << "\n");
7626       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7627     }
7628   }
7629 
7630   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check does not seem correct -- what if the loop is an integer
  // loop and the vector instructions selected are purely integer vector
  // instructions?
7634   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7635     reportVectorizationFailure(
7636         "Can't vectorize when the NoImplicitFloat attribute is used",
7637         "loop not vectorized due to NoImplicitFloat attribute",
7638         "NoImplicitFloat", ORE, L);
7639     Hints.emitRemarkWithHints();
7640     return false;
7641   }
7642 
7643   // Check if the target supports potentially unsafe FP vectorization.
7644   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7645   // for the target we're vectorizing for, to make sure none of the
7646   // additional fp-math flags can help.
7647   if (Hints.isPotentiallyUnsafe() &&
7648       TTI->isFPVectorizationPotentiallyUnsafe()) {
7649     reportVectorizationFailure(
7650         "Potentially unsafe FP op prevents vectorization",
7651         "loop not vectorized due to unsafe FP support.",
7652         "UnsafeFP", ORE, L);
7653     Hints.emitRemarkWithHints();
7654     return false;
7655   }
7656 
7657   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7658   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7659 
7660   // If an override option has been passed in for interleaved accesses, use it.
7661   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7662     UseInterleaved = EnableInterleavedMemAccesses;
7663 
7664   // Analyze interleaved memory accesses.
  if (UseInterleaved)
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7668 
7669   // Use the cost model.
7670   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7671                                 F, &Hints, IAI);
7672   CM.collectValuesToIgnore();
7673 
7674   // Use the planner for vectorization.
7675   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7676 
7677   // Get user vectorization factor.
7678   unsigned UserVF = Hints.getWidth();
7679 
7680   // Plan how to best vectorize, return the best VF and its cost.
7681   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7682 
7683   VectorizationFactor VF = VectorizationFactor::Disabled();
7684   unsigned IC = 1;
7685   unsigned UserIC = Hints.getInterleave();
7686 
7687   if (MaybeVF) {
7688     VF = *MaybeVF;
7689     // Select the interleave count.
7690     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7691   }
7692 
7693   // Identify the diagnostic messages that should be produced.
7694   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7695   bool VectorizeLoop = true, InterleaveLoop = true;
7696   if (Requirements.doesNotMeet(F, L, Hints)) {
7697     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7698                          "requirements.\n");
7699     Hints.emitRemarkWithHints();
7700     return false;
7701   }
7702 
7703   if (VF.Width == 1) {
7704     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7705     VecDiagMsg = std::make_pair(
7706         "VectorizationNotBeneficial",
7707         "the cost-model indicates that vectorization is not beneficial");
7708     VectorizeLoop = false;
7709   }
7710 
7711   if (!MaybeVF && UserIC > 1) {
7712     // Tell the user interleaving was avoided up-front, despite being explicitly
7713     // requested.
7714     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7715                          "interleaving should be avoided up front\n");
7716     IntDiagMsg = std::make_pair(
7717         "InterleavingAvoided",
7718         "Ignoring UserIC, because interleaving was avoided up front");
7719     InterleaveLoop = false;
7720   } else if (IC == 1 && UserIC <= 1) {
7721     // Tell the user interleaving is not beneficial.
7722     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7723     IntDiagMsg = std::make_pair(
7724         "InterleavingNotBeneficial",
7725         "the cost-model indicates that interleaving is not beneficial");
7726     InterleaveLoop = false;
7727     if (UserIC == 1) {
7728       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7729       IntDiagMsg.second +=
7730           " and is explicitly disabled or interleave count is set to 1";
7731     }
7732   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
7734     LLVM_DEBUG(
7735         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7736     IntDiagMsg = std::make_pair(
7737         "InterleavingBeneficialButDisabled",
7738         "the cost-model indicates that interleaving is beneficial "
7739         "but is explicitly disabled or interleave count is set to 1");
7740     InterleaveLoop = false;
7741   }
7742 
7743   // Override IC if user provided an interleave count.
7744   IC = UserIC > 0 ? UserIC : IC;
7745 
7746   // Emit diagnostic messages, if any.
7747   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7748   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
7750     ORE->emit([&]() {
7751       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7752                                       L->getStartLoc(), L->getHeader())
7753              << VecDiagMsg.second;
7754     });
7755     ORE->emit([&]() {
7756       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7757                                       L->getStartLoc(), L->getHeader())
7758              << IntDiagMsg.second;
7759     });
7760     return false;
7761   } else if (!VectorizeLoop && InterleaveLoop) {
7762     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7763     ORE->emit([&]() {
7764       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7765                                         L->getStartLoc(), L->getHeader())
7766              << VecDiagMsg.second;
7767     });
7768   } else if (VectorizeLoop && !InterleaveLoop) {
7769     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7770                       << ") in " << DebugLocStr << '\n');
7771     ORE->emit([&]() {
7772       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7773                                         L->getStartLoc(), L->getHeader())
7774              << IntDiagMsg.second;
7775     });
7776   } else if (VectorizeLoop && InterleaveLoop) {
7777     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7778                       << ") in " << DebugLocStr << '\n');
7779     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7780   }
7781 
7782   LVP.setBestPlan(VF.Width, IC);
7783 
7784   using namespace ore;
7785   bool DisableRuntimeUnroll = false;
7786   MDNode *OrigLoopID = L->getLoopID();
7787 
7788   if (!VectorizeLoop) {
7789     assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided not to vectorize the loop (e.g. the cost model found it
    // not beneficial), then only interleave it.
7792     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7793                                &CM);
7794     LVP.executePlan(Unroller, DT);
7795 
7796     ORE->emit([&]() {
7797       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7798                                 L->getHeader())
7799              << "interleaved loop (interleaved count: "
7800              << NV("InterleaveCount", IC) << ")";
7801     });
7802   } else {
    // If we decided to vectorize the loop, then do it.
7804     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7805                            &LVL, &CM);
7806     LVP.executePlan(LB, DT);
7807     ++LoopsVectorized;
7808 
7809     // Add metadata to disable runtime unrolling a scalar loop when there are
7810     // no runtime checks about strides and memory. A scalar loop that is
7811     // rarely used is not worth unrolling.
7812     if (!LB.areSafetyChecksAdded())
7813       DisableRuntimeUnroll = true;
7814 
7815     // Report the vectorization decision.
7816     ORE->emit([&]() {
7817       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7818                                 L->getHeader())
7819              << "vectorized loop (vectorization width: "
7820              << NV("VectorizationFactor", VF.Width)
7821              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7822     });
7823   }
7824 
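  // Propagate followup loop metadata, if present, from the original loop to
  // the scalar remainder loop (see LLVMLoopVectorizeFollowupAll and
  // LLVMLoopVectorizeFollowupEpilogue below). Otherwise mark the remainder
  // loop as already vectorized and, when no runtime checks were added,
  // disable its runtime unrolling.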
7825   Optional<MDNode *> RemainderLoopID =
7826       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7827                                       LLVMLoopVectorizeFollowupEpilogue});
7828   if (RemainderLoopID.hasValue()) {
7829     L->setLoopID(RemainderLoopID.getValue());
7830   } else {
7831     if (DisableRuntimeUnroll)
7832       AddRuntimeUnrollDisableMetaData(L);
7833 
7834     // Mark the loop as already vectorized to avoid vectorizing again.
7835     Hints.setAlreadyVectorized();
7836   }
7837 
7838   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7839   return true;
7840 }
7841 
7842 bool LoopVectorizePass::runImpl(
7843     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7844     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7845     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7846     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7847     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7848   SE = &SE_;
7849   LI = &LI_;
7850   TTI = &TTI_;
7851   DT = &DT_;
7852   BFI = &BFI_;
7853   TLI = TLI_;
7854   AA = &AA_;
7855   AC = &AC_;
7856   GetLAA = &GetLAA_;
7857   DB = &DB_;
7858   ORE = &ORE_;
7859   PSI = PSI_;
7860 
7861   // Don't attempt if
7862   // 1. the target claims to have no vector registers, and
7863   // 2. interleaving won't help ILP.
7864   //
7865   // The second condition is necessary because, even if the target has no
7866   // vector registers, loop vectorization may still enable scalar
7867   // interleaving.
7868   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7869       TTI->getMaxInterleaveFactor(1) < 2)
7870     return false;
7871 
7872   bool Changed = false;
7873 
7874   // The vectorizer requires loops to be in simplified form.
7875   // Since simplification may add new inner loops, it has to run before the
7876   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
7878   // vectorized.
7879   for (auto &L : *LI)
7880     Changed |=
7881         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7882 
7883   // Build up a worklist of inner-loops to vectorize. This is necessary as
7884   // the act of vectorizing or partially unrolling a loop creates new loops
7885   // and can invalidate iterators across the loops.
7886   SmallVector<Loop *, 8> Worklist;
7887 
7888   for (Loop *L : *LI)
7889     collectSupportedLoops(*L, LI, ORE, Worklist);
7890 
7891   LoopsAnalyzed += Worklist.size();
7892 
7893   // Now walk the identified inner loops.
7894   while (!Worklist.empty()) {
7895     Loop *L = Worklist.pop_back_val();
7896 
7897     // For the inner loops we actually process, form LCSSA to simplify the
7898     // transform.
7899     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7900 
7901     Changed |= processLoop(L);
7902   }
7903 
7904   // Process each loop nest in the function.
7905   return Changed;
7906 }
7907 
7908 PreservedAnalyses LoopVectorizePass::run(Function &F,
7909                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve the LoopInfo/DominatorTree analyses with
  // outer loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  return PA;
}
7952